diff --git a/wut-worker-mas b/wut-worker-mas
new file mode 100755
index 0000000..c904033
--- /dev/null
+++ b/wut-worker-mas
@@ -0,0 +1,24 @@
+#!/bin/bash
+# wut-worker-mas
+#
+# Starts worker client.
+#
+# Usage:
+# wut-worker-mas
+# Example:
+# wut-worker-mas
+#
+# Note:
+# Each node needs a unique index number.
+#
+# NOTE!
+# This generates the node number based off the hostname.
+# The hosts are ml0 through ml5.
+
+HOSTNUM=`hostname | sed -e 's/ml//g'`
+
+export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
+
+echo $TF_CONFIG
+python3 wut-worker-mas.py
+
diff --git a/wut-worker-mas.py b/wut-worker-mas.py
index 45d0dda..334d9c9 100644
--- a/wut-worker-mas.py
+++ b/wut-worker-mas.py
@@ -43,8 +43,13 @@ BUFFER_SIZE = 10000
 NUM_WORKERS = 6
 GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
 
+# XXX
 POSITIVE_DIRECTORY = '/home/jebba/devel/spacecruft/satnogs-wut/data/pos'
 pos_dir = '/home/jebba/devel/spacecruft/satnogs-wut/data/posdir'
+
+strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
+    tf.distribute.experimental.CollectiveCommunication.RING)
+
 def get_bytes_and_label(filepath):
     raw_bytes = tf.io.read_file(filepath)
     label = tf.strings.regex_full_match(
@@ -88,7 +93,7 @@ dataset = dataset.map(process_image, num_parallel_calls=AUTOTUNE)
 dataset = dataset.batch(batch_size=32)
 dataset = dataset.prefetch(buffer_size=AUTOTUNE)
 
-print(tf.__version__)
+print("Tensorflow Version: ", tf.__version__)
 print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
 print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
 #with tf.device("GPU:0"):
@@ -105,8 +110,6 @@ tf.config.optimizer.set_jit(True)
 
 tf.summary.trace_on(profiler=True)
 
-strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
-    tf.distribute.experimental.CollectiveCommunication.RING)
 
 
 def compiled_model():
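
For reference, this is what the shell script's TF_CONFIG amounts to on each node. Note the script assumes `hostname` returns exactly ml0 through ml5; a fully qualified name like ml2.example.org would leave the domain suffix in HOSTNUM after the `sed` substitution. A minimal sketch of the same setup from the Python side, using the hostnames and port from the script above with index 2 (node ml2) as an example; TensorFlow itself reads TF_CONFIG from the environment, the json.dumps here is only to show the structure:

```python
import json
import os

import tensorflow as tf

# Illustrative only: the same TF_CONFIG that wut-worker-mas exports,
# shown for node ml2 (index 2). Hostnames and port come from the
# shell script above.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "worker": ["ml0-int:2222", "ml1-int:2222", "ml2-int:2222",
                   "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"],
    },
    "task": {"index": 2, "type": "worker"},
})

# TensorFlow reads TF_CONFIG from the environment when the strategy is
# constructed, so the variable must be set before this line runs.
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.RING)
print("Replicas in sync:", strategy.num_replicas_in_sync)
```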
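The Python diff moves the strategy construction above the dataset pipeline, which matches the usual MultiWorkerMirroredStrategy pattern: create the strategy first, then build and compile the model inside its scope so variables are mirrored across workers. A sketch of that ordering, assuming compiled_model() (defined further down in wut-worker-mas.py, internals not shown here) returns a compiled tf.keras model; the epochs value is a placeholder:

```python
# Sketch, not the file's actual training code.
with strategy.scope():
    model = compiled_model()

# Under MultiWorkerMirroredStrategy the dataset's batch size is the
# global batch, split across the six workers; GLOBAL_BATCH_SIZE =
# 64 * NUM_WORKERS above is sized for that, whereas the pipeline
# currently batches with batch_size=32 -- worth reconciling.
model.fit(dataset, epochs=10)
```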