diff --git a/README.md b/README.md index 44db317..dd23b87 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,10 @@ firefox https://github.com/bazelbuild/bazel/releases # Install Tensorflow git clone tensorflow... cd tensorflow -git checkout remotes/origin/r2.1 +git checkout v2.1.0 +bazel clean +# Get flags to pass: +grep flags -m1 /proc/cpuinfo | cut -d ":" -f 2 | tr '[:upper:]' '[:lower:]' | { read FLAGS; OPT="-march=native"; for flag in $FLAGS; do case "$flag" in "sse4_1" | "sse4_2" | "ssse3" | "fma" | "cx16" | "popcnt" | "avx" | "avx2") OPT+=" -m$flag";; esac; done; MODOPT=${OPT//_/\.}; echo "$MODOPT"; } ./configure # Run Bazel to build pip package. Takes nearly 2 hours to build. bazel build --config=opt //tensorflow/tools/pip_package:build_pip_package diff --git a/wut-worker-mas b/wut-worker-mas index b0852e1..9def451 100755 --- a/wut-worker-mas +++ b/wut-worker-mas @@ -18,8 +18,8 @@ HOSTNUM=`hostname | sed -e 's/ml//g'` #export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}' -#export TF_CONFIG='{"cluster": {"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}}' -export TF_CONFIG='{"cluster": {"chief": [ "ml0-int:2222" ], "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}' +export TF_CONFIG='{"cluster": {"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}}' +#export TF_CONFIG='{"cluster": {"chief": [ "ml0-int:2222" ], "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}' echo $TF_CONFIG python3 wut-worker-mas.py diff --git a/wut-worker-mas.py b/wut-worker-mas.py index 2026d12..d0b90a2 100644 --- a/wut-worker-mas.py +++ b/wut-worker-mas.py @@ -26,11 +26,9 @@ from tensorflow.python.keras.preprocessing import image from tensorflow.python.keras.preprocessing.image import img_to_array from tensorflow.python.keras.preprocessing.image import ImageDataGenerator from tensorflow.python.keras.preprocessing.image import load_img -#import tensorflow.python.distribute.cluster_resolver -#from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver -#from tensorflow.python.distribute.cluster_resolver.TFConfigClusterResolver tf.keras.backend.clear_session() +tf.config.optimizer.set_jit(True) options = tf.data.Options() os.environ["TF_CONFIG"] = json.dumps({ "cluster": { @@ -39,27 +37,16 @@ os.environ["TF_CONFIG"] = json.dumps({ }, "task": {"type": "chief", "index": 0 }, }) -#os.environ["TF_CONFIG"] = json.dumps({ -# "cluster": { -# "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ] -# }#, -# #"task": {"type": "worker", "index": 0 }, -#}) print("Tensorflow Version: ", tf.__version__) print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU'))) print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU'))) -#with tf.device("GPU:0"): -# tf.ones(()) # Make sure we can run on GPU - -# This ensures that XLA and ptxas work well together, and helps with scaling. print("XLA_FLAGS='{}'".format(os.getenv("XLA_FLAGS"))) IMG_HEIGHT = 416 IMG_WIDTH= 804 batch_size = 32 epochs = 4 - BUFFER_SIZE = 10000 NUM_WORKERS = 6 GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS @@ -68,12 +55,9 @@ GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS POSITIVE_DIRECTORY = '/home/jebba/devel/spacecruft/satnogs-wut/data/pos' pos_dir = '/home/jebba/devel/spacecruft/satnogs-wut/data/posdir' -from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver - strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy( tf.distribute.experimental.CollectiveCommunication.RING) - def get_bytes_and_label(filepath): raw_bytes = tf.io.read_file(filepath) label = tf.strings.regex_full_match( @@ -117,13 +101,9 @@ dataset = dataset.map(process_image, num_parallel_calls=AUTOTUNE) dataset = dataset.batch(batch_size=32) dataset = dataset.prefetch(buffer_size=AUTOTUNE) - os.makedirs(profile_dir, exist_ok=True) # tf.data.Dataset.from_generator - -tf.config.optimizer.set_jit(True) - #tf.summary.trace_on(profiler=True)