no more jupyter notebook for wut-worker-train-cluster-fn.py

master
ml server 2020-01-28 17:46:02 -07:00
parent a74bd74a4d
commit 983157846c
1 changed file with 7 additions and 165 deletions


@@ -1,11 +1,7 @@
#!/usr/bin/env python3
#
-# wut-train-cluster-fn.py
+# wut-worker-train-cluster-fn.py
#
-# In[ ]:
# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.
#
# https://spacecruft.org/spacecruft/satnogs-wut
@@ -14,21 +10,12 @@
# GPLv3+
# Built using Jupyter, Tensorflow, Keras
-# In[ ]:
from __future__ import absolute_import, division, print_function, unicode_literals
from __future__ import print_function
import os
import numpy as np
import simplejson as json
import datetime
-# In[ ]:
import tensorflow as tf
import tensorflow.python.keras
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
@@ -44,11 +31,6 @@ from tensorflow.python.keras.preprocessing.image import img_to_array
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.preprocessing.image import load_img
from tensorflow.python.data.experimental.ops.distribute_options import AutoShardPolicy
-# In[ ]:
-get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns
@@ -57,16 +39,8 @@ import seaborn as sns
#import ipywidgets as widgets
#from IPython.display import display, Image
-# In[ ]:
print('tf {}'.format(tf.__version__))
-# In[ ]:
os.environ["TF_CONFIG"] = json.dumps({
"cluster": {
"worker": ["ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
@@ -75,10 +49,6 @@ os.environ["TF_CONFIG"] = json.dumps({
    "num_workers": 5
})
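Note: the cluster spec above is identical on every machine, but tf.distribute's multi-worker setup also reads a per-worker "task" entry from TF_CONFIG to learn which host it is. A minimal sketch of what one worker's complete TF_CONFIG could look like; the "task" values are an assumption, not part of this commit:

import json
import os

# Hypothetical complete TF_CONFIG for the second worker (ml2-int).
# "index" must be unique per host, 0 through 4 for this five-node cluster.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "worker": ["ml1-int:2222", "ml2-int:2222", "ml3-int:2222",
                   "ml4-int:2222", "ml5-int:2222"],
    },
    "task": {"type": "worker", "index": 1},
})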
-# In[ ]:
IMG_HEIGHT = 416
IMG_WIDTH = 804
batch_size = 32
@@ -90,36 +60,19 @@ epochs = 1
#batch_size = 128
#epochs = 6
-# In[ ]:
tf.keras.backend.clear_session()
-# In[ ]:
options = tf.data.Options()
#options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF
options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA
# XXX
#dataset = dataset.with_options(options)
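The with_options() line is still commented out here, so the DATA sharding policy never gets attached to a pipeline. A sketch of how it would be applied, using hypothetical stand-in tensors rather than the script's generators:

import numpy as np
import tensorflow as tf

# Hypothetical stand-in tensors; reuses the `options` object defined above.
images = np.zeros((8, 416, 804, 3), dtype=np.float32)
labels = np.zeros((8,), dtype=np.float32)
dataset = tf.data.Dataset.from_tensor_slices((images, labels)).batch(8)
dataset = dataset.with_options(options)  # each worker now reads a distinct shard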
-# In[ ]:
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.RING)
#mirrored_strategy = tf.distribute.MirroredStrategy(
# cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
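RING forces ring-style all-reduce over gRPC, a safe choice for a CPU or mixed cluster like this one; NCCL is the GPU-only alternative, and AUTO defers the choice to TensorFlow. A sketch of the same construction with runtime selection, as an alternative rather than what this commit uses:

strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.AUTO)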
-# In[ ]:
root_data_dir = '/srv/satnogs'
train_dir = os.path.join(root_data_dir, 'data/', 'train')
val_dir = os.path.join(root_data_dir, 'data/', 'val')
@@ -134,10 +87,6 @@ num_val_bad = len(os.listdir(val_bad_dir))
total_train = num_train_good + num_train_bad
total_val = num_val_good + num_val_bad
-# In[ ]:
print('total training good images:', num_train_good)
print('total training bad images:', num_train_bad)
print("--")
@@ -146,11 +95,6 @@ print('total validation good images:', num_val_good)
print('total validation bad images:', num_val_bad)
print("--")
print("Total validation images:", total_val)
-# In[ ]:
print("--")
print("Reduce training and validation set when testing")
total_train = 100
@@ -158,10 +102,6 @@ total_val = 100
print("Reduced training images:", total_train)
print("Reduced validation images:", total_val)
-# In[ ]:
train_image_generator = ImageDataGenerator(
    rescale=1./255
)
@@ -179,18 +119,8 @@ val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,
                                                       directory=val_dir,
                                                       target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                       class_mode='binary')
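The matching training generator falls inside the elided hunk above and presumably mirrors the validation call. A sketch of that shape, with the exact arguments assumed rather than read from this diff:

train_data_gen = train_image_generator.flow_from_directory(
    batch_size=batch_size,
    directory=train_dir,                  # /srv/satnogs/data/train, good/ and bad/ subdirs
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='binary')                  # labels inferred from the subdirectory names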
-# In[ ]:
#train_dist_dataset = strategy.experimental_distribute_dataset()
#val_dist_dataset = strategy.experimental_distribute_dataset()
-# In[ ]:
sample_train_images, _ = next(train_data_gen)
sample_val_images, _ = next(val_data_gen)
# This function will plot images in the form of a grid with 1 row and 3 columns where images are placed in each column.
@@ -203,31 +133,18 @@ def plotImages(images_arr):
    plt.tight_layout()
    plt.show()
-plotImages(sample_train_images[0:3])
-plotImages(sample_val_images[0:3])
-# In[ ]:
-get_ipython().run_line_magic('load_ext', 'tensorboard')
-get_ipython().system('rm -rf ./clusterlogs/')
+#plotImages(sample_train_images[0:3])
+#plotImages(sample_val_images[0:3])
+#get_ipython().run_line_magic('load_ext', 'tensorboard')
+#get_ipython().system('rm -rf ./clusterlogs/')
#log_dir="clusterlogs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
-log_dir="clusterlogs"
+#log_dir="clusterlogs"
#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
-tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)
+#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)
#%tensorboard --logdir clusterlogs --port 6006
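With the notebook magics disabled, the script-mode equivalent is to clear the log directory in Python and run TensorBoard as a separate process. A minimal sketch, assuming the same clusterlogs path:

import shutil

shutil.rmtree('clusterlogs', ignore_errors=True)  # replaces the rm -rf line magic
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='clusterlogs')
# View the logs from a shell instead of the %tensorboard magic:
#   tensorboard --logdir clusterlogs --port 6006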
-# In[ ]:
strategy.num_replicas_in_sync
-# In[ ]:
## Compute global batch size using number of replicas.
#GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
BATCH_SIZE_PER_REPLICA = 8
@@ -246,16 +163,8 @@ print("total_val // batch_size", total_val // batch_size)
#dataset = dataset.batch(global_batch_size)
#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}
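The commented GLOBAL_BATCH_SIZE formula above hard-codes the worker count; the usual pattern derives it from the strategy so it tracks whatever TF_CONFIG lists. A sketch:

# Scale the per-replica batch size by the live replica count.
BATCH_SIZE_PER_REPLICA = 8
global_batch_size = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
# With the five single-replica workers above this yields 8 * 5 = 40.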
-# In[ ]:
#learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]
-# In[ ]:
def get_uncompiled_model():
    model = Sequential([
        Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
@@ -270,16 +179,7 @@ def get_uncompiled_model():
    ])
    return model
-# In[ ]:
#get_uncompiled_model()
-# In[ ]:
def get_compiled_model():
    model = get_uncompiled_model()
    model.compile(optimizer='adam',
@@ -287,25 +187,12 @@ def get_compiled_model():
                  metrics=['accuracy'])
    return model
-# In[ ]:
# Create a checkpoint directory to store the checkpoints.
#checkpoint_dir = './training_checkpoints'
#checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
-# In[ ]:
#callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath='tmp/keras-ckpt')]
#callbacks=[tensorboard_callback,callbacks]
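As written, the commented callbacks wiring would nest one list inside another; if re-enabled, checkpointing and TensorBoard belong in a single flat list. A sketch using the checkpoint path from the comment above:

callbacks = [
    tensorboard_callback,
    tf.keras.callbacks.ModelCheckpoint(filepath='tmp/keras-ckpt'),
]
# Then pass it as model.fit(..., callbacks=callbacks).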
-# In[ ]:
#def get_fit_model():
# model = get_compiled_model()
# model.fit(
@@ -318,10 +205,6 @@
# )
#return model
-# In[ ]:
with strategy.scope():
    model = get_compiled_model()
    history = model.fit(
@@ -333,16 +216,8 @@ with strategy.scope():
        verbose=2
    )
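The fit arguments sit in the elided hunk, so only the tail of the call is visible here; note that chaining .batch() onto model.fit() would fail, since fit returns a History object, not a dataset. A hedged sketch of what the full call plausibly looks like for this setup, with argument values assumed rather than read from the diff:

# Sketch of the elided fit arguments (assumed):
history = model.fit(
    train_data_gen,                              # generator defined earlier
    steps_per_epoch=total_train // batch_size,
    epochs=epochs,
    validation_data=val_data_gen,
    validation_steps=total_val // batch_size,
    verbose=2)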
-# In[ ]:
#model.summary()
-# In[ ]:
print("TRAINING info")
print(train_dir)
print(train_good_dir)
@@ -353,40 +228,13 @@ print(train_data_gen)
#print(history)
#model.to_json()
-# In[ ]:
#history = model.fit(X, y, batch_size=32, epochs=40, validation_split=0.1)
-# In[ ]:
model.save('data/models/FOO/wut-train-cluster2.tf')
-# In[ ]:
model.save('data/models/FOO/wut-train-cluster2.h5')
-# In[ ]:
model.save_weights('data/models/FOO/wut-weights-train-cluster2.tf')
-# In[ ]:
model.save_weights('data/models/FOO/wut-weights-train-cluster2.h5')
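The model is written in both SavedModel-style and HDF5 form, and the weights likewise. Reloading for later evaluation would look roughly like this; paths as written above, with FOO left as the script's own placeholder:

# Reload the full model from the HDF5 file...
model = tf.keras.models.load_model('data/models/FOO/wut-train-cluster2.h5')
# ...or rebuild the architecture and restore only the weights.
model = get_compiled_model()
model.load_weights('data/models/FOO/wut-weights-train-cluster2.h5')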
-# In[ ]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
@@ -405,9 +253,3 @@ plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()
-# In[ ]:
# The End