no more jupyter notebook for wut-worker-train-cluster-fn.py

master
ml server 2020-01-28 17:46:02 -07:00
parent a74bd74a4d
commit 983157846c
1 changed file with 7 additions and 165 deletions


@@ -1,11 +1,7 @@
#!/usr/bin/env python3
#
-# wut-train-cluster-fn.py
+# wut-worker-train-cluster-fn.py
#
-# In[ ]:
# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.
#
# https://spacecruft.org/spacecruft/satnogs-wut
@@ -14,21 +10,12 @@
# GPLv3+
# Built using Jupyter, Tensorflow, Keras
-# In[ ]:
from __future__ import absolute_import, division, print_function, unicode_literals
from __future__ import print_function
import os
import numpy as np
import simplejson as json
import datetime
-# In[ ]:
import tensorflow as tf
import tensorflow.python.keras
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
@@ -44,11 +31,6 @@ from tensorflow.python.keras.preprocessing.image import img_to_array
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.preprocessing.image import load_img
from tensorflow.python.data.experimental.ops.distribute_options import AutoShardPolicy
-# In[ ]:
-get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns
@@ -57,16 +39,8 @@ import seaborn as sns
#import ipywidgets as widgets
#from IPython.display import display, Image
-# In[ ]:
print('tf {}'.format(tf.__version__))
-# In[ ]:
os.environ["TF_CONFIG"] = json.dumps({
"cluster": {
"worker": ["ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
@@ -75,10 +49,6 @@ os.environ["TF_CONFIG"] = json.dumps({
    "num_workers": 5
})
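Note: the cluster spec above is identical on every machine, but tf.distribute's multi-worker setup also reads a per-worker "task" entry from TF_CONFIG to learn which host it is. A minimal sketch of what one worker's complete TF_CONFIG could look like; the "task" values are an assumption, not part of this commit:

import json
import os

# Hypothetical complete TF_CONFIG for the second worker (ml2-int).
# "index" must be unique per host, 0 through 4 for this five-node cluster.
os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "worker": ["ml1-int:2222", "ml2-int:2222", "ml3-int:2222",
                   "ml4-int:2222", "ml5-int:2222"],
    },
    "task": {"type": "worker", "index": 1},
})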
-# In[ ]:
IMG_HEIGHT = 416
IMG_WIDTH = 804
batch_size = 32
@@ -90,36 +60,19 @@ epochs = 1
#batch_size = 128
#epochs = 6
-# In[ ]:
tf.keras.backend.clear_session()
-# In[ ]:
options = tf.data.Options()
#options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF
options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA
# XXX
#dataset = dataset.with_options(options)
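The with_options() line is still commented out here, so the DATA sharding policy never gets attached to a pipeline. A sketch of how it would be applied, using hypothetical stand-in tensors rather than the script's generators:

import numpy as np
import tensorflow as tf

# Hypothetical stand-in tensors; reuses the `options` object defined above.
images = np.zeros((8, 416, 804, 3), dtype=np.float32)
labels = np.zeros((8,), dtype=np.float32)
dataset = tf.data.Dataset.from_tensor_slices((images, labels)).batch(8)
dataset = dataset.with_options(options)  # each worker now reads a distinct shard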
-# In[ ]:
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.RING)
#mirrored_strategy = tf.distribute.MirroredStrategy(
# cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
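RING forces ring-style all-reduce over gRPC, a safe choice for a CPU or mixed cluster like this one; NCCL is the GPU-only alternative, and AUTO defers the choice to TensorFlow. A sketch of the same construction with runtime selection, as an alternative rather than what this commit uses:

strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.AUTO)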
-# In[ ]:
root_data_dir = '/srv/satnogs'
train_dir = os.path.join(root_data_dir, 'data/', 'train')
val_dir = os.path.join(root_data_dir, 'data/', 'val')
@@ -134,10 +87,6 @@ num_val_bad = len(os.listdir(val_bad_dir))
total_train = num_train_good + num_train_bad
total_val = num_val_good + num_val_bad
-# In[ ]:
print('total training good images:', num_train_good)
print('total training bad images:', num_train_bad)
print("--")
@@ -146,11 +95,6 @@ print('total validation good images:', num_val_good)
print('total validation bad images:', num_val_bad)
print("--")
print("Total validation images:", total_val)
-# In[ ]:
print("--")
print("Reduce training and validation set when testing")
total_train = 100
@@ -158,10 +102,6 @@ total_val = 100
print("Reduced training images:", total_train)
print("Reduced validation images:", total_val)
-# In[ ]:
train_image_generator = ImageDataGenerator(
    rescale=1./255
)
@@ -179,18 +119,8 @@ val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,
                                                       directory=val_dir,
                                                       target_size=(IMG_HEIGHT, IMG_WIDTH),
                                                       class_mode='binary')
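The matching training generator falls inside the elided hunk above and presumably mirrors the validation call. A sketch of that shape, with the exact arguments assumed rather than read from this diff:

train_data_gen = train_image_generator.flow_from_directory(
    batch_size=batch_size,
    directory=train_dir,                  # /srv/satnogs/data/train, good/ and bad/ subdirs
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='binary')                  # labels inferred from the subdirectory names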
-# In[ ]:
#train_dist_dataset = strategy.experimental_distribute_dataset()
#val_dist_dataset = strategy.experimental_distribute_dataset()
-# In[ ]:
sample_train_images, _ = next(train_data_gen)
sample_val_images, _ = next(val_data_gen)
# This function will plot images in the form of a grid with 1 row and 3 columns where images are placed in each column.
@@ -203,31 +133,18 @@ def plotImages(images_arr):
    plt.tight_layout()
    plt.show()
-plotImages(sample_train_images[0:3])
-plotImages(sample_val_images[0:3])
-# In[ ]:
-get_ipython().run_line_magic('load_ext', 'tensorboard')
-get_ipython().system('rm -rf ./clusterlogs/')
+#plotImages(sample_train_images[0:3])
+#plotImages(sample_val_images[0:3])
+#get_ipython().run_line_magic('load_ext', 'tensorboard')
+#get_ipython().system('rm -rf ./clusterlogs/')
#log_dir="clusterlogs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
-log_dir="clusterlogs"
+#log_dir="clusterlogs"
#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
-tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)
+#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)
#%tensorboard --logdir clusterlogs --port 6006
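With the notebook magics disabled, the script-mode equivalent is to clear the log directory in Python and run TensorBoard as a separate process. A minimal sketch, assuming the same clusterlogs path:

import shutil

shutil.rmtree('clusterlogs', ignore_errors=True)  # replaces the rm -rf line magic
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='clusterlogs')
# View the logs from a shell instead of the %tensorboard magic:
#   tensorboard --logdir clusterlogs --port 6006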
-# In[ ]:
strategy.num_replicas_in_sync
-# In[ ]:
## Compute global batch size using number of replicas.
#GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
BATCH_SIZE_PER_REPLICA = 8
@@ -246,16 +163,8 @@ print("total_val // batch_size", total_val // batch_size)
#dataset = dataset.batch(global_batch_size)
#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}
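The commented GLOBAL_BATCH_SIZE formula above hard-codes the worker count; the usual pattern derives it from the strategy so it tracks whatever TF_CONFIG lists. A sketch:

# Scale the per-replica batch size by the live replica count.
BATCH_SIZE_PER_REPLICA = 8
global_batch_size = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
# With the five single-replica workers above this yields 8 * 5 = 40.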
-# In[ ]:
#learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]
-# In[ ]:
def get_uncompiled_model():
    model = Sequential([
        Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
@@ -270,16 +179,7 @@ def get_uncompiled_model():
    ])
    return model
-# In[ ]:
#get_uncompiled_model()
-# In[ ]:
def get_compiled_model():
    model = get_uncompiled_model()
    model.compile(optimizer='adam',
@@ -287,25 +187,12 @@ def get_compiled_model():
                  metrics=['accuracy'])
    return model
-# In[ ]:
# Create a checkpoint directory to store the checkpoints.
#checkpoint_dir = './training_checkpoints'
#checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
-# In[ ]:
#callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath='tmp/keras-ckpt')]
#callbacks=[tensorboard_callback,callbacks]
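As written, the commented callbacks wiring would nest one list inside another; if re-enabled, checkpointing and TensorBoard belong in a single flat list. A sketch using the checkpoint path from the comment above:

callbacks = [
    tensorboard_callback,
    tf.keras.callbacks.ModelCheckpoint(filepath='tmp/keras-ckpt'),
]
# Then pass it as model.fit(..., callbacks=callbacks).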
-# In[ ]:
#def get_fit_model():
# model = get_compiled_model()
# model.fit(
@@ -318,10 +205,6 @@
# )
#return model
-# In[ ]:
with strategy.scope():
    model = get_compiled_model()
    history = model.fit(
@@ -333,16 +216,8 @@ with strategy.scope():
        verbose=2
    )
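The fit arguments sit in the elided hunk, so only the tail of the call is visible here; note that chaining .batch() onto model.fit() would fail, since fit returns a History object, not a dataset. A hedged sketch of what the full call plausibly looks like for this setup, with argument values assumed rather than read from the diff:

# Sketch of the elided fit arguments (assumed):
history = model.fit(
    train_data_gen,                              # generator defined earlier
    steps_per_epoch=total_train // batch_size,
    epochs=epochs,
    validation_data=val_data_gen,
    validation_steps=total_val // batch_size,
    verbose=2)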
-# In[ ]:
#model.summary()
-# In[ ]:
print("TRAINING info")
print(train_dir)
print(train_good_dir)
@@ -353,40 +228,13 @@ print(train_data_gen)
#print(history)
#model.to_json()
-# In[ ]:
#history = model.fit(X, y, batch_size=32, epochs=40, validation_split=0.1)
-# In[ ]:
model.save('data/models/FOO/wut-train-cluster2.tf')
-# In[ ]:
model.save('data/models/FOO/wut-train-cluster2.h5')
-# In[ ]:
model.save_weights('data/models/FOO/wut-weights-train-cluster2.tf')
-# In[ ]:
model.save_weights('data/models/FOO/wut-weights-train-cluster2.h5')
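The model is written in both SavedModel-style and HDF5 form, and the weights likewise. Reloading for later evaluation would look roughly like this; paths as written above, with FOO left as the script's own placeholder:

# Reload the full model from the HDF5 file...
model = tf.keras.models.load_model('data/models/FOO/wut-train-cluster2.h5')
# ...or rebuild the architecture and restore only the weights.
model = get_compiled_model()
model.load_weights('data/models/FOO/wut-weights-train-cluster2.h5')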
-# In[ ]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
@@ -405,9 +253,3 @@ plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()
-# In[ ]:
# The End