no more jupyter notebook for wut-worker-train-cluster-fn.py
parent
a74bd74a4d
commit
983157846c
|
@ -1,11 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# wut-train-cluster-fn.py
|
||||
# wut-worker-train-cluster-fn.py
|
||||
#
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.
|
||||
#
|
||||
# https://spacecruft.org/spacecruft/satnogs-wut
|
||||
|
@ -14,21 +10,12 @@
|
|||
# GPLv3+
|
||||
# Built using Jupyter, Tensorflow, Keras
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import numpy as np
|
||||
import simplejson as json
|
||||
import datetime
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow.python.keras
|
||||
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
|
||||
|
@ -44,11 +31,6 @@ from tensorflow.python.keras.preprocessing.image import img_to_array
|
|||
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
|
||||
from tensorflow.python.keras.preprocessing.image import load_img
|
||||
from tensorflow.python.data.experimental.ops.distribute_options import AutoShardPolicy
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
get_ipython().run_line_magic('matplotlib', 'inline')
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
@ -57,16 +39,8 @@ import seaborn as sns
|
|||
#import ipywidgets as widgets
|
||||
#from IPython.display import display, Image
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
print('tf {}'.format(tf.__version__))
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
os.environ["TF_CONFIG"] = json.dumps({
|
||||
"cluster": {
|
||||
"worker": ["ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
|
||||
|
@ -75,10 +49,6 @@ os.environ["TF_CONFIG"] = json.dumps({
|
|||
"num_workers": 5
|
||||
})
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
IMG_HEIGHT = 416
|
||||
IMG_WIDTH= 804
|
||||
batch_size = 32
|
||||
|
@ -90,36 +60,19 @@ epochs = 1
|
|||
#batch_size = 128
|
||||
#epochs = 6
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
tf.keras.backend.clear_session()
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
options = tf.data.Options()
|
||||
#options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF
|
||||
options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA
|
||||
# XXX
|
||||
#dataset = dataset.with_options(options)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
|
||||
tf.distribute.experimental.CollectiveCommunication.RING)
|
||||
|
||||
#mirrored_strategy = tf.distribute.MirroredStrategy(
|
||||
# cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
root_data_dir = ('/srv/satnogs')
|
||||
train_dir = os.path.join(root_data_dir, 'data/', 'train')
|
||||
val_dir = os.path.join(root_data_dir,'data/', 'val')
|
||||
|
@ -134,10 +87,6 @@ num_val_bad = len(os.listdir(val_bad_dir))
|
|||
total_train = num_train_good + num_train_bad
|
||||
total_val = num_val_good + num_val_bad
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
print('total training good images:', num_train_good)
|
||||
print('total training bad images:', num_train_bad)
|
||||
print("--")
|
||||
|
@ -146,11 +95,6 @@ print('total validation good images:', num_val_good)
|
|||
print('total validation bad images:', num_val_bad)
|
||||
print("--")
|
||||
print("Total validation images:", total_val)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
print("--")
|
||||
print("Reduce training and validation set when testing")
|
||||
total_train = 100
|
||||
|
@ -158,10 +102,6 @@ total_val = 100
|
|||
print("Reduced training images:", total_train)
|
||||
print("Reduced validation images:", total_val)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
train_image_generator = ImageDataGenerator(
|
||||
rescale=1./255
|
||||
)
|
||||
|
@ -179,18 +119,8 @@ val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,
|
|||
directory=val_dir,
|
||||
target_size=(IMG_HEIGHT, IMG_WIDTH),
|
||||
class_mode='binary')
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
#train_dist_dataset = strategy.experimental_distribute_dataset()
|
||||
#val_dist_dataset = strategy.experimental_distribute_dataset()
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
sample_train_images, _ = next(train_data_gen)
|
||||
sample_val_images, _ = next(val_data_gen)
|
||||
# This function will plot images in the form of a grid with 1 row and 3 columns where images are placed in each column.
|
||||
|
@ -203,31 +133,18 @@ def plotImages(images_arr):
|
|||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
plotImages(sample_train_images[0:3])
|
||||
plotImages(sample_val_images[0:3])
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
get_ipython().run_line_magic('load_ext', 'tensorboard')
|
||||
get_ipython().system('rm -rf ./clusterlogs/')
|
||||
#plotImages(sample_train_images[0:3])
|
||||
#plotImages(sample_val_images[0:3])
|
||||
#get_ipython().run_line_magic('load_ext', 'tensorboard')
|
||||
#get_ipython().system('rm -rf ./clusterlogs/')
|
||||
#log_dir="clusterlogs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
log_dir="clusterlogs"
|
||||
#log_dir="clusterlogs"
|
||||
#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
|
||||
tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)
|
||||
#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)
|
||||
#%tensorboard --logdir clusterlogs --port 6006
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
strategy.num_replicas_in_sync
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
## Compute global batch size using number of replicas.
|
||||
#GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
|
||||
BATCH_SIZE_PER_REPLICA = 8
|
||||
|
@ -246,16 +163,8 @@ print("total_val // batch_size", total_val // batch_size)
|
|||
#dataset = dataset.batch(global_batch_size)
|
||||
#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
#learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
def get_uncompiled_model():
|
||||
model = Sequential([
|
||||
Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),
|
||||
|
@ -270,16 +179,7 @@ def get_uncompiled_model():
|
|||
])
|
||||
return model
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
#get_uncompiled_model()
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
def get_compiled_model():
|
||||
model = get_uncompiled_model()
|
||||
model.compile(optimizer='adam',
|
||||
|
@ -287,25 +187,12 @@ def get_compiled_model():
|
|||
metrics=['accuracy'])
|
||||
return model
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# Create a checkpoint directory to store the checkpoints.
|
||||
#checkpoint_dir = './training_checkpoints'
|
||||
#checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
#callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath='tmp/keras-ckpt')]
|
||||
#callbacks=[tensorboard_callback,callbacks]
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
#def get_fit_model():
|
||||
# model = get_compiled_model()
|
||||
# model.fit(
|
||||
|
@ -318,10 +205,6 @@ def get_compiled_model():
|
|||
# )
|
||||
#return model
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
with strategy.scope():
|
||||
model = get_compiled_model()
|
||||
history = model.fit(
|
||||
|
@ -333,16 +216,8 @@ with strategy.scope():
|
|||
verbose=2
|
||||
).batch(global_batch_size)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
#model.summary()
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
print("TRAINING info")
|
||||
print(train_dir)
|
||||
print(train_good_dir)
|
||||
|
@ -353,40 +228,13 @@ print(train_data_gen)
|
|||
#print(history)
|
||||
#model.to_json()
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
#history = model.fit(X, y, batch_size=32, epochs=40, validation_split=0.1)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
model.save('data/models/FOO/wut-train-cluster2.tf')
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
model.save('data/models/FOO/wut-train-cluster2.h5')
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
model.save_weights('data/models/FOO/wut-weights-train-cluster2.tf')
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
model.save_weights('data/models/FOO/wut-weights-train-cluster2.h5')
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
acc = history.history['accuracy']
|
||||
val_acc = history.history['val_accuracy']
|
||||
loss = history.history['loss']
|
||||
|
@ -405,9 +253,3 @@ plt.legend(loc='upper right')
|
|||
plt.title('Training and Validation Loss')
|
||||
plt.show()
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# The End
|
||||
|
||||
|
|
Loading…
Reference in New Issue