distributed working except fit()

master 0.63
ml server 2020-01-18 15:43:33 -07:00
parent 67cab8731a
commit c66b6d51e5
2 changed files with 414 additions and 1 deletions

View File

@ -0,0 +1,413 @@
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.
#
# https://spacecruft.org/spacecruft/satnogs-wut
#
# Builds a wut.tf file from the data/train and data/val directories.
# GPLv3+
# Built using Jupyter, Tensorflow, Keras
# In[ ]:
from __future__ import absolute_import, division, print_function, unicode_literals
from __future__ import print_function
import os
import numpy as np
import simplejson as json
import datetime
# In[ ]:
import tensorflow as tf
import tensorflow.python.keras
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
from tensorflow.python.keras.layers import Input, concatenate
from tensorflow.python.keras.models import load_model
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing import image
from tensorflow.python.keras.preprocessing.image import img_to_array
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.preprocessing.image import load_img
# In[ ]:
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display, Image
# In[ ]:
# Describe the cluster to TensorFlow through the TF_CONFIG environment
# variable; this process is registered as worker 0.
# NOTE(review): six worker addresses are listed while "num_workers" is 5 --
# confirm which one is intended.
_cluster_workers = [
    "10.100.100.130:2222",
    "ml1:2222",
    "ml2:2222",
    "ml3:2222",
    "ml4:2222",
    "ml5:2222",
]
_tf_config = {
    "cluster": {"worker": _cluster_workers},
    "task": {"type": "worker", "index": 0},
    "num_workers": 5,
}
os.environ["TF_CONFIG"] = json.dumps(_tf_config)
# In[ ]:
# Size (height, width) that every image is resized to before training.
# The full-size alternative, IMG_HEIGHT = 832 / IMG_WIDTH = 1606, made the
# machine barf; it probably needs more RAM.
IMG_HEIGHT, IMG_WIDTH = 416, 804
# Training schedule. batch_size = 128 with epochs = 6 gave good results.
batch_size, epochs = 32, 4
# In[ ]:
# Reset Keras' global state before any model is built.
tf.keras.backend.clear_session()
# In[ ]:
#from tensorflow.python.framework.ops import disable_eager_execution
#disable_eager_execution()
# In[ ]:
# Multi-worker mirrored strategy using ring-based collective communication.
_experimental = tf.distribute.experimental
strategy = _experimental.MultiWorkerMirroredStrategy(
    _experimental.CollectiveCommunication.RING)
#
# MultiWorkerMirroredStrategy needs TF_CONFIG
#multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
# Central Storage Strategy
#central_storage_strategy = tf.distribute.experimental.CentralStorageStrategy()
# ParameterServerStrategy needs TF_CONFIG
#ps_strategy = tf.distribute.experimental.ParameterServerStrategy()
# OneDeviceStrategy No cluster
#strategy = tf.distribute.OneDeviceStrategy(device="/CPU:0")
# Mirrored Strategy
#mirrored_strategy = tf.distribute.MirroredStrategy()
# In[ ]:
# Dataset layout: data/{train,val}/{good,bad}.
train_dir, val_dir = (
    os.path.join('data/', _split) for _split in ('train', 'val')
)
train_good_dir, train_bad_dir = (
    os.path.join(train_dir, _label) for _label in ('good', 'bad')
)
val_good_dir, val_bad_dir = (
    os.path.join(val_dir, _label) for _label in ('good', 'bad')
)
# Every directory entry is counted as one image.
num_train_good, num_train_bad, num_val_good, num_val_bad = (
    len(os.listdir(_folder))
    for _folder in (train_good_dir, train_bad_dir, val_good_dir, val_bad_dir)
)
total_train = num_train_good + num_train_bad
total_val = num_val_good + num_val_bad
# In[ ]:
# Report the per-class and overall image counts for both data sets.
for _line in (
    ('total training good images:', num_train_good),
    ('total training bad images:', num_train_bad),
    ("--",),
    ("Total training images:", total_train),
    ('total validation good images:', num_val_good),
    ('total validation bad images:', num_val_bad),
    ("--",),
    ("Total validation images:", total_val),
):
    print(*_line)
# In[ ]:
# Uncomment the two overrides below to shrink both sets for a quick test run;
# while they stay commented out the "reduced" totals equal the full totals.
print("--")
print("Reduce training and validation set when testing")
#total_train = 16
#total_val = 16
for _line in (
    ("Reduced training images:", total_train),
    ("Reduced validation images:", total_val),
):
    print(*_line)
# In[ ]:
# Both generators only rescale pixel values by 1/255.
train_image_generator = ImageDataGenerator(rescale=1./255)
val_image_generator = ImageDataGenerator(rescale=1./255)
# Options shared by the training and validation directory iterators.
_flow_options = dict(
    batch_size=batch_size,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='binary',
)
train_data_gen = train_image_generator.flow_from_directory(
    directory=train_dir,
    shuffle=True,
    **_flow_options
)
val_data_gen = val_image_generator.flow_from_directory(
    directory=val_dir,
    **_flow_options
)
# In[ ]:
# Pull one batch from each iterator for inspection; the labels are discarded.
sample_train_images, _ = next(train_data_gen)
sample_val_images, _ = next(val_data_gen)
def plotImages(images_arr):
    """Show images side by side in a grid of one row and three columns.

    images_arr: iterable of images to pass to ``imshow``. Only the first
    three are drawn, because pairing stops once the three axes are used up.
    """
    _, grid = plt.subplots(1, 3, figsize=(20, 20))
    for picture, panel in zip(images_arr, grid.flatten()):
        panel.imshow(picture)
        panel.axis('off')
    plt.tight_layout()
    plt.show()
#plotImages(sample_train_images[0:3])
#plotImages(sample_val_images[0:3])
# In[ ]:
# TensorBoard setup. The two get_ipython() calls are IPython/Jupyter specific:
# the first loads the TensorBoard notebook extension, the second shells out to
# delete the log directory left behind by any previous run.
get_ipython().run_line_magic('load_ext', 'tensorboard')
get_ipython().system('rm -rf ./clusterlogs/')
# Alternative: a separate, timestamped log directory per run.
#log_dir="clusterlogs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir="clusterlogs"
# Alternative: additionally enable histograms with histogram_freq=1.
#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
# NOTE(review): this callback is not passed to any fit() call visible in this
# file -- confirm whether it should be added to callbacks=[...].
tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)
#%tensorboard --logdir clusterlogs --port 6006
# In[ ]:
#strategy.num_replicas_in_sync
## Compute global batch size using number of replicas.
#BATCH_SIZE_PER_REPLICA = 5
#print(BATCH_SIZE_PER_REPLICA)
#global_batch_size = (BATCH_SIZE_PER_REPLICA *
# strategy.num_replicas_in_sync)
#print(global_batch_size)
#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)
#dataset = dataset.batch(global_batch_size)
#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}
# In[ ]:
#learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]
# In[ ]:
def get_uncompiled_model():
    """Build the binary image classifier without compiling it.

    The network takes (IMG_HEIGHT, IMG_WIDTH, 3) inputs and stacks three
    Conv2D + MaxPooling2D stages (16, 32 and 64 filters, kernel size 3,
    'same' padding, ReLU), followed by Flatten, a 512-unit ReLU Dense layer
    and a single sigmoid output unit.

    Returns the Sequential model.
    """
    conv_options = dict(padding='same', activation='relu')
    stack = [
        Conv2D(16, 3, input_shape=(IMG_HEIGHT, IMG_WIDTH, 3), **conv_options),
        MaxPooling2D(),
        Conv2D(32, 3, **conv_options),
        MaxPooling2D(),
        Conv2D(64, 3, **conv_options),
        MaxPooling2D(),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(stack)
# In[ ]:
#get_uncompiled_model()
# In[ ]:
def get_compiled_model():
    """Return a fresh model from get_uncompiled_model(), compiled.

    Compilation uses the 'adam' optimizer, binary cross-entropy loss and the
    accuracy metric.
    """
    compile_options = {
        'optimizer': 'adam',
        'loss': 'binary_crossentropy',
        'metrics': ['accuracy'],
    }
    compiled = get_uncompiled_model()
    compiled.compile(**compile_options)
    return compiled
# In[ ]:
#model = get_compiled_model()
# In[ ]:
#callbacks = [tf.keras.callbacks.ModelCheckpoint(filepath='tmp/keras-ckpt')]
#callbacks=[tensorboard_callback,callbacks]
# In[ ]:
def get_fit_model():
    """Compile a fresh model, train it on the module-level generators, return it.

    Training and validation step counts are the image totals divided (floor)
    by batch_size. The object returned by fit() is discarded; only the
    trained model is returned.
    """
    trained = get_compiled_model()
    train_steps = total_train // batch_size
    val_steps = total_val // batch_size
    trained.fit(
        train_data_gen,
        steps_per_epoch=train_steps,
        epochs=epochs,
        validation_data=val_data_gen,
        validation_steps=val_steps,
        verbose=2,
    )
    return trained
# In[ ]:
# Smoke test: build each model variant inside its own strategy scope; the
# resulting models are thrown away.
for _builder in (get_uncompiled_model, get_compiled_model):
    with strategy.scope():
        _builder()
# In[ ]:
#with strategy.scope():
# get_fit_model()
# In[ ]:
#multi_worker_model = get_compiled_model()
#multi_worker_model.fit(
# x=train_data_gen,
# epochs=epochs,
# steps_per_epoch=total_train // batch_size
# )
# In[ ]:
# Build and train the model under the multi-worker strategy.
# The object returned by fit() is kept as `history` because the plotting cell
# further down reads history.history. Validation data is supplied, as in
# get_fit_model(), so that the 'val_accuracy' and 'val_loss' series that cell
# plots are actually recorded.
with strategy.scope():
    multi_worker_model = get_compiled_model()
    history = multi_worker_model.fit(
        x=train_data_gen,
        epochs=epochs,
        steps_per_epoch=total_train // batch_size,
        validation_data=val_data_gen,
        validation_steps=total_val // batch_size,
    )
# In[ ]:
#model.summary()
# In[ ]:
# Dump the training paths and generator objects for debugging.
print("TRAINING info")
for _item in (
    train_dir,
    train_good_dir,
    train_bad_dir,
    train_image_generator,
    train_data_gen,
):
    print(_item)
#print(sample_train_images)
#print(history)
#model.to_json()
# In[ ]:
#history = model.fit(X, y, batch_size=32, epochs=40, validation_split=0.1)
# In[ ]:
# Persist the trained model and its weights, each under a .tf and a .h5 name.
# The trained model is bound to `multi_worker_model`; no module-level `model`
# exists in this script, so saving through that name raised NameError.
multi_worker_model.save('data/models/FOO/wut-train-cluster.tf')
# In[ ]:
multi_worker_model.save('data/models/FOO/wut-train-cluster.h5')
# In[ ]:
multi_worker_model.save_weights('data/models/FOO/wut-weights-train-cluster.tf')
# In[ ]:
multi_worker_model.save_weights('data/models/FOO/wut-weights-train-cluster.h5')
# In[ ]:
# Plot per-epoch training/validation accuracy (left) and loss (right).
_metrics = history.history
acc = _metrics['accuracy']
val_acc = _metrics['val_accuracy']
loss = _metrics['loss']
val_loss = _metrics['val_loss']
epochs_range = range(epochs)
plt.figure(figsize=(8, 8))
# One entry per subplot: position, training series, validation series,
# metric name used in the labels, legend location.
_panels = (
    (1, acc, val_acc, 'Accuracy', 'lower right'),
    (2, loss, val_loss, 'Loss', 'upper right'),
)
for _slot, _train_series, _val_series, _metric, _legend_loc in _panels:
    plt.subplot(1, 2, _slot)
    plt.plot(epochs_range, _train_series, label='Training ' + _metric)
    plt.plot(epochs_range, _val_series, label='Validation ' + _metric)
    plt.legend(loc=_legend_loc)
    plt.title('Training and Validation ' + _metric)
plt.show()
# In[ ]:
# The End

View File

@ -57,7 +57,7 @@ def get_compiled_model():
def get_fit_model():
model = get_compiled_model()
model.fit(
)
model )
return model
#def get_fit_model():