temporary dev notebook to convert workflow to functions for distribution

master
ml server 2020-01-18 11:46:58 -07:00
parent cd80fa6d31
commit 99672dd02f
1 changed file with 446 additions and 0 deletions


@ -0,0 +1,446 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.\n",
"#\n",
"# https://spacecruft.org/spacecruft/satnogs-wut\n",
"#\n",
"# Based on data/train and data/val directories builds a wut.tf file.\n",
"# GPLv3+\n",
"# Built using Jupyter, Tensorflow, Keras"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import absolute_import, division, print_function, unicode_literals\n",
"from __future__ import print_function\n",
"import os\n",
"import numpy as np\n",
"import simplejson as json\n",
"import datetime"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import tensorflow.python.keras\n",
"from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D\n",
"from tensorflow.python.keras import optimizers\n",
"from tensorflow.python.keras import Sequential\n",
"from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense\n",
"from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D\n",
"from tensorflow.python.keras.layers import Input, concatenate\n",
"from tensorflow.python.keras.models import load_model\n",
"from tensorflow.python.keras.models import Model\n",
"from tensorflow.python.keras.preprocessing import image\n",
"from tensorflow.python.keras.preprocessing.image import img_to_array\n",
"from tensorflow.python.keras.preprocessing.image import ImageDataGenerator\n",
"from tensorflow.python.keras.preprocessing.image import load_img"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.decomposition import PCA\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.display import display, Image"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
" \"cluster\": {\n",
" \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
" },\n",
" \"task\": {\"type\": \"worker\", \"index\": 1 },\n",
" \"num_workers\": 5\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"IMG_HEIGHT = 416\n",
"IMG_WIDTH= 804\n",
"batch_size = 128\n",
"epochs = 32\n",
"# Full size, machine barfs probably needs more RAM\n",
"#IMG_HEIGHT = 832\n",
"#IMG_WIDTH = 1606\n",
"# Good results\n",
"#batch_size = 128\n",
"#epochs = 6"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"tf.keras.backend.clear_session()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Initializing local devices since in-graph multi-worker training with `MirroredStrategy` is not supported in eager mode. TF_CONFIG will be ignored when when initializing `MirroredStrategy`.\n",
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)\n"
]
}
],
"source": [
"#from tensorflow.python.framework.ops import disable_eager_execution\n",
"#disable_eager_execution()\n",
"# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
"#multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
"# Central Storage Strategy\n",
"#central_storage_strategy = tf.distribute.experimental.CentralStorageStrategy()\n",
"# ParameterServerStrategy needs TF_CONFIG\n",
"#ps_strategy = tf.distribute.experimental.ParameterServerStrategy()\n",
"# OneDeviceStrategy No cluster\n",
"#strategy = tf.distribute.OneDeviceStrategy(device=\"/CPU:0\")\n",
"# Mirrored Strategy\n",
"mirrored_strategy = tf.distribute.MirroredStrategy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dir = os.path.join('data/', 'train')\n",
"val_dir = os.path.join('data/', 'val')\n",
"train_good_dir = os.path.join(train_dir, 'good')\n",
"train_bad_dir = os.path.join(train_dir, 'bad')\n",
"val_good_dir = os.path.join(val_dir, 'good')\n",
"val_bad_dir = os.path.join(val_dir, 'bad')\n",
"num_train_good = len(os.listdir(train_good_dir))\n",
"num_train_bad = len(os.listdir(train_bad_dir))\n",
"num_val_good = len(os.listdir(val_good_dir))\n",
"num_val_bad = len(os.listdir(val_bad_dir))\n",
"total_train = num_train_good + num_train_bad\n",
"total_val = num_val_good + num_val_bad"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('total training good images:', num_train_good)\n",
"print('total training bad images:', num_train_bad)\n",
"print(\"--\")\n",
"print(\"Total training images:\", total_train)\n",
"print('total validation good images:', num_val_good)\n",
"print('total validation bad images:', num_val_bad)\n",
"print(\"--\")\n",
"print(\"Total validation images:\", total_val)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"--\")\n",
"print(\"Reduce training and validation set when testing\")\n",
"total_train = 16\n",
"total_val = 16\n",
"print(\"Reduced training images:\", total_train)\n",
"print(\"Reduced validation images:\", total_val)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
")\n",
"val_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
")\n",
"train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
" directory=train_dir,\n",
" shuffle=True,\n",
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
" class_mode='binary')\n",
"val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
" directory=val_dir,\n",
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
" class_mode='binary')"
]
},
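{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only (not part of the original workflow): one possible way to wrap the\n",
"# generator setup above into a reusable function, toward this notebook's goal of\n",
"# converting the workflow to functions. make_data_gens() is an assumed name.\n",
"def make_data_gens(train_dir, val_dir, img_height, img_width, batch_size):\n",
"    train_gen = ImageDataGenerator(rescale=1./255).flow_from_directory(\n",
"        batch_size=batch_size, directory=train_dir, shuffle=True,\n",
"        target_size=(img_height, img_width), class_mode='binary')\n",
"    val_gen = ImageDataGenerator(rescale=1./255).flow_from_directory(\n",
"        batch_size=batch_size, directory=val_dir,\n",
"        target_size=(img_height, img_width), class_mode='binary')\n",
"    return train_gen, val_gen\n",
"#train_data_gen, val_data_gen = make_data_gens(train_dir, val_dir, IMG_HEIGHT, IMG_WIDTH, batch_size)"
]
},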
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_train_images, _ = next(train_data_gen)\n",
"sample_val_images, _ = next(val_data_gen)\n",
"# This function will plot images in the form of a grid with 1 row and 3 columns where images are placed in each column.\n",
"def plotImages(images_arr):\n",
" fig, axes = plt.subplots(1, 3, figsize=(20,20))\n",
" axes = axes.flatten()\n",
" for img, ax in zip( images_arr, axes):\n",
" ax.imshow(img)\n",
" ax.axis('off')\n",
" plt.tight_layout()\n",
" plt.show()\n",
" \n",
"plotImages(sample_train_images[0:3])\n",
"plotImages(sample_val_images[0:3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext tensorboard\n",
"!rm -rf ./clusterlogs/\n",
"#log_dir=\"clusterlogs/fit/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
"log_dir=\"clusterlogs\"\n",
"#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)\n",
"tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)\n",
"%tensorboard --logdir clusterlogs --port 6006"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"class MyModel(tensorflow.keras.Model):\n",
" def __init__(self):\n",
" super(MyModel, self).__init__()\n",
" self.dense1 = tensorflow.keras.layers.Dense(4, activation=tensorflow.nn.relu)\n",
" self.dense2 = tensorflow.keras.layers.Dense(5, activation=tensorflow.nn.softmax)\n",
" def call(self, inputs):\n",
" x = self.dense1(inputs)\n",
" return self.dense2(x)\n",
"model = MyModel()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#with multiworker_strategy.scope():\n",
"with mirrored_strategy.scope():\n",
" model = Sequential([\n",
" Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
" MaxPooling2D(),\n",
" Conv2D(32, 3, padding='same', activation='relu'),\n",
" MaxPooling2D(),\n",
" Conv2D(64, 3, padding='same', activation='relu'),\n",
" MaxPooling2D(),\n",
" Flatten(),\n",
" Dense(512, activation='relu'),\n",
" Dense(1, activation='sigmoid')\n",
" ])\n",
" model.compile(optimizer='adam',\n",
" loss='binary_crossentropy',\n",
" metrics=['accuracy'])\n",
" history = model.fit_generator(\n",
" train_data_gen,\n",
" steps_per_epoch=total_train // batch_size,\n",
" epochs=epochs,\n",
" validation_data=val_data_gen,\n",
" validation_steps=total_val // batch_size,\n",
" verbose=2,\n",
" callbacks=[tensorboard_callback]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#strategy.num_replicas_in_sync"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Image.LOAD_TRUNCATED_IMAGES = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"acc = history.history['accuracy']\n",
"val_acc = history.history['val_accuracy']\n",
"loss = history.history['loss']\n",
"val_loss = history.history['val_loss']\n",
"epochs_range = range(epochs)\n",
"plt.figure(figsize=(8, 8))\n",
"plt.subplot(1, 2, 1)\n",
"plt.plot(epochs_range, acc, label='Training Accuracy')\n",
"plt.plot(epochs_range, val_acc, label='Validation Accuracy')\n",
"plt.legend(loc='lower right')\n",
"plt.title('Training and Validation Accuracy')\n",
"plt.subplot(1, 2, 2)\n",
"plt.plot(epochs_range, loss, label='Training Loss')\n",
"plt.plot(epochs_range, val_loss, label='Validation Loss')\n",
"plt.legend(loc='upper right')\n",
"plt.title('Training and Validation Loss')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"TRAINING info\")\n",
"print(train_dir)\n",
"print(train_good_dir)\n",
"print(train_bad_dir)\n",
"print(train_image_generator)\n",
"print(train_data_gen)\n",
"#print(sample_train_images)\n",
"print(history)\n",
"model.to_json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save .tf model data here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save('data/models/DUV/wut-train-cluster.tf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save('data/models/DUV/wut-train-cluster.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save_weights('data/models/DUV/wut-weights-train-cluster.tf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save_weights('data/models/DUV/wut-weights-train-cluster.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The End"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}