temporary dev notebook to convert workflow to functions for distribution
parent
cd80fa6d31
commit
99672dd02f
|
@ -0,0 +1,446 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.\n",
|
||||
"#\n",
|
||||
"# https://spacecruft.org/spacecruft/satnogs-wut\n",
|
||||
"#\n",
|
||||
"# Trains on the data/train and data/val directories and saves wut-train-cluster.tf (plus .h5 and weights files) under data/models/DUV/.\n",
|
||||
"# GPLv3+\n",
|
||||
"# Built using Jupyter, Tensorflow, Keras"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from __future__ import absolute_import, division, print_function, unicode_literals\n",
|
||||
"from __future__ import print_function\n",
|
||||
"import os\n",
|
||||
"import numpy as np\n",
|
||||
"import simplejson as json\n",
|
||||
"import datetime"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import tensorflow as tf\n",
|
||||
"import tensorflow.python.keras\n",
|
||||
"from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D\n",
|
||||
"from tensorflow.python.keras import optimizers\n",
|
||||
"from tensorflow.python.keras import Sequential\n",
|
||||
"from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense\n",
|
||||
"from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D\n",
|
||||
"from tensorflow.python.keras.layers import Input, concatenate\n",
|
||||
"from tensorflow.python.keras.models import load_model\n",
|
||||
"from tensorflow.python.keras.models import Model\n",
|
||||
"from tensorflow.python.keras.preprocessing import image\n",
|
||||
"from tensorflow.python.keras.preprocessing.image import img_to_array\n",
|
||||
"from tensorflow.python.keras.preprocessing.image import ImageDataGenerator\n",
|
||||
"from tensorflow.python.keras.preprocessing.image import load_img"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.decomposition import PCA\n",
|
||||
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
|
||||
"import ipywidgets as widgets\n",
|
||||
"from IPython.display import display, Image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
|
||||
" \"cluster\": {\n",
|
||||
" \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
|
||||
" },\n",
|
||||
" \"task\": {\"type\": \"worker\", \"index\": 1 },\n",
|
||||
" \"num_workers\": 5\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"IMG_HEIGHT = 416\n",
|
||||
"IMG_WIDTH= 804\n",
|
||||
"batch_size = 128\n",
|
||||
"epochs = 32\n",
|
||||
"# Full size, machine barfs probably needs more RAM\n",
|
||||
"#IMG_HEIGHT = 832\n",
|
||||
"#IMG_WIDTH = 1606\n",
|
||||
"# Good results\n",
|
||||
"#batch_size = 128\n",
|
||||
"#epochs = 6"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tf.keras.backend.clear_session()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:tensorflow:Initializing local devices since in-graph multi-worker training with `MirroredStrategy` is not supported in eager mode. TF_CONFIG will be ignored when when initializing `MirroredStrategy`.\n",
|
||||
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
|
||||
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#from tensorflow.python.framework.ops import disable_eager_execution\n",
|
||||
"#disable_eager_execution()\n",
|
||||
"# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
|
||||
"#multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
|
||||
"# Central Storage Strategy\n",
|
||||
"#central_storage_strategy = tf.distribute.experimental.CentralStorageStrategy()\n",
|
||||
"# ParameterServerStrategy needs TF_CONFIG\n",
|
||||
"#ps_strategy = tf.distribute.experimental.ParameterServerStrategy()\n",
|
||||
"# OneDeviceStrategy No cluster\n",
|
||||
"#strategy = tf.distribute.OneDeviceStrategy(device=\"/CPU:0\")\n",
|
||||
"# Mirrored Strategy\n",
|
||||
"mirrored_strategy = tf.distribute.MirroredStrategy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_dir = os.path.join('data/', 'train')\n",
|
||||
"val_dir = os.path.join('data/', 'val')\n",
|
||||
"train_good_dir = os.path.join(train_dir, 'good')\n",
|
||||
"train_bad_dir = os.path.join(train_dir, 'bad')\n",
|
||||
"val_good_dir = os.path.join(val_dir, 'good')\n",
|
||||
"val_bad_dir = os.path.join(val_dir, 'bad')\n",
|
||||
"num_train_good = len(os.listdir(train_good_dir))\n",
|
||||
"num_train_bad = len(os.listdir(train_bad_dir))\n",
|
||||
"num_val_good = len(os.listdir(val_good_dir))\n",
|
||||
"num_val_bad = len(os.listdir(val_bad_dir))\n",
|
||||
"total_train = num_train_good + num_train_bad\n",
|
||||
"total_val = num_val_good + num_val_bad"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('total training good images:', num_train_good)\n",
|
||||
"print('total training bad images:', num_train_bad)\n",
|
||||
"print(\"--\")\n",
|
||||
"print(\"Total training images:\", total_train)\n",
|
||||
"print('total validation good images:', num_val_good)\n",
|
||||
"print('total validation bad images:', num_val_bad)\n",
|
||||
"print(\"--\")\n",
|
||||
"print(\"Total validation images:\", total_val)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"--\")\n",
|
||||
"print(\"Reduce training and validation set when testing\")\n",
|
||||
"total_train = 16\n",
|
||||
"total_val = 16\n",
|
||||
"print(\"Reduced training images:\", total_train)\n",
|
||||
"print(\"Reduced validation images:\", total_val)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_image_generator = ImageDataGenerator(\n",
|
||||
" rescale=1./255\n",
|
||||
")\n",
|
||||
"val_image_generator = ImageDataGenerator(\n",
|
||||
" rescale=1./255\n",
|
||||
")\n",
|
||||
"train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
||||
" directory=train_dir,\n",
|
||||
" shuffle=True,\n",
|
||||
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
|
||||
" class_mode='binary')\n",
|
||||
"val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
||||
" directory=val_dir,\n",
|
||||
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
|
||||
" class_mode='binary')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sample_train_images, _ = next(train_data_gen)\n",
|
||||
"sample_val_images, _ = next(val_data_gen)\n",
|
||||
"# This function will plot images in the form of a grid with 1 row and 3 columns where images are placed in each column.\n",
|
||||
"def plotImages(images_arr):\n",
|
||||
"    \"\"\"Show up to three images side by side in a single-row figure.\n",
|
||||
"\n",
|
||||
"    images_arr: iterable of images that Axes.imshow can draw. Only three axes\n",
|
||||
"    are created, so images beyond the third are not drawn.\n",
|
||||
"    \"\"\"\n",
|
||||
"    figure, axis_grid = plt.subplots(1, 3, figsize=(20,20))\n",
|
||||
"    # Pair each image with one axis; zip stops at the shorter of the two.\n",
|
||||
"    for picture, axis in zip(images_arr, axis_grid.flatten()):\n",
|
||||
"        axis.imshow(picture)\n",
|
||||
"        axis.axis('off')\n",
|
||||
"    plt.tight_layout()\n",
|
||||
"    plt.show()\n",
|
||||
" \n",
|
||||
"plotImages(sample_train_images[0:3])\n",
|
||||
"plotImages(sample_val_images[0:3])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%load_ext tensorboard\n",
|
||||
"!rm -rf ./clusterlogs/\n",
|
||||
"#log_dir=\"clusterlogs/fit/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
|
||||
"log_dir=\"clusterlogs\"\n",
|
||||
"#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)\n",
|
||||
"tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)\n",
|
||||
"%tensorboard --logdir clusterlogs --port 6006"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class MyModel(tensorflow.keras.Model):\n",
|
||||
"    \"\"\"Subclassed Keras model made of two stacked Dense layers.\n",
|
||||
"\n",
|
||||
"    dense1 has 4 units with ReLU activation; dense2 has 5 units with softmax.\n",
|
||||
"    NOTE(review): in notebook order the `model` created from this class is\n",
|
||||
"    rebound to a Sequential model in the training cell -- confirm this class\n",
|
||||
"    is still needed.\n",
|
||||
"    \"\"\"\n",
|
||||
"\n",
|
||||
"    def __init__(self):\n",
|
||||
"        super().__init__()\n",
|
||||
"        dense = tensorflow.keras.layers.Dense\n",
|
||||
"        self.dense1 = dense(4, activation=tensorflow.nn.relu)\n",
|
||||
"        self.dense2 = dense(5, activation=tensorflow.nn.softmax)\n",
|
||||
"\n",
|
||||
"    def call(self, inputs):\n",
|
||||
"        \"\"\"Apply dense1 and then dense2 to `inputs` and return the result.\"\"\"\n",
|
||||
"        hidden = self.dense1(inputs)\n",
|
||||
"        return self.dense2(hidden)\n",
|
||||
"model = MyModel()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#with multiworker_strategy.scope():\n",
|
||||
"with mirrored_strategy.scope():\n",
|
||||
" model = Sequential([\n",
|
||||
" Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
|
||||
" MaxPooling2D(),\n",
|
||||
" Conv2D(32, 3, padding='same', activation='relu'),\n",
|
||||
" MaxPooling2D(),\n",
|
||||
" Conv2D(64, 3, padding='same', activation='relu'),\n",
|
||||
" MaxPooling2D(),\n",
|
||||
" Flatten(),\n",
|
||||
" Dense(512, activation='relu'),\n",
|
||||
" Dense(1, activation='sigmoid')\n",
|
||||
" ])\n",
|
||||
" model.compile(optimizer='adam',\n",
|
||||
" loss='binary_crossentropy',\n",
|
||||
" metrics=['accuracy'])\n",
|
||||
" # Train from the directory generators and keep the per-epoch metrics in `history`.\n",
|
||||
" # Step counts are floored at 1: total_train / total_val can be smaller than\n",
|
||||
" # batch_size (the test-reduction cell sets both to 16 while batch_size is 128),\n",
|
||||
" # and plain floor division would then request 0 steps per epoch.\n",
|
||||
" history = model.fit_generator(\n",
|
||||
" train_data_gen,\n",
|
||||
" steps_per_epoch=max(1, total_train // batch_size),\n",
|
||||
" epochs=epochs,\n",
|
||||
" validation_data=val_data_gen,\n",
|
||||
" validation_steps=max(1, total_val // batch_size),\n",
|
||||
" verbose=2,\n",
|
||||
" callbacks=[tensorboard_callback]\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#strategy.num_replicas_in_sync"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.summary()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# NOTE(review): `Image` in this notebook is the class imported from IPython.display,\n",
|
||||
"# so this line sets an attribute on that display class.\n",
|
||||
"# Presumably PIL's ImageFile.LOAD_TRUNCATED_IMAGES was intended -- TODO confirm.\n",
|
||||
"# This cell also sits below the training cell, so in notebook order it runs after fit_generator().\n",
|
||||
"Image.LOAD_TRUNCATED_IMAGES = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"acc = history.history['accuracy']\n",
|
||||
"val_acc = history.history['val_accuracy']\n",
|
||||
"loss = history.history['loss']\n",
|
||||
"val_loss = history.history['val_loss']\n",
|
||||
"epochs_range = range(epochs)\n",
|
||||
"plt.figure(figsize=(8, 8))\n",
|
||||
"plt.subplot(1, 2, 1)\n",
|
||||
"plt.plot(epochs_range, acc, label='Training Accuracy')\n",
|
||||
"plt.plot(epochs_range, val_acc, label='Validation Accuracy')\n",
|
||||
"plt.legend(loc='lower right')\n",
|
||||
"plt.title('Training and Validation Accuracy')\n",
|
||||
"plt.subplot(1, 2, 2)\n",
|
||||
"plt.plot(epochs_range, loss, label='Training Loss')\n",
|
||||
"plt.plot(epochs_range, val_loss, label='Validation Loss')\n",
|
||||
"plt.legend(loc='upper right')\n",
|
||||
"plt.title('Training and Validation Loss')\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"TRAINING info\")\n",
|
||||
"print(train_dir)\n",
|
||||
"print(train_good_dir)\n",
|
||||
"print(train_bad_dir)\n",
|
||||
"print(train_image_generator)\n",
|
||||
"print(train_data_gen)\n",
|
||||
"#print(sample_train_images)\n",
|
||||
"print(history)\n",
|
||||
"model.to_json()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Save .tf model data here"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save('data/models/DUV/wut-train-cluster.tf')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save('data/models/DUV/wut-train-cluster.h5')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save_weights('data/models/DUV/wut-weights-train-cluster.tf')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save_weights('data/models/DUV/wut-weights-train-cluster.h5')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# The End"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
Loading…
Reference in New Issue