temporary dev notebook to convert workflow to functions for distribution

master
ml server 2020-01-18 11:46:58 -07:00
parent cd80fa6d31
commit 99672dd02f
1 changed file with 446 additions and 0 deletions


@ -0,0 +1,446 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.\n",
"#\n",
"# https://spacecruft.org/spacecruft/satnogs-wut\n",
"#\n",
"# Based on data/train and data/val directories builds a wut.tf file.\n",
"# GPLv3+\n",
"# Built using Jupyter, Tensorflow, Keras"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import absolute_import, division, print_function, unicode_literals\n",
"from __future__ import print_function\n",
"import os\n",
"import numpy as np\n",
"import simplejson as json\n",
"import datetime"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import tensorflow.python.keras\n",
"from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D\n",
"from tensorflow.python.keras import optimizers\n",
"from tensorflow.python.keras import Sequential\n",
"from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense\n",
"from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D\n",
"from tensorflow.python.keras.layers import Input, concatenate\n",
"from tensorflow.python.keras.models import load_model\n",
"from tensorflow.python.keras.models import Model\n",
"from tensorflow.python.keras.preprocessing import image\n",
"from tensorflow.python.keras.preprocessing.image import img_to_array\n",
"from tensorflow.python.keras.preprocessing.image import ImageDataGenerator\n",
"from tensorflow.python.keras.preprocessing.image import load_img"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.decomposition import PCA\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.display import display, Image"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
" \"cluster\": {\n",
" \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
" },\n",
" \"task\": {\"type\": \"worker\", \"index\": 1 },\n",
" \"num_workers\": 5\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"IMG_HEIGHT = 416\n",
"IMG_WIDTH= 804\n",
"batch_size = 128\n",
"epochs = 32\n",
"# Full size, machine barfs probably needs more RAM\n",
"#IMG_HEIGHT = 832\n",
"#IMG_WIDTH = 1606\n",
"# Good results\n",
"#batch_size = 128\n",
"#epochs = 6"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"tf.keras.backend.clear_session()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Initializing local devices since in-graph multi-worker training with `MirroredStrategy` is not supported in eager mode. TF_CONFIG will be ignored when when initializing `MirroredStrategy`.\n",
"WARNING:tensorflow:There are non-GPU devices in `tf.distribute.Strategy`, not using nccl allreduce.\n",
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)\n"
]
}
],
"source": [
"#from tensorflow.python.framework.ops import disable_eager_execution\n",
"#disable_eager_execution()\n",
"# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
"#multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
"# Central Storage Strategy\n",
"#central_storage_strategy = tf.distribute.experimental.CentralStorageStrategy()\n",
"# ParameterServerStrategy needs TF_CONFIG\n",
"#ps_strategy = tf.distribute.experimental.ParameterServerStrategy()\n",
"# OneDeviceStrategy No cluster\n",
"#strategy = tf.distribute.OneDeviceStrategy(device=\"/CPU:0\")\n",
"# Mirrored Strategy\n",
"mirrored_strategy = tf.distribute.MirroredStrategy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dir = os.path.join('data/', 'train')\n",
"val_dir = os.path.join('data/', 'val')\n",
"train_good_dir = os.path.join(train_dir, 'good')\n",
"train_bad_dir = os.path.join(train_dir, 'bad')\n",
"val_good_dir = os.path.join(val_dir, 'good')\n",
"val_bad_dir = os.path.join(val_dir, 'bad')\n",
"num_train_good = len(os.listdir(train_good_dir))\n",
"num_train_bad = len(os.listdir(train_bad_dir))\n",
"num_val_good = len(os.listdir(val_good_dir))\n",
"num_val_bad = len(os.listdir(val_bad_dir))\n",
"total_train = num_train_good + num_train_bad\n",
"total_val = num_val_good + num_val_bad"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('total training good images:', num_train_good)\n",
"print('total training bad images:', num_train_bad)\n",
"print(\"--\")\n",
"print(\"Total training images:\", total_train)\n",
"print('total validation good images:', num_val_good)\n",
"print('total validation bad images:', num_val_bad)\n",
"print(\"--\")\n",
"print(\"Total validation images:\", total_val)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"--\")\n",
"print(\"Reduce training and validation set when testing\")\n",
"total_train = 16\n",
"total_val = 16\n",
"print(\"Reduced training images:\", total_train)\n",
"print(\"Reduced validation images:\", total_val)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
")\n",
"val_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
")\n",
"train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
" directory=train_dir,\n",
" shuffle=True,\n",
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
" class_mode='binary')\n",
"val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
" directory=val_dir,\n",
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
" class_mode='binary')"
]
},
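{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only (not part of the original workflow): one possible way to wrap the\n",
"# generator setup above into a reusable function, toward this notebook's goal of\n",
"# converting the workflow to functions. make_data_gens() is an assumed name.\n",
"def make_data_gens(train_dir, val_dir, img_height, img_width, batch_size):\n",
"    train_gen = ImageDataGenerator(rescale=1./255).flow_from_directory(\n",
"        batch_size=batch_size, directory=train_dir, shuffle=True,\n",
"        target_size=(img_height, img_width), class_mode='binary')\n",
"    val_gen = ImageDataGenerator(rescale=1./255).flow_from_directory(\n",
"        batch_size=batch_size, directory=val_dir,\n",
"        target_size=(img_height, img_width), class_mode='binary')\n",
"    return train_gen, val_gen\n",
"#train_data_gen, val_data_gen = make_data_gens(train_dir, val_dir, IMG_HEIGHT, IMG_WIDTH, batch_size)"
]
},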
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_train_images, _ = next(train_data_gen)\n",
"sample_val_images, _ = next(val_data_gen)\n",
"# This function will plot images in the form of a grid with 1 row and 3 columns where images are placed in each column.\n",
"def plotImages(images_arr):\n",
" fig, axes = plt.subplots(1, 3, figsize=(20,20))\n",
" axes = axes.flatten()\n",
" for img, ax in zip( images_arr, axes):\n",
" ax.imshow(img)\n",
" ax.axis('off')\n",
" plt.tight_layout()\n",
" plt.show()\n",
" \n",
"plotImages(sample_train_images[0:3])\n",
"plotImages(sample_val_images[0:3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext tensorboard\n",
"!rm -rf ./clusterlogs/\n",
"#log_dir=\"clusterlogs/fit/\" + datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n",
"log_dir=\"clusterlogs\"\n",
"#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)\n",
"tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)\n",
"%tensorboard --logdir clusterlogs --port 6006"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"class MyModel(tensorflow.keras.Model):\n",
" def __init__(self):\n",
" super(MyModel, self).__init__()\n",
" self.dense1 = tensorflow.keras.layers.Dense(4, activation=tensorflow.nn.relu)\n",
" self.dense2 = tensorflow.keras.layers.Dense(5, activation=tensorflow.nn.softmax)\n",
" def call(self, inputs):\n",
" x = self.dense1(inputs)\n",
" return self.dense2(x)\n",
"model = MyModel()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#with multiworker_strategy.scope():\n",
"with mirrored_strategy.scope():\n",
" model = Sequential([\n",
" Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
" MaxPooling2D(),\n",
" Conv2D(32, 3, padding='same', activation='relu'),\n",
" MaxPooling2D(),\n",
" Conv2D(64, 3, padding='same', activation='relu'),\n",
" MaxPooling2D(),\n",
" Flatten(),\n",
" Dense(512, activation='relu'),\n",
" Dense(1, activation='sigmoid')\n",
" ])\n",
" model.compile(optimizer='adam',\n",
" loss='binary_crossentropy',\n",
" metrics=['accuracy'])\n",
" history = model.fit_generator(\n",
" train_data_gen,\n",
" steps_per_epoch=total_train // batch_size,\n",
" epochs=epochs,\n",
" validation_data=val_data_gen,\n",
" validation_steps=total_val // batch_size,\n",
" verbose=2,\n",
" callbacks=[tensorboard_callback]\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#strategy.num_replicas_in_sync"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Image.LOAD_TRUNCATED_IMAGES = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"acc = history.history['accuracy']\n",
"val_acc = history.history['val_accuracy']\n",
"loss = history.history['loss']\n",
"val_loss = history.history['val_loss']\n",
"epochs_range = range(epochs)\n",
"plt.figure(figsize=(8, 8))\n",
"plt.subplot(1, 2, 1)\n",
"plt.plot(epochs_range, acc, label='Training Accuracy')\n",
"plt.plot(epochs_range, val_acc, label='Validation Accuracy')\n",
"plt.legend(loc='lower right')\n",
"plt.title('Training and Validation Accuracy')\n",
"plt.subplot(1, 2, 2)\n",
"plt.plot(epochs_range, loss, label='Training Loss')\n",
"plt.plot(epochs_range, val_loss, label='Validation Loss')\n",
"plt.legend(loc='upper right')\n",
"plt.title('Training and Validation Loss')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"TRAINING info\")\n",
"print(train_dir)\n",
"print(train_good_dir)\n",
"print(train_bad_dir)\n",
"print(train_image_generator)\n",
"print(train_data_gen)\n",
"#print(sample_train_images)\n",
"print(history)\n",
"model.to_json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save .tf model data here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save('data/models/DUV/wut-train-cluster.tf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save('data/models/DUV/wut-train-cluster.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save_weights('data/models/DUV/wut-weights-train-cluster.tf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save_weights('data/models/DUV/wut-weights-train-cluster.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The End"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}