more setup for cluster

2020-01-17 17:46:23 -07:00 · 2020-01-17 17:46:23 -07:00 · 7efde346fe
parent 1770f749b2
commit 7efde346fe
3 changed files with 313 additions and 79 deletions
--- a/jupyter/wut-predict.ipynb
+++ b/jupyter/wut-predict.ipynb
@ -93,16 +93,6 @@
    "from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tensorflow.python.keras.models import Model\n",
-    "from tensorflow.python.keras.layers import Input, concatenate"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -158,6 +148,24 @@
    "print(\"Python import done\")"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Load HDF file\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = load_model('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -268,25 +276,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "plotImages(sample_test_images[0:3])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# load .h5 file here plz"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = load_model('data/wut.h5')"
+    "plotImages(sample_test_images[0:1])"
   ]
  },
  {
@ -318,7 +308,7 @@
   "source": [
    "prediction = model.predict(\n",
    "    x=test_data_gen,\n",
-    "    verbose=2\n",
+    "    verbose=1\n",
    ")\n",
    "print(\"end predict\")"
   ]
@ -377,32 +367,6 @@
    "print('Observation: %s' % (rating))"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if prediction_bool[1] == False:\n",
-    "  rating = 'bad'\n",
-    "else:\n",
-    "  rating = 'good'\n",
-    "print('Observation: %s' % (rating))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if prediction_bool[2] == False:\n",
-    "  rating = 'bad'\n",
-    "else:\n",
-    "  rating = 'good'\n",
-    "print('Observation: %s' % (rating))"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/jupyter/wut-train-cluster.ipynb
+++ b/jupyter/wut-train-cluster.ipynb
@ -6,11 +6,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# wut-train --- What U Think? SatNOGS Observation AI, training application.\n",
+    "# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.\n",
    "#\n",
    "# https://spacecruft.org/spacecruft/satnogs-wut\n",
    "#\n",
-    "# Based on data/train and data/val directories builds a wut.h5 file."
+    "# Based on data/train and data/val directories builds the wut-train-cluster.tf and .h5 model files in data/models/DUV."
   ]
  },
  {
@ -149,6 +149,18 @@
    "from IPython.display import display, Image"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Cluster\n",
+    "from __future__ import absolute_import, division, print_function, unicode_literals\n",
+    "import tensorflow as tf\n",
+    "import simplejson as json"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -158,6 +170,141 @@
    "print(\"Python import done\")"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# CLUSTER"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Valid roles: \"chief\", \"worker\", \"ps\" and \"evaluator\". Each node needs its own task index (its position in the worker list).\n",
+    "os.environ[\"TF_CONFIG\"] = json.dumps({\n",
+    "    \"cluster\": {\n",
+    "        \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
+    "    },\n",
+    "   \"task\": {\"type\": \"worker\", \"index\": 1},\n",
+    "   \"num_workers\": 5\n",
+    "})\n",
+    "# \"worker\": [\"ml1:port\", \"ml2:port\", \"ml3:port\", \"ml4:port\", \"ml5:port\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
+    "multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#os.environ[\"TF_CONFIG\"] = json.dumps({\n",
+    "#    \"cluster\": {\n",
+    "#        \"worker\": [\"host1:port\", \"host2:port\", \"host3:port\"]\n",
+    "#    },\n",
+    "#   \"task\": {\"type\": \"worker\", \"index\": 1}\n",
+    "#})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pick one Strategy Below\n",
+    "# moved further down, above Sequential()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Central Storage Strategy\n",
+    "#central_storage_strategy = tf.distribute.experimental.CentralStorageStrategy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ParameterServerStrategy needs TF_CONFIG\n",
+    "#ps_strategy = tf.distribute.experimental.ParameterServerStrategy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# OneDeviceStrategy No cluster\n",
+    "#strategy = tf.distribute.OneDeviceStrategy(device=\"/CPU:0\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Mirrored Strategy\n",
+    "#mirrored_strategy = tf.distribute.MirroredStrategy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Mirrored Strategy\n",
+    "#mirrored_strategy = tf.distribute.MirroredStrategy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# END CLUSTER"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -259,8 +406,8 @@
   "outputs": [],
   "source": [
    "print(\"Reduce training and validation set when testing\")\n",
-    "#total_train = 1\n",
-    "#total_val = 1\n",
+    "total_train = 100\n",
+    "total_val = 100\n",
    "print(\"Train =\")\n",
    "print(total_train)\n",
    "print(\"Validation =\")\n",
@ -284,8 +431,8 @@
    "#epochs = 16 # BEST SO FAR\n",
    "#\n",
    "# Fast, but reasonable answers\n",
-    "batch_size = 64\n",
-    "epochs = 4\n",
+    "#batch_size = 64\n",
+    "#epochs = 4\n",
    "# Faster, but reasonable answers ?\n",
    "#batch_size = 32\n",
    "#epochs = 2\n",
@ -295,8 +442,8 @@
    "#epochs = 3\n",
    "#\n",
    "# Smallest set for testing\n",
-    "#batch_size = 1\n",
-    "#epochs = 1"
+    "batch_size = 8\n",
+    "epochs = 4"
   ]
  },
  {
@ -401,7 +548,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "plotImages(sample_train_images[0:3])"
+    "#plotImages(sample_train_images[0:3])"
   ]
  },
  {
@ -410,7 +557,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "plotImages(sample_val_images[0:3])"
+    "#plotImages(sample_val_images[0:3])"
   ]
  },
  {
@ -419,7 +566,47 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "model = Sequential([\n",
+    "# CLUSTER"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# USE CPU only (doesn't work)\n",
+    "#import tensorflow as tf\n",
+    "#sess = Session(config=tf.ConfigProto(device_count={'GPU': 0}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#with mirrored_strategy.scope():\n",
+    "#  model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])\n",
+    "#  model.compile(loss='mse', optimizer='sgd')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#with mirrored_strategy.scope():\n",
+    "with multiworker_strategy.scope():\n",
+    "  model = Sequential([\n",
    "    Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
    "    MaxPooling2D(),\n",
    "    Conv2D(32, 3, padding='same', activation='relu'),\n",
@ -429,7 +616,10 @@
    "    Flatten(),\n",
    "    Dense(512, activation='relu'),\n",
    "    Dense(1, activation='sigmoid')\n",
-    "])"
+    "  ])\n",
+    "  model.compile(optimizer='adam',\n",
+    "              loss='binary_crossentropy',\n",
+    "              metrics=['accuracy'])"
   ]
  },
  {
@ -438,9 +628,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "model.compile(optimizer='adam',\n",
-    "              loss='binary_crossentropy',\n",
-    "              metrics=['accuracy'])"
+    "multiworker_strategy.num_replicas_in_sync  # replica count of the strategy the model was built under"
   ]
  },
  {
@ -576,7 +764,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Save .h5 data here"
+    "model.to_json()"
   ]
  },
  {
@ -585,9 +773,80 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "model.save('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
+    "# Save .tf model data here"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save('data/models/DUV/wut-train-cluster.tf')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save('data/models/DUV/wut-train-cluster.h5')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save_weights('data/models/DUV/wut-weights-train-cluster.tf')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save_weights('data/models/DUV/wut-weights-train-cluster.h5')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/jupyter/wut-train.ipynb
+++ b/jupyter/wut-train.ipynb
@ -259,8 +259,8 @@
   "outputs": [],
   "source": [
    "print(\"Reduce training and validation set when testing\")\n",
-    "#total_train = 100\n",
-    "#total_val = 100\n",
+    "#total_train = 1\n",
+    "#total_val = 1\n",
    "print(\"Train =\")\n",
    "print(total_train)\n",
    "print(\"Validation =\")\n",
@ -280,12 +280,23 @@
    "# Large Test\n",
    "#batch_size = 512 # FAIL\n",
    "#batch_size = 256 # FAIL\n",
-    "batch_size = 192\n",
-    "epochs = 16\n",
+    "#batch_size = 192 # BEST SO FAR\n",
+    "#epochs = 16 # BEST SO FAR\n",
+    "#\n",
+    "# Fast, but reasonable answers\n",
+    "batch_size = 64\n",
+    "epochs = 4\n",
+    "# Faster, but reasonable answers ?\n",
+    "#batch_size = 32\n",
+    "#epochs = 2\n",
    "#\n",
    "# Testing, faster more inaccurate results\n",
    "#batch_size = 16\n",
-    "#epochs = 3"
+    "#epochs = 3\n",
+    "#\n",
+    "# Smallest set for testing\n",
+    "#batch_size = 1\n",
+    "#epochs = 1"
   ]
  },
  {
@ -574,7 +585,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "model.save('data/hdf/wut-train.h5')"
+    "model.save('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
   ]
  },
  {