From bc94c25849bec6f281aec17a3e8e61f65eec4d2c Mon Sep 17 00:00:00 2001
From: ml server <ml@spacecruft.org>
Date: Sat, 18 Jan 2020 17:14:49 -0700
Subject: [PATCH] Distribution is working

---
 README-distributed.md              |   3 +-
 jupyter/wut-train-cluster-fn.ipynb | 250 +++++++++++++++++++----------
 2 files changed, 167 insertions(+), 86 deletions(-)

diff --git a/README-distributed.md b/README-distributed.md
index 9163b4b..07c1f8e 100644
--- a/README-distributed.md
+++ b/README-distributed.md
@@ -41,11 +41,12 @@ done
 ```
 # On worker nodes:
 sudo apt update
-sudo apt install python3-pip
+sudo apt install python3-pip sshfs
 # XXX deps...
 pip3 install --upgrade setuptools
 pip3 install --user tensorflow-2.1.0-cp37-cp37m-linux_x86_64.whl
 pip3 install --user simplejson
+pip3 install --user pillow
 ```
 
 # Usage
diff --git a/jupyter/wut-train-cluster-fn.ipynb b/jupyter/wut-train-cluster-fn.ipynb
index 531bac2..cc8046e 100644
--- a/jupyter/wut-train-cluster-fn.ipynb
+++ b/jupyter/wut-train-cluster-fn.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -31,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -53,7 +53,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -68,7 +68,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tf 2.1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('tf {}'.format(tf.__version__))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -83,7 +100,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,7 +118,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -110,7 +127,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -120,11 +137,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
+      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
+      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n"
+     ]
+    }
+   ],
    "source": [
-    "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
+    "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n",
+    "    tf.distribute.experimental.CollectiveCommunication.RING)\n",
     "#\n",
     "# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
     "#multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
@@ -140,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -160,9 +188,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total training good images: 3291\n",
+      "total training bad images: 609\n",
+      "--\n",
+      "Total training images: 3900\n",
+      "total validation good images: 3361\n",
+      "total validation bad images: 601\n",
+      "--\n",
+      "Total validation images: 3962\n"
+     ]
+    }
+   ],
    "source": [
     "print('total training good images:', num_train_good)\n",
     "print('total training bad images:', num_train_bad)\n",
@@ -176,9 +219,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--\n",
+      "Reduce training and validation set when testing\n",
+      "Reduced training images: 3900\n",
+      "Reduced validation images: 3962\n"
+     ]
+    }
+   ],
    "source": [
     "print(\"--\")\n",
     "print(\"Reduce training and validation set when testing\")\n",
@@ -190,9 +244,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 3900 images belonging to 2 classes.\n",
+      "Found 3962 images belonging to 2 classes.\n"
+     ]
+    }
+   ],
    "source": [
     "train_image_generator = ImageDataGenerator(\n",
     "    rescale=1./255\n",
@@ -213,7 +276,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -235,7 +298,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -250,7 +313,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -268,7 +331,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -277,7 +340,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -298,7 +361,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -307,7 +370,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -321,7 +384,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -330,7 +393,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -340,53 +403,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def get_fit_model():\n",
-    "    model = get_compiled_model()\n",
-    "    model.fit(\n",
-    "        train_data_gen,\n",
-    "        steps_per_epoch=total_train // batch_size,\n",
-    "        epochs=epochs,\n",
-    "        validation_data=val_data_gen,\n",
-    "        validation_steps=total_val // batch_size,\n",
-    "        verbose=2\n",
-    "        )\n",
-    "    return model"
+    "#def get_fit_model():\n",
+    "#    model = get_compiled_model()\n",
+    "#    model.fit(\n",
+    "#        train_data_gen,\n",
+    "#        steps_per_epoch=total_train // batch_size,\n",
+    "#        epochs=epochs,\n",
+    "#        validation_data=val_data_gen,\n",
+    "#        validation_steps=total_val // batch_size,\n",
+    "#        verbose=2\n",
+    "#        )\n",
+    "#    return model"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#get_uncompiled_model()"
+    "#with strategy.scope():\n",
+    "#   get_uncompiled_model()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#get_compiled_model()"
+    "#with strategy.scope():\n",
+    "#    get_compiled_model()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#get_fit_model()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -396,42 +452,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with strategy.scope():\n",
-    "   get_uncompiled_model()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with strategy.scope():\n",
-    "    get_compiled_model()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with strategy.scope():\n",
-    "    get_fit_model()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
     "#multi_worker_model = get_compiled_model()\n",
-    "#\n",
     "#multi_worker_model.fit(\n",
     "#    x=train_data_gen,\n",
     "#    epochs=epochs,\n",
@@ -439,6 +464,51 @@
     "#    )"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
+      "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
+      "INFO:tensorflow:Running Distribute Coordinator with mode = 'independent_worker', cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, environment = None, rpc_layer = 'grpc'\n",
+      "WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an \"evaluator\" task exists in the cluster.\n",
+      "WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.\n",
+      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
+      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
+      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
+      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
+      "WARNING:tensorflow:ModelCheckpoint callback is not provided. Workers will need to restart training if any fails.\n",
+      "WARNING:tensorflow:sample_weight modes were coerced from\n",
+      "  ...\n",
+      "    to  \n",
+      "  ['...']\n",
+      "WARNING:tensorflow:sample_weight modes were coerced from\n",
+      "  ...\n",
+      "    to  \n",
+      "  ['...']\n",
+      "Train for 121 steps, validate for 123 steps\n",
+      "Epoch 1/4\n"
+     ]
+    }
+   ],
+   "source": [
+    "with strategy.scope():\n",
+    "    model = get_compiled_model()\n",
+    "    model.fit(\n",
+    "        train_data_gen,\n",
+    "        steps_per_epoch=total_train // batch_size,\n",
+    "        epochs=epochs,\n",
+    "        validation_data=val_data_gen,\n",
+    "        validation_steps=total_val // batch_size,\n",
+    "        verbose=2\n",
+    "        )"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -446,6 +516,7 @@
    "outputs": [],
    "source": [
     "#with strategy.scope():\n",
+    "#    multi_worker_model = get_compiled_model()\n",
     "#    multi_worker_model.fit(\n",
     "#        x=train_data_gen,\n",
     "#        epochs=epochs,\n",
@@ -479,6 +550,15 @@
     "#model.to_json()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#history = model.fit(X, y, batch_size=32, epochs=40, validation_split=0.1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,