Distribution is working

2020-01-18 17:14:49 -07:00 · 2020-01-18 17:14:49 -07:00 · bc94c25849
parent cb0f180c87
commit bc94c25849
2 changed files with 167 additions and 86 deletions
--- a/README-distributed.md
+++ b/README-distributed.md
@ -41,11 +41,12 @@ done
 ```
 # On worker nodes:
 sudo apt update
-sudo apt install python3-pip
+sudo apt install python3-pip sshfs
 # XXX deps...
 pip3 install --upgrade setuptools
 pip3 install --user tensorflow-2.1.0-cp37-cp37m-linux_x86_64.whl
 pip3 install --user simplejson
+pip3 install --user pillow
 ```

 # Usage
--- a/jupyter/wut-train-cluster-fn.ipynb
+++ b/jupyter/wut-train-cluster-fn.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -17,7 +17,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@ -31,7 +31,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -53,7 +53,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@ -68,7 +68,24 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tf 2.1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "print('tf {}'.format(tf.__version__))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -83,7 +100,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@ -101,7 +118,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@ -110,7 +127,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
@ -120,11 +137,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
+      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
+      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n"
+     ]
+    }
+   ],
   "source": [
-    "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
+    "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n",
+    "    tf.distribute.experimental.CollectiveCommunication.RING)\n",
    "#\n",
    "# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
    "#multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
@ -140,7 +168,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
@ -160,9 +188,24 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total training good images: 3291\n",
+      "total training bad images: 609\n",
+      "--\n",
+      "Total training images: 3900\n",
+      "total validation good images: 3361\n",
+      "total validation bad images: 601\n",
+      "--\n",
+      "Total validation images: 3962\n"
+     ]
+    }
+   ],
   "source": [
    "print('total training good images:', num_train_good)\n",
    "print('total training bad images:', num_train_bad)\n",
@ -176,9 +219,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--\n",
+      "Reduce training and validation set when testing\n",
+      "Reduced training images: 3900\n",
+      "Reduced validation images: 3962\n"
+     ]
+    }
+   ],
   "source": [
    "print(\"--\")\n",
    "print(\"Reduce training and validation set when testing\")\n",
@ -190,9 +244,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 3900 images belonging to 2 classes.\n",
+      "Found 3962 images belonging to 2 classes.\n"
+     ]
+    }
+   ],
   "source": [
    "train_image_generator = ImageDataGenerator(\n",
    "    rescale=1./255\n",
@ -213,7 +276,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
@ -235,7 +298,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
@ -250,7 +313,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
@ -268,7 +331,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
@ -277,7 +340,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
@ -298,7 +361,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
@ -307,7 +370,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
@ -321,7 +384,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
@ -330,7 +393,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
@ -340,53 +403,46 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
-    "def get_fit_model():\n",
-    "    model = get_compiled_model()\n",
-    "    model.fit(\n",
-    "        train_data_gen,\n",
-    "        steps_per_epoch=total_train // batch_size,\n",
-    "        epochs=epochs,\n",
-    "        validation_data=val_data_gen,\n",
-    "        validation_steps=total_val // batch_size,\n",
-    "        verbose=2\n",
-    "        )\n",
-    "    return model"
+    "#def get_fit_model():\n",
+    "#    model = get_compiled_model()\n",
+    "#    model.fit(\n",
+    "#        train_data_gen,\n",
+    "#        steps_per_epoch=total_train // batch_size,\n",
+    "#        epochs=epochs,\n",
+    "#        validation_data=val_data_gen,\n",
+    "#        validation_steps=total_val // batch_size,\n",
+    "#        verbose=2\n",
+    "#        )\n",
+    "#    return model"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
-    "#get_uncompiled_model()"
+    "#with strategy.scope():\n",
+    "#    get_uncompiled_model()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
-    "#get_compiled_model()"
+    "#with strategy.scope():\n",
+    "#    get_compiled_model()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#get_fit_model()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
@ -396,42 +452,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with strategy.scope():\n",
-    "   get_uncompiled_model()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with strategy.scope():\n",
-    "    get_compiled_model()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with strategy.scope():\n",
-    "    get_fit_model()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "#multi_worker_model = get_compiled_model()\n",
-    "#\n",
    "#multi_worker_model.fit(\n",
    "#    x=train_data_gen,\n",
    "#    epochs=epochs,\n",
@ -439,6 +464,51 @@
    "#    )"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
+      "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
+      "INFO:tensorflow:Running Distribute Coordinator with mode = 'independent_worker', cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, environment = None, rpc_layer = 'grpc'\n",
+      "WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an \"evaluator\" task exists in the cluster.\n",
+      "WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.\n",
+      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
+      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
+      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
+      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
+      "WARNING:tensorflow:ModelCheckpoint callback is not provided. Workers will need to restart training if any fails.\n",
+      "WARNING:tensorflow:sample_weight modes were coerced from\n",
+      "  ...\n",
+      "    to  \n",
+      "  ['...']\n",
+      "WARNING:tensorflow:sample_weight modes were coerced from\n",
+      "  ...\n",
+      "    to  \n",
+      "  ['...']\n",
+      "Train for 121 steps, validate for 123 steps\n",
+      "Epoch 1/4\n"
+     ]
+    }
+   ],
+   "source": [
+    "with strategy.scope():\n",
+    "    model = get_compiled_model()\n",
+    "    model.fit(\n",
+    "        train_data_gen,\n",
+    "        steps_per_epoch=total_train // batch_size,\n",
+    "        epochs=epochs,\n",
+    "        validation_data=val_data_gen,\n",
+    "        validation_steps=total_val // batch_size,\n",
+    "        verbose=2\n",
+    "        )"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -446,6 +516,7 @@
   "outputs": [],
   "source": [
    "#with strategy.scope():\n",
+    "#    multi_worker_model = get_compiled_model()\n",
    "#    multi_worker_model.fit(\n",
    "#        x=train_data_gen,\n",
    "#        epochs=epochs,\n",
@ -479,6 +550,15 @@
    "#model.to_json()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#history = model.fit(X, y, batch_size=32, epochs=40, validation_split=0.1)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,