distributed getting closer...

master
ml server 2020-01-18 15:02:34 -07:00
parent 59d0b82469
commit 0be648d1d1
3 changed files with 212 additions and 69 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -31,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -53,7 +53,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -68,22 +68,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
" \"cluster\": {\n",
" \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
" \"worker\": [ \"10.100.100.130:2222\", \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
" },\n",
" \"task\": {\"type\": \"worker\", \"index\": 1 },\n",
" \"task\": {\"type\": \"worker\", \"index\": 0 },\n",
" \"num_workers\": 5\n",
"})"
]
},
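Reviewer note, not part of the commit: since this notebook now claims task index 0 in a six-entry worker list, a quick sanity check after running the cell above can catch an index that points past the cluster. A minimal sketch (assumes the cell above has already set TF_CONFIG):

# Sanity check (illustrative only): confirm the task index addresses one of
# the workers written into TF_CONFIG above.
import json
import os

cfg = json.loads(os.environ["TF_CONFIG"])
workers = cfg["cluster"]["worker"]
idx = cfg["task"]["index"]
assert 0 <= idx < len(workers), "task index must point at one of the workers"
print("this process is worker %d of %d: %s" % (idx, len(workers), workers[idx]))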
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -101,7 +101,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -110,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -120,9 +120,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.AUTO\n"
]
}
],
"source": [
"strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
"#\n",
@@ -140,7 +150,7 @@
},
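Reviewer note: the log above reports communication = CollectiveCommunication.AUTO. If AUTO ever picks badly on this CPU-only cluster, the strategy constructor of this TF 2.x experimental API also accepts an explicit choice; a hedged sketch:

# Optional: pin the collective implementation instead of letting AUTO decide
# (RING suits CPU-only nodes; NCCL needs GPUs).
import tensorflow as tf

strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.RING
)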
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -160,9 +170,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total training good images: 3291\n",
"total training bad images: 609\n",
"--\n",
"Total training images: 3900\n",
"total validation good images: 3361\n",
"total validation bad images: 601\n",
"--\n",
"Total validation images: 3962\n"
]
}
],
"source": [
"print('total training good images:', num_train_good)\n",
"print('total training bad images:', num_train_bad)\n",
@@ -176,9 +201,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--\n",
"Reduce training and validation set when testing\n",
"Reduced training images: 3900\n",
"Reduced validation images: 3962\n"
]
}
],
"source": [
"print(\"--\")\n",
"print(\"Reduce training and validation set when testing\")\n",
@@ -190,9 +226,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3900 images belonging to 2 classes.\n",
"Found 3962 images belonging to 2 classes.\n"
]
}
],
"source": [
"train_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
@@ -213,7 +258,7 @@
},
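Reviewer note: the hunk cuts this cell off right after ImageDataGenerator(rescale=1./255), but the "Found 3900 images belonging to 2 classes" output above is what Keras' flow_from_directory prints, so the rest of the cell presumably looks roughly like this (the directory variable name is an assumption):

# Assumed continuation of the cell (not shown in this hunk): point the
# generator at the training directory and emit binary-labelled batches.
train_data_gen = train_image_generator.flow_from_directory(
    directory=train_dir,                    # assumed variable name
    batch_size=batch_size,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='binary',                    # matches the single sigmoid output
    shuffle=True,
)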
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -235,7 +280,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -250,29 +295,55 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"#strategy.num_replicas_in_sync\n",
"## Compute global batch size using number of replicas.\n",
"#BATCH_SIZE_PER_REPLICA = 5\n",
"#print(BATCH_SIZE_PER_REPLICA)\n",
"#global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
"# strategy.num_replicas_in_sync)\n",
"#print(global_batch_size)\n",
"#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
"#dataset = dataset.batch(global_batch_size)\n",
"#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
]
},
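Reviewer note: the commented-out cell above sketches the usual scaling rule for synchronous training. Uncommented and tied to the strategy it would look roughly like this; with one CPU replica per worker and six workers the global batch comes to 30, and the 0.2 learning-rate entry is a placeholder I added, not a tuned value from the notebook:

# Runnable form of the commented-out sketch: scale the batch size by the
# number of replicas the strategy reports, then look up a matching LR.
BATCH_SIZE_PER_REPLICA = 5
global_batch_size = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15, 30: 0.2}  # 0.2 is a placeholder
learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]
print(global_batch_size, learning_rate)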
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"#learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def get_uncompiled_model():\n",
" inputs = Input(shape=(IMG_HEIGHT, IMG_WIDTH ,3))\n",
" x = Conv2D(16, 3, padding='same', activation='relu')(inputs)\n",
" x = MaxPooling2D()(x)\n",
" x = Conv2D(32, 3, padding='same', activation='relu')(x)\n",
" x = MaxPooling2D()(x)\n",
" x = Conv2D(64, 3, padding='same', activation='relu')(x)\n",
" x = MaxPooling2D()(x)\n",
" x = Flatten()(x)\n",
" x = Dense(512, activation='relu')(x)\n",
" x = Dense(1, activation='sigmoid')(x)\n",
" outputs = Dense(10, activation='softmax', name='predictions')(x)\n",
" model = Model(inputs=inputs, outputs=outputs)\n",
" model = Sequential([\n",
" Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
" MaxPooling2D(),\n",
" Conv2D(32, 3, padding='same', activation='relu'),\n",
" MaxPooling2D(),\n",
" Conv2D(64, 3, padding='same', activation='relu'),\n",
" MaxPooling2D(),\n",
" Flatten(),\n",
" Dense(512, activation='relu'),\n",
" Dense(1, activation='sigmoid')\n",
" ])\n",
" return model"
]
},
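Reviewer note: get_compiled_model() is called below but its body sits outside the hunks shown in this diff. A hypothetical sketch of what it likely does, given the single sigmoid output of get_uncompiled_model():

# Hypothetical body for get_compiled_model() -- the real one is not shown in
# this diff.  A single sigmoid output suggests a binary objective.
def get_compiled_model():
    model = get_uncompiled_model()
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model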
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -281,7 +352,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -295,7 +366,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -304,7 +375,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -314,7 +385,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -333,20 +404,68 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#model = get_fit_model()"
"#get_uncompiled_model()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"strategy.num_replicas_in_sync"
"#get_compiled_model()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"#get_fit_model()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"#with strategy.scope():\n",
"# get_fit_model()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"with strategy.scope():\n",
" get_uncompiled_model()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = AUTO\n",
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = AUTO\n"
]
}
],
"source": [
"with strategy.scope():\n",
" get_compiled_model()"
]
},
{
@@ -356,8 +475,6 @@
"outputs": [],
"source": [
"with strategy.scope():\n",
" get_uncompiled_model()\n",
" get_compiled_model()\n",
" get_fit_model()"
]
},
@@ -367,7 +484,13 @@
"metadata": {},
"outputs": [],
"source": [
"model.summary()"
"#multi_worker_model = get_compiled_model()\n",
"#\n",
"#multi_worker_model.fit(\n",
"# x=train_data_gen,\n",
"# epochs=epochs,\n",
"# steps_per_epoch=total_train // batch_size\n",
"# )"
]
},
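Reviewer note: uncommented, the usual shape of the cell above is to build and compile under strategy.scope() and then call fit normally; a sketch using the notebook's own names:

# Build and compile under the strategy so the variables are mirrored across
# workers, then train with the generator defined earlier.
with strategy.scope():
    multi_worker_model = get_compiled_model()

multi_worker_model.fit(
    x=train_data_gen,
    epochs=epochs,
    steps_per_epoch=total_train // batch_size,
)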
{
@@ -376,7 +499,21 @@
"metadata": {},
"outputs": [],
"source": [
"#Image.LOAD_TRUNCATED_IMAGES = True"
"#with strategy.scope():\n",
"# multi_worker_model.fit(\n",
"# x=train_data_gen,\n",
"# epochs=epochs,\n",
"# steps_per_epoch=total_train // batch_size\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#model.summary()"
]
},
{
@@ -392,8 +529,8 @@
"print(train_image_generator)\n",
"print(train_data_gen)\n",
"#print(sample_train_images)\n",
"print(history)\n",
"model.to_json()"
"#print(history)\n",
"#model.to_json()"
]
},
{

View File

@@ -17,9 +17,9 @@
# so the index is hostname minus one (without alpha).
HOSTNUM=`hostname | sed -e 's/ml//g'`
let HOSTNUM=$HOSTNUM-1
#let HOSTNUM=$HOSTNUM-1
export TF_CONFIG='{"cluster": {"worker": ["ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
export TF_CONFIG='{"cluster": {"worker": [ "10.100.100.130:2222", "ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
echo $TF_CONFIG
python3 wut-worker.py
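Reviewer note: with 10.100.100.130 added at index 0, ml1-ml5 now map to indices 1-5, which is why the minus-one is commented out above. The same derivation could live inside wut-worker.py instead of this wrapper; an illustrative sketch (assumes the ml<N> hostname convention):

# Illustrative alternative (not in this commit): derive the task index in the
# worker itself.  ml1..ml5 map to indices 1..5; any other host is the chief.
import json
import os
import re
import socket

WORKERS = ["10.100.100.130:2222", "ml1:2222", "ml2:2222",
           "ml3:2222", "ml4:2222", "ml5:2222"]

match = re.match(r"ml(\d+)", socket.gethostname())
index = int(match.group(1)) if match else 0   # non-ml host takes index 0

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": WORKERS},
    "task": {"type": "worker", "index": index},
})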

View File

@@ -32,19 +32,19 @@ IMG_WIDTH= 804
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
def get_uncompiled_model():
inputs = Input(shape=(IMG_HEIGHT, IMG_WIDTH ,3))
x = Conv2D(16, 3, padding='same', activation='relu')(inputs)
x = MaxPooling2D()(x)
x = Conv2D(32, 3, padding='same', activation='relu')(x)
x = MaxPooling2D()(x)
x = Conv2D(64, 3, padding='same', activation='relu')(x)
x = MaxPooling2D()(x)
x = Flatten()(x)
x = Dense(512, activation='relu')(x)
x = Dense(1, activation='sigmoid')(x)
outputs = Dense(10, activation='softmax', name='predictions')(x)
model = Model(inputs=inputs, outputs=outputs)
model = Sequential([
Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),
MaxPooling2D(),
Conv2D(32, 3, padding='same', activation='relu'),
MaxPooling2D(),
Conv2D(64, 3, padding='same', activation='relu'),
MaxPooling2D(),
Flatten(),
Dense(512, activation='relu'),
Dense(1, activation='sigmoid')
])
return model
def get_compiled_model():
@@ -57,15 +57,21 @@ def get_compiled_model():
def get_fit_model():
model = get_compiled_model()
model.fit(
train_data_gen,
steps_per_epoch=total_train // batch_size,
epochs=epochs,
validation_data=val_data_gen,
validation_steps=total_val // batch_size,
verbose=2
)
return model
#def get_fit_model():
# model = get_compiled_model()
# model.fit(
# train_data_gen,
# steps_per_epoch=total_train // batch_size,
# epochs=epochs,
# validation_data=val_data_gen,
# validation_steps=total_val // batch_size,
# verbose=2
# )
# return model
with strategy.scope():
get_uncompiled_model()
get_compiled_model()
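Reviewer note: the worker script now stops after building the models under the strategy and discards the return values. A sketch of where it seems to be heading, mirroring the commented-out get_fit_model above:

# Sketch only (not part of this commit): keep the model handle from the
# scope and train it, as the commented-out helper did.
with strategy.scope():
    model = get_compiled_model()

history = model.fit(
    train_data_gen,
    steps_per_epoch=total_train // batch_size,
    epochs=epochs,
    validation_data=val_data_gen,
    validation_steps=total_val // batch_size,
    verbose=2,
)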