From bc94c25849bec6f281aec17a3e8e61f65eec4d2c Mon Sep 17 00:00:00 2001 From: ml server Date: Sat, 18 Jan 2020 17:14:49 -0700 Subject: [PATCH] Distribution is working --- README-distributed.md | 3 +- jupyter/wut-train-cluster-fn.ipynb | 250 +++++++++++++++++++---------- 2 files changed, 167 insertions(+), 86 deletions(-) diff --git a/README-distributed.md b/README-distributed.md index 9163b4b..07c1f8e 100644 --- a/README-distributed.md +++ b/README-distributed.md @@ -41,11 +41,12 @@ done ``` # On worker nodes: sudo apt update -sudo apt install python3-pip +sudo apt install python3-pip sshfs # XXX deps... pip3 install --upgrade setuptools pip3 install --user tensorflow-2.1.0-cp37-cp37m-linux_x86_64.whl pip3 install --user simplejson +pip3 install --user pillow ``` # Usage diff --git a/jupyter/wut-train-cluster-fn.ipynb b/jupyter/wut-train-cluster-fn.ipynb index 531bac2..cc8046e 100644 --- a/jupyter/wut-train-cluster-fn.ipynb +++ b/jupyter/wut-train-cluster-fn.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf 2.1.0\n" + ] + } + ], + "source": [ + "print('tf {}'.format(tf.__version__))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -83,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -101,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -110,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -120,11 +137,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n", + "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n", + "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n" + ] + } + ], "source": [ - "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n", + "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n", + " tf.distribute.experimental.CollectiveCommunication.RING)\n", "#\n", "# MultiWorkerMirroredStrategy needs TF_CONFIG\n", "#multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n", @@ -140,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -160,9 +188,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total training good images: 3291\n", + "total training bad images: 609\n", + "--\n", + "Total training images: 3900\n", + "total validation good images: 3361\n", + "total validation bad images: 601\n", + "--\n", + "Total validation images: 3962\n" + ] + } + ], "source": [ "print('total training good images:', num_train_good)\n", "print('total training bad images:', num_train_bad)\n", @@ -176,9 +219,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--\n", + "Reduce training and validation set when testing\n", + "Reduced training images: 3900\n", + "Reduced validation images: 3962\n" + ] + } + ], "source": [ "print(\"--\")\n", "print(\"Reduce training and validation set when testing\")\n", @@ -190,9 +244,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 3900 images belonging to 2 classes.\n", + "Found 3962 images belonging to 2 classes.\n" + ] + } + ], "source": [ "train_image_generator = ImageDataGenerator(\n", " rescale=1./255\n", @@ -213,7 +276,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -235,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -250,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -268,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -277,7 +340,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -298,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -307,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -321,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -330,7 +393,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -340,53 +403,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "def get_fit_model():\n", - " model = get_compiled_model()\n", - " model.fit(\n", - " train_data_gen,\n", - " steps_per_epoch=total_train // batch_size,\n", - " epochs=epochs,\n", - " validation_data=val_data_gen,\n", - " validation_steps=total_val // batch_size,\n", - " verbose=2\n", - " )\n", - " return model" + "#def get_fit_model():\n", + "# model = get_compiled_model()\n", + "# model.fit(\n", + "# train_data_gen,\n", + "# steps_per_epoch=total_train // batch_size,\n", + "# epochs=epochs,\n", + "# validation_data=val_data_gen,\n", + "# validation_steps=total_val // batch_size,\n", + "# verbose=2\n", + "# )\n", + "#return model" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "#get_uncompiled_model()" + "#with strategy.scope():\n", + "# get_uncompiled_model()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ - "#get_compiled_model()" + "#with strategy.scope():\n", + "# get_compiled_model()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#get_fit_model()" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -396,42 +452,11 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with strategy.scope():\n", - " get_uncompiled_model()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with strategy.scope():\n", - " get_compiled_model()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with strategy.scope():\n", - " get_fit_model()" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "#multi_worker_model = get_compiled_model()\n", - "#\n", "#multi_worker_model.fit(\n", "# x=train_data_gen,\n", "# epochs=epochs,\n", @@ -439,6 +464,51 @@ "# )" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n", + "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n", + "INFO:tensorflow:Running Distribute Coordinator with mode = 'independent_worker', cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, environment = None, rpc_layer = 'grpc'\n", + "WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an \"evaluator\" task exists in the cluster.\n", + "WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.\n", + "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n", + "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n", + "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n", + "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n", + "WARNING:tensorflow:ModelCheckpoint callback is not provided. Workers will need to restart training if any fails.\n", + "WARNING:tensorflow:sample_weight modes were coerced from\n", + " ...\n", + " to \n", + " ['...']\n", + "WARNING:tensorflow:sample_weight modes were coerced from\n", + " ...\n", + " to \n", + " ['...']\n", + "Train for 121 steps, validate for 123 steps\n", + "Epoch 1/4\n" + ] + } + ], + "source": [ + "with strategy.scope():\n", + " model = get_compiled_model()\n", + " model.fit(\n", + " train_data_gen,\n", + " steps_per_epoch=total_train // batch_size,\n", + " epochs=epochs,\n", + " validation_data=val_data_gen,\n", + " validation_steps=total_val // batch_size,\n", + " verbose=2\n", + " )" + ] + }, { "cell_type": "code", "execution_count": null, @@ -446,6 +516,7 @@ "outputs": [], "source": [ "#with strategy.scope():\n", + "# multi_worker_model = get_compiled_model()\n", "# multi_worker_model.fit(\n", "# x=train_data_gen,\n", "# epochs=epochs,\n", @@ -479,6 +550,15 @@ "#model.to_json()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#history = model.fit(X, y, batch_size=32, epochs=40, validation_split=0.1)" + ] + }, { "cell_type": "code", "execution_count": null,