From cc31af10627b13726f268f75b535c72699fc2c70 Mon Sep 17 00:00:00 2001 From: ml server Date: Sat, 18 Jan 2020 19:25:23 -0700 Subject: [PATCH] Distribution is working, then breaks... --- jupyter/wut-train-cluster-fn.ipynb | 148 +++++++---------------------- 1 file changed, 34 insertions(+), 114 deletions(-) diff --git a/jupyter/wut-train-cluster-fn.ipynb b/jupyter/wut-train-cluster-fn.ipynb index cc8046e..2ad5653 100644 --- a/jupyter/wut-train-cluster-fn.ipynb +++ b/jupyter/wut-train-cluster-fn.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -68,24 +68,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tf 2.1.0\n" - ] - } - ], + "outputs": [], "source": [ "print('tf {}'.format(tf.__version__))" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -100,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -118,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -127,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -137,19 +129,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n", - "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n", - "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n" - ] - } - ], + "outputs": [], "source": [ "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n", " tf.distribute.experimental.CollectiveCommunication.RING)\n", @@ -168,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -188,24 +170,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total training good images: 3291\n", - "total training bad images: 609\n", - "--\n", - "Total training images: 3900\n", - "total validation good images: 3361\n", - "total validation bad images: 601\n", - "--\n", - "Total validation images: 3962\n" - ] - } - ], + "outputs": [], "source": [ "print('total training good images:', num_train_good)\n", "print('total training bad images:', num_train_bad)\n", @@ -219,20 +186,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--\n", - "Reduce training and validation set when testing\n", - "Reduced training images: 3900\n", - "Reduced validation images: 3962\n" - ] - } - ], + "outputs": [], "source": [ "print(\"--\")\n", "print(\"Reduce training and validation set when testing\")\n", @@ -244,18 +200,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 3900 images belonging to 2 classes.\n", - "Found 3962 images belonging to 2 classes.\n" - ] - } - ], + "outputs": [], "source": [ "train_image_generator = ImageDataGenerator(\n", " rescale=1./255\n", @@ -276,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -298,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -313,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -331,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -340,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -361,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -370,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -384,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -393,7 +340,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -403,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -422,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -432,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -442,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -452,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -468,34 +415,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n", - "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n", - "INFO:tensorflow:Running Distribute Coordinator with mode = 'independent_worker', cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, environment = None, rpc_layer = 'grpc'\n", - "WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an \"evaluator\" task exists in the cluster.\n", - "WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.\n", - "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n", - "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n", - "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n", - "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n", - "WARNING:tensorflow:ModelCheckpoint callback is not provided. Workers will need to restart training if any fails.\n", - "WARNING:tensorflow:sample_weight modes were coerced from\n", - " ...\n", - " to \n", - " ['...']\n", - "WARNING:tensorflow:sample_weight modes were coerced from\n", - " ...\n", - " to \n", - " ['...']\n", - "Train for 121 steps, validate for 123 steps\n", - "Epoch 1/4\n" - ] - } - ], + "outputs": [], "source": [ "with strategy.scope():\n", " model = get_compiled_model()\n",