From ced0cdf495b8a29eb150e9c512b4ef375b091469 Mon Sep 17 00:00:00 2001 From: ml server Date: Mon, 20 Jan 2020 19:03:02 -0700 Subject: [PATCH] ml-int --- jupyter/wut-train-cluster-fn.ipynb | 16 ++++++++-------- wut-worker-train-cluster-fn | 3 +-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/jupyter/wut-train-cluster-fn.ipynb b/jupyter/wut-train-cluster-fn.ipynb index 7f3296a..4b187d9 100644 --- a/jupyter/wut-train-cluster-fn.ipynb +++ b/jupyter/wut-train-cluster-fn.ipynb @@ -83,10 +83,10 @@ "source": [ "os.environ[\"TF_CONFIG\"] = json.dumps({\n", " \"cluster\": {\n", - " \"worker\": [ \"10.100.100.130:2222\", \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n", + " \"worker\": [ \"ml0-int:2222\", \"ml1-int:2222\", \"ml2-int:2222\", \"ml3-int:2222\", \"ml4-int:2222\", \"ml5-int:2222\" ]\n", " },\n", " \"task\": {\"type\": \"worker\", \"index\": 0 },\n", - " \"num_workers\": 5\n", + " \"num_workers\": 6\n", "})" ] }, @@ -219,14 +219,14 @@ "val_image_generator = ImageDataGenerator(\n", " rescale=1./255\n", ")\n", - "#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n", - "train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n", + "#train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n", + "train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n", " directory=train_dir,\n", " shuffle=True,\n", " target_size=(IMG_HEIGHT, IMG_WIDTH),\n", " class_mode='binary')\n", - "#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n", - "val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n", + "#val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n", + "val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n", " directory=val_dir,\n", " target_size=(IMG_HEIGHT, IMG_WIDTH),\n", " class_mode='binary')" @@ -374,8 +374,8 @@ "outputs": [], "source": [ "# Create a checkpoint directory to store the checkpoints.\n", - "checkpoint_dir = './training_checkpoints'\n", - "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")" + "#checkpoint_dir = './training_checkpoints'\n", + "#checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")" ] }, { diff --git a/wut-worker-train-cluster-fn b/wut-worker-train-cluster-fn index 38fd80f..a570458 100755 --- a/wut-worker-train-cluster-fn +++ b/wut-worker-train-cluster-fn @@ -17,9 +17,8 @@ # so the index is hostname minus one (without alpha). HOSTNUM=`hostname | sed -e 's/ml//g'` -#let HOSTNUM=$HOSTNUM-1 -export TF_CONFIG='{"cluster": {"worker": [ "10.100.100.130:2222", "ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}' +export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}' echo $TF_CONFIG python3 wut-train-cluster-fn.py