master
ml server 2020-01-20 19:03:02 -07:00
parent eaf2785986
commit ced0cdf495
2 changed files with 9 additions and 10 deletions

View File

@ -83,10 +83,10 @@
"source": [
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
" \"cluster\": {\n",
" \"worker\": [ \"10.100.100.130:2222\", \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
" \"worker\": [ \"ml0-int:2222\", \"ml1-int:2222\", \"ml2-int:2222\", \"ml3-int:2222\", \"ml4-int:2222\", \"ml5-int:2222\" ]\n",
" },\n",
" \"task\": {\"type\": \"worker\", \"index\": 0 },\n",
" \"num_workers\": 5\n",
" \"num_workers\": 6\n",
"})"
]
},
@ -219,14 +219,14 @@
"val_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
")\n",
"#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
"train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
"#train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
"train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
" directory=train_dir,\n",
" shuffle=True,\n",
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
" class_mode='binary')\n",
"#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
"val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
"#val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
"val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
" directory=val_dir,\n",
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
" class_mode='binary')"
@ -374,8 +374,8 @@
"outputs": [],
"source": [
"# Create a checkpoint directory to store the checkpoints.\n",
"checkpoint_dir = './training_checkpoints'\n",
"checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")"
"#checkpoint_dir = './training_checkpoints'\n",
"#checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")"
]
},
{

View File

@ -17,9 +17,8 @@
# so the index is the hostname number (without alpha); the minus-one step below is disabled.
HOSTNUM=`hostname | sed -e 's/ml//g'`
#let HOSTNUM=$HOSTNUM-1
export TF_CONFIG='{"cluster": {"worker": [ "10.100.100.130:2222", "ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
echo $TF_CONFIG
python3 wut-train-cluster-fn.py