ml-int
parent
eaf2785986
commit
ced0cdf495
|
@ -83,10 +83,10 @@
|
|||
"source": [
|
||||
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
|
||||
" \"cluster\": {\n",
|
||||
" \"worker\": [ \"10.100.100.130:2222\", \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
|
||||
" \"worker\": [ \"ml0-int:2222\", \"ml1-int:2222\", \"ml2-int:2222\", \"ml3-int:2222\", \"ml4-int:2222\", \"ml5-int:2222\" ]\n",
|
||||
" },\n",
|
||||
" \"task\": {\"type\": \"worker\", \"index\": 0 },\n",
|
||||
" \"num_workers\": 5\n",
|
||||
" \"num_workers\": 6\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
|
@ -219,14 +219,14 @@
|
|||
"val_image_generator = ImageDataGenerator(\n",
|
||||
" rescale=1./255\n",
|
||||
")\n",
|
||||
"#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
||||
"train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
|
||||
"#train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
|
||||
"train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
||||
" directory=train_dir,\n",
|
||||
" shuffle=True,\n",
|
||||
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
|
||||
" class_mode='binary')\n",
|
||||
"#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
||||
"val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
|
||||
"#val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
|
||||
"val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
||||
" directory=val_dir,\n",
|
||||
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
|
||||
" class_mode='binary')"
|
||||
|
@ -374,8 +374,8 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a checkpoint directory to store the checkpoints.\n",
|
||||
"checkpoint_dir = './training_checkpoints'\n",
|
||||
"checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")"
|
||||
"#checkpoint_dir = './training_checkpoints'\n",
|
||||
"#checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -17,9 +17,8 @@
|
|||
# so the index is hostname minus one (without alpha).
|
||||
|
||||
HOSTNUM=`hostname | sed -e 's/ml//g'`
|
||||
#let HOSTNUM=$HOSTNUM-1
|
||||
|
||||
export TF_CONFIG='{"cluster": {"worker": [ "10.100.100.130:2222", "ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
|
||||
export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
|
||||
|
||||
echo $TF_CONFIG
|
||||
python3 wut-train-cluster-fn.py
|
||||
|
|
Loading…
Reference in New Issue