parent
cb0f180c87
commit
bc94c25849
|
@ -41,11 +41,12 @@ done
|
|||
```
|
||||
# On worker nodes:
|
||||
sudo apt update
|
||||
sudo apt install python3-pip
|
||||
sudo apt install python3-pip sshfs
|
||||
# XXX deps...
|
||||
pip3 install --upgrade setuptools
|
||||
pip3 install --user tensorflow-2.1.0-cp37-cp37m-linux_x86_64.whl
|
||||
pip3 install --user simplejson
|
||||
pip3 install --user pillow
|
||||
```
|
||||
|
||||
# Usage
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -17,7 +17,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -31,7 +31,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -53,7 +53,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -68,7 +68,24 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tf 2.1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('tf {}'.format(tf.__version__))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -83,7 +100,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -101,7 +118,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -110,7 +127,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -120,11 +137,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
|
||||
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
|
||||
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
|
||||
"strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n",
|
||||
" tf.distribute.experimental.CollectiveCommunication.RING)\n",
|
||||
"#\n",
|
||||
"# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
|
||||
"#multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
|
||||
|
@ -140,7 +168,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -160,9 +188,24 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"total training good images: 3291\n",
|
||||
"total training bad images: 609\n",
|
||||
"--\n",
|
||||
"Total training images: 3900\n",
|
||||
"total validation good images: 3361\n",
|
||||
"total validation bad images: 601\n",
|
||||
"--\n",
|
||||
"Total validation images: 3962\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('total training good images:', num_train_good)\n",
|
||||
"print('total training bad images:', num_train_bad)\n",
|
||||
|
@ -176,9 +219,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--\n",
|
||||
"Reduce training and validation set when testing\n",
|
||||
"Reduced training images: 3900\n",
|
||||
"Reduced validation images: 3962\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"--\")\n",
|
||||
"print(\"Reduce training and validation set when testing\")\n",
|
||||
|
@ -190,9 +244,18 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found 3900 images belonging to 2 classes.\n",
|
||||
"Found 3962 images belonging to 2 classes.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_image_generator = ImageDataGenerator(\n",
|
||||
" rescale=1./255\n",
|
||||
|
@ -213,7 +276,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -235,7 +298,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -250,7 +313,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -268,7 +331,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -277,7 +340,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -298,7 +361,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -307,7 +370,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -321,7 +384,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -330,7 +393,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -340,53 +403,46 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_fit_model():\n",
|
||||
" model = get_compiled_model()\n",
|
||||
" model.fit(\n",
|
||||
" train_data_gen,\n",
|
||||
" steps_per_epoch=total_train // batch_size,\n",
|
||||
" epochs=epochs,\n",
|
||||
" validation_data=val_data_gen,\n",
|
||||
" validation_steps=total_val // batch_size,\n",
|
||||
" verbose=2\n",
|
||||
" )\n",
|
||||
" return model"
|
||||
"#def get_fit_model():\n",
|
||||
"# model = get_compiled_model()\n",
|
||||
"# model.fit(\n",
|
||||
"# train_data_gen,\n",
|
||||
"# steps_per_epoch=total_train // batch_size,\n",
|
||||
"# epochs=epochs,\n",
|
||||
"# validation_data=val_data_gen,\n",
|
||||
"# validation_steps=total_val // batch_size,\n",
|
||||
"# verbose=2\n",
|
||||
"# )\n",
|
||||
"#return model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#get_uncompiled_model()"
|
||||
"#with strategy.scope():\n",
|
||||
"# get_uncompiled_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#get_compiled_model()"
|
||||
"#with strategy.scope():\n",
|
||||
"# get_compiled_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#get_fit_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -396,42 +452,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with strategy.scope():\n",
|
||||
" get_uncompiled_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with strategy.scope():\n",
|
||||
" get_compiled_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with strategy.scope():\n",
|
||||
" get_fit_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#multi_worker_model = get_compiled_model()\n",
|
||||
"#\n",
|
||||
"#multi_worker_model.fit(\n",
|
||||
"# x=train_data_gen,\n",
|
||||
"# epochs=epochs,\n",
|
||||
|
@ -439,6 +464,51 @@
|
|||
"# )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
|
||||
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
|
||||
"INFO:tensorflow:Running Distribute Coordinator with mode = 'independent_worker', cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, environment = None, rpc_layer = 'grpc'\n",
|
||||
"WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an \"evaluator\" task exists in the cluster.\n",
|
||||
"WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.\n",
|
||||
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
|
||||
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
|
||||
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
|
||||
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
|
||||
"WARNING:tensorflow:ModelCheckpoint callback is not provided. Workers will need to restart training if any fails.\n",
|
||||
"WARNING:tensorflow:sample_weight modes were coerced from\n",
|
||||
" ...\n",
|
||||
" to \n",
|
||||
" ['...']\n",
|
||||
"WARNING:tensorflow:sample_weight modes were coerced from\n",
|
||||
" ...\n",
|
||||
" to \n",
|
||||
" ['...']\n",
|
||||
"Train for 121 steps, validate for 123 steps\n",
|
||||
"Epoch 1/4\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with strategy.scope():\n",
|
||||
" model = get_compiled_model()\n",
|
||||
" model.fit(\n",
|
||||
" train_data_gen,\n",
|
||||
" steps_per_epoch=total_train // batch_size,\n",
|
||||
" epochs=epochs,\n",
|
||||
" validation_data=val_data_gen,\n",
|
||||
" validation_steps=total_val // batch_size,\n",
|
||||
" verbose=2\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -446,6 +516,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"#with strategy.scope():\n",
|
||||
"# multi_worker_model = get_compiled_model()\n",
|
||||
"# multi_worker_model.fit(\n",
|
||||
"# x=train_data_gen,\n",
|
||||
"# epochs=epochs,\n",
|
||||
|
@ -479,6 +550,15 @@
|
|||
"#model.to_json()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#history = model.fit(X, y, batch_size=32, epochs=40, validation_split=0.1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
Loading…
Reference in New Issue