distributed getting closer...
parent
59d0b82469
commit
0be648d1d1
|
@ -2,7 +2,7 @@
|
|||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -17,7 +17,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -31,7 +31,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -53,7 +53,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -68,22 +68,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
|
||||
" \"cluster\": {\n",
|
||||
" \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
|
||||
" \"worker\": [ \"10.100.100.130:2222\", \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
|
||||
" },\n",
|
||||
" \"task\": {\"type\": \"worker\", \"index\": 1 },\n",
|
||||
" \"task\": {\"type\": \"worker\", \"index\": 0 },\n",
|
||||
" \"num_workers\": 5\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -101,7 +101,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -110,7 +110,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -120,9 +120,19 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
|
||||
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
|
||||
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.AUTO\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
|
||||
"#\n",
|
||||
|
@ -140,7 +150,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -160,9 +170,24 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"total training good images: 3291\n",
|
||||
"total training bad images: 609\n",
|
||||
"--\n",
|
||||
"Total training images: 3900\n",
|
||||
"total validation good images: 3361\n",
|
||||
"total validation bad images: 601\n",
|
||||
"--\n",
|
||||
"Total validation images: 3962\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('total training good images:', num_train_good)\n",
|
||||
"print('total training bad images:', num_train_bad)\n",
|
||||
|
@ -176,9 +201,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"--\n",
|
||||
"Reduce training and validation set when testing\n",
|
||||
"Reduced training images: 3900\n",
|
||||
"Reduced validation images: 3962\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"--\")\n",
|
||||
"print(\"Reduce training and validation set when testing\")\n",
|
||||
|
@ -190,9 +226,18 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found 3900 images belonging to 2 classes.\n",
|
||||
"Found 3962 images belonging to 2 classes.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_image_generator = ImageDataGenerator(\n",
|
||||
" rescale=1./255\n",
|
||||
|
@ -213,7 +258,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -235,7 +280,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -250,29 +295,55 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#strategy.num_replicas_in_sync\n",
|
||||
"## Compute global batch size using number of replicas.\n",
|
||||
"#BATCH_SIZE_PER_REPLICA = 5\n",
|
||||
"#print(BATCH_SIZE_PER_REPLICA)\n",
|
||||
"#global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
|
||||
"# strategy.num_replicas_in_sync)\n",
|
||||
"#print(global_batch_size)\n",
|
||||
"#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
|
||||
"#dataset = dataset.batch(global_batch_size)\n",
|
||||
"#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_uncompiled_model():\n",
|
||||
" inputs = Input(shape=(IMG_HEIGHT, IMG_WIDTH ,3))\n",
|
||||
" x = Conv2D(16, 3, padding='same', activation='relu')(inputs)\n",
|
||||
" x = MaxPooling2D()(x)\n",
|
||||
" x = Conv2D(32, 3, padding='same', activation='relu')(x)\n",
|
||||
" x = MaxPooling2D()(x)\n",
|
||||
" x = Conv2D(64, 3, padding='same', activation='relu')(x)\n",
|
||||
" x = MaxPooling2D()(x)\n",
|
||||
" x = Flatten()(x)\n",
|
||||
" x = Dense(512, activation='relu')(x)\n",
|
||||
" x = Dense(1, activation='sigmoid')(x)\n",
|
||||
" outputs = Dense(10, activation='softmax', name='predictions')(x)\n",
|
||||
" model = Model(inputs=inputs, outputs=outputs)\n",
|
||||
" model = Sequential([\n",
|
||||
" Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
|
||||
" MaxPooling2D(),\n",
|
||||
" Conv2D(32, 3, padding='same', activation='relu'),\n",
|
||||
" MaxPooling2D(),\n",
|
||||
" Conv2D(64, 3, padding='same', activation='relu'),\n",
|
||||
" MaxPooling2D(),\n",
|
||||
" Flatten(),\n",
|
||||
" Dense(512, activation='relu'),\n",
|
||||
" Dense(1, activation='sigmoid')\n",
|
||||
" ])\n",
|
||||
" return model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -281,7 +352,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -295,7 +366,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -304,7 +375,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -314,7 +385,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -333,20 +404,68 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#model = get_fit_model()"
|
||||
"#get_uncompiled_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"strategy.num_replicas_in_sync"
|
||||
"#get_compiled_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#get_fit_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#with strategy.scope():\n",
|
||||
"# get_fit_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with strategy.scope():\n",
|
||||
" get_uncompiled_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = AUTO\n",
|
||||
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = AUTO\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with strategy.scope():\n",
|
||||
" get_compiled_model()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -356,8 +475,6 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"with strategy.scope():\n",
|
||||
" get_uncompiled_model()\n",
|
||||
" get_compiled_model()\n",
|
||||
" get_fit_model()"
|
||||
]
|
||||
},
|
||||
|
@ -367,7 +484,13 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.summary()"
|
||||
"#multi_worker_model = get_compiled_model()\n",
|
||||
"#\n",
|
||||
"#multi_worker_model.fit(\n",
|
||||
"# x=train_data_gen,\n",
|
||||
"# epochs=epochs,\n",
|
||||
"# steps_per_epoch=total_train // batch_size\n",
|
||||
"# )"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -376,7 +499,21 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#Image.LOAD_TRUNCATED_IMAGES = True"
|
||||
"#with strategy.scope():\n",
|
||||
"# multi_worker_model.fit(\n",
|
||||
"# x=train_data_gen,\n",
|
||||
"# epochs=epochs,\n",
|
||||
"# steps_per_epoch=total_train // batch_size\n",
|
||||
"# )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#model.summary()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -392,8 +529,8 @@
|
|||
"print(train_image_generator)\n",
|
||||
"print(train_data_gen)\n",
|
||||
"#print(sample_train_images)\n",
|
||||
"print(history)\n",
|
||||
"model.to_json()"
|
||||
"#print(history)\n",
|
||||
"#model.to_json()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -17,9 +17,9 @@
|
|||
# so the index is hostname minus one (without alpha).
|
||||
|
||||
HOSTNUM=`hostname | sed -e 's/ml//g'`
|
||||
let HOSTNUM=$HOSTNUM-1
|
||||
#let HOSTNUM=$HOSTNUM-1
|
||||
|
||||
export TF_CONFIG='{"cluster": {"worker": ["ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
|
||||
export TF_CONFIG='{"cluster": {"worker": [ "10.100.100.130:2222", "ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
|
||||
|
||||
echo $TF_CONFIG
|
||||
python3 wut-worker.py
|
||||
|
|
|
@ -32,19 +32,19 @@ IMG_WIDTH= 804
|
|||
|
||||
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
|
||||
|
||||
|
||||
def get_uncompiled_model():
|
||||
inputs = Input(shape=(IMG_HEIGHT, IMG_WIDTH ,3))
|
||||
x = Conv2D(16, 3, padding='same', activation='relu')(inputs)
|
||||
x = MaxPooling2D()(x)
|
||||
x = Conv2D(32, 3, padding='same', activation='relu')(x)
|
||||
x = MaxPooling2D()(x)
|
||||
x = Conv2D(64, 3, padding='same', activation='relu')(x)
|
||||
x = MaxPooling2D()(x)
|
||||
x = Flatten()(x)
|
||||
x = Dense(512, activation='relu')(x)
|
||||
x = Dense(1, activation='sigmoid')(x)
|
||||
outputs = Dense(10, activation='softmax', name='predictions')(x)
|
||||
model = Model(inputs=inputs, outputs=outputs)
|
||||
model = Sequential([
|
||||
Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),
|
||||
MaxPooling2D(),
|
||||
Conv2D(32, 3, padding='same', activation='relu'),
|
||||
MaxPooling2D(),
|
||||
Conv2D(64, 3, padding='same', activation='relu'),
|
||||
MaxPooling2D(),
|
||||
Flatten(),
|
||||
Dense(512, activation='relu'),
|
||||
Dense(1, activation='sigmoid')
|
||||
])
|
||||
return model
|
||||
|
||||
def get_compiled_model():
|
||||
|
@ -57,15 +57,21 @@ def get_compiled_model():
|
|||
def get_fit_model():
|
||||
model = get_compiled_model()
|
||||
model.fit(
|
||||
train_data_gen,
|
||||
steps_per_epoch=total_train // batch_size,
|
||||
epochs=epochs,
|
||||
validation_data=val_data_gen,
|
||||
validation_steps=total_val // batch_size,
|
||||
verbose=2
|
||||
)
|
||||
return model
|
||||
|
||||
#def get_fit_model():
|
||||
# model = get_compiled_model()
|
||||
# model.fit(
|
||||
# train_data_gen,
|
||||
# steps_per_epoch=total_train // batch_size,
|
||||
# epochs=epochs,
|
||||
# validation_data=val_data_gen,
|
||||
# validation_steps=total_val // batch_size,
|
||||
# verbose=2
|
||||
# )
|
||||
# return model
|
||||
|
||||
with strategy.scope():
|
||||
get_uncompiled_model()
|
||||
get_compiled_model()
|
||||
|
|
Loading…
Reference in New Issue