distributed getting closer...

master
ml server 2020-01-18 15:02:34 -07:00
parent 59d0b82469
commit 0be648d1d1
3 changed files with 212 additions and 69 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -31,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -53,7 +53,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -68,22 +68,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
" \"cluster\": {\n",
" \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
" \"worker\": [ \"10.100.100.130:2222\", \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
" },\n",
" \"task\": {\"type\": \"worker\", \"index\": 1 },\n",
" \"task\": {\"type\": \"worker\", \"index\": 0 },\n",
" \"num_workers\": 5\n",
"})"
]
},
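Reviewer note, not part of the commit: since this notebook now claims task index 0 in a six-entry worker list, a quick sanity check after running the cell above can catch an index that points past the cluster. A minimal sketch (assumes the cell above has already set TF_CONFIG):

# Sanity check (illustrative only): confirm the task index addresses one of
# the workers written into TF_CONFIG above.
import json
import os

cfg = json.loads(os.environ["TF_CONFIG"])
workers = cfg["cluster"]["worker"]
idx = cfg["task"]["index"]
assert 0 <= idx < len(workers), "task index must point at one of the workers"
print("this process is worker %d of %d: %s" % (idx, len(workers), workers[idx]))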
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -101,7 +101,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -110,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -120,9 +120,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.AUTO\n"
]
}
],
"source": [
"strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()\n",
"#\n",
@@ -140,7 +150,7 @@
},
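Reviewer note: the log above reports communication = CollectiveCommunication.AUTO. If AUTO ever picks badly on this CPU-only cluster, the strategy constructor of this TF 2.x experimental API also accepts an explicit choice; a hedged sketch:

# Optional: pin the collective implementation instead of letting AUTO decide
# (RING suits CPU-only nodes; NCCL needs GPUs).
import tensorflow as tf

strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.RING
)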
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -160,9 +170,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total training good images: 3291\n",
"total training bad images: 609\n",
"--\n",
"Total training images: 3900\n",
"total validation good images: 3361\n",
"total validation bad images: 601\n",
"--\n",
"Total validation images: 3962\n"
]
}
],
"source": [
"print('total training good images:', num_train_good)\n",
"print('total training bad images:', num_train_bad)\n",
@@ -176,9 +201,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--\n",
"Reduce training and validation set when testing\n",
"Reduced training images: 3900\n",
"Reduced validation images: 3962\n"
]
}
],
"source": [
"print(\"--\")\n",
"print(\"Reduce training and validation set when testing\")\n",
@@ -190,9 +226,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3900 images belonging to 2 classes.\n",
"Found 3962 images belonging to 2 classes.\n"
]
}
],
"source": [
"train_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
@@ -213,7 +258,7 @@
},
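Reviewer note: the hunk cuts this cell off right after ImageDataGenerator(rescale=1./255), but the "Found 3900 images belonging to 2 classes" output above is what Keras' flow_from_directory prints, so the rest of the cell presumably looks roughly like this (the directory variable name is an assumption):

# Assumed continuation of the cell (not shown in this hunk): point the
# generator at the training directory and emit binary-labelled batches.
train_data_gen = train_image_generator.flow_from_directory(
    directory=train_dir,                    # assumed variable name
    batch_size=batch_size,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode='binary',                    # matches the single sigmoid output
    shuffle=True,
)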
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -235,7 +280,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -250,29 +295,55 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"#strategy.num_replicas_in_sync\n",
"## Compute global batch size using number of replicas.\n",
"#BATCH_SIZE_PER_REPLICA = 5\n",
"#print(BATCH_SIZE_PER_REPLICA)\n",
"#global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
"# strategy.num_replicas_in_sync)\n",
"#print(global_batch_size)\n",
"#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
"#dataset = dataset.batch(global_batch_size)\n",
"#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
]
},
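Reviewer note: the commented-out cell above sketches the usual scaling rule for synchronous training. Uncommented and tied to the strategy it would look roughly like this; with one CPU replica per worker and six workers the global batch comes to 30, and the 0.2 learning-rate entry is a placeholder I added, not a tuned value from the notebook:

# Runnable form of the commented-out sketch: scale the batch size by the
# number of replicas the strategy reports, then look up a matching LR.
BATCH_SIZE_PER_REPLICA = 5
global_batch_size = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15, 30: 0.2}  # 0.2 is a placeholder
learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]
print(global_batch_size, learning_rate)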
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"#learning_rate = LEARNING_RATES_BY_BATCH_SIZE[global_batch_size]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def get_uncompiled_model():\n",
" inputs = Input(shape=(IMG_HEIGHT, IMG_WIDTH ,3))\n",
" x = Conv2D(16, 3, padding='same', activation='relu')(inputs)\n",
" x = MaxPooling2D()(x)\n",
" x = Conv2D(32, 3, padding='same', activation='relu')(x)\n",
" x = MaxPooling2D()(x)\n",
" x = Conv2D(64, 3, padding='same', activation='relu')(x)\n",
" x = MaxPooling2D()(x)\n",
" x = Flatten()(x)\n",
" x = Dense(512, activation='relu')(x)\n",
" x = Dense(1, activation='sigmoid')(x)\n",
" outputs = Dense(10, activation='softmax', name='predictions')(x)\n",
" model = Model(inputs=inputs, outputs=outputs)\n",
" model = Sequential([\n",
" Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
" MaxPooling2D(),\n",
" Conv2D(32, 3, padding='same', activation='relu'),\n",
" MaxPooling2D(),\n",
" Conv2D(64, 3, padding='same', activation='relu'),\n",
" MaxPooling2D(),\n",
" Flatten(),\n",
" Dense(512, activation='relu'),\n",
" Dense(1, activation='sigmoid')\n",
" ])\n",
" return model"
]
},
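Reviewer note: get_compiled_model() is called below but its body sits outside the hunks shown in this diff. A hypothetical sketch of what it likely does, given the single sigmoid output of get_uncompiled_model():

# Hypothetical body for get_compiled_model() -- the real one is not shown in
# this diff.  A single sigmoid output suggests a binary objective.
def get_compiled_model():
    model = get_uncompiled_model()
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model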
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -281,7 +352,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -295,7 +366,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -304,7 +375,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -314,7 +385,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -333,20 +404,68 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#model = get_fit_model()"
"#get_uncompiled_model()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"strategy.num_replicas_in_sync"
"#get_compiled_model()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"#get_fit_model()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"#with strategy.scope():\n",
"# get_fit_model()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"with strategy.scope():\n",
" get_uncompiled_model()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = AUTO\n",
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = AUTO\n"
]
}
],
"source": [
"with strategy.scope():\n",
" get_compiled_model()"
]
},
{
@@ -356,8 +475,6 @@
"outputs": [],
"source": [
"with strategy.scope():\n",
" get_uncompiled_model()\n",
" get_compiled_model()\n",
" get_fit_model()"
]
},
@@ -367,7 +484,13 @@
"metadata": {},
"outputs": [],
"source": [
"model.summary()"
"#multi_worker_model = get_compiled_model()\n",
"#\n",
"#multi_worker_model.fit(\n",
"# x=train_data_gen,\n",
"# epochs=epochs,\n",
"# steps_per_epoch=total_train // batch_size\n",
"# )"
]
},
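Reviewer note: uncommented, the usual shape of the cell above is to build and compile under strategy.scope() and then call fit normally; a sketch using the notebook's own names:

# Build and compile under the strategy so the variables are mirrored across
# workers, then train with the generator defined earlier.
with strategy.scope():
    multi_worker_model = get_compiled_model()

multi_worker_model.fit(
    x=train_data_gen,
    epochs=epochs,
    steps_per_epoch=total_train // batch_size,
)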
{
@@ -376,7 +499,21 @@
"metadata": {},
"outputs": [],
"source": [
"#Image.LOAD_TRUNCATED_IMAGES = True"
"#with strategy.scope():\n",
"# multi_worker_model.fit(\n",
"# x=train_data_gen,\n",
"# epochs=epochs,\n",
"# steps_per_epoch=total_train // batch_size\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#model.summary()"
]
},
{
@@ -392,8 +529,8 @@
"print(train_image_generator)\n",
"print(train_data_gen)\n",
"#print(sample_train_images)\n",
"print(history)\n",
"model.to_json()"
"#print(history)\n",
"#model.to_json()"
]
},
{

View File

@@ -17,9 +17,9 @@
# so the index is hostname minus one (without alpha).
HOSTNUM=`hostname | sed -e 's/ml//g'`
let HOSTNUM=$HOSTNUM-1
#let HOSTNUM=$HOSTNUM-1
export TF_CONFIG='{"cluster": {"worker": ["ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
export TF_CONFIG='{"cluster": {"worker": [ "10.100.100.130:2222", "ml1:2222", "ml2:2222", "ml3:2222", "ml4:2222", "ml5:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
echo $TF_CONFIG
python3 wut-worker.py
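Reviewer note: with 10.100.100.130 added at index 0, ml1-ml5 now map to indices 1-5, which is why the minus-one is commented out above. The same derivation could live inside wut-worker.py instead of this wrapper; an illustrative sketch (assumes the ml<N> hostname convention):

# Illustrative alternative (not in this commit): derive the task index in the
# worker itself.  ml1..ml5 map to indices 1..5; any other host is the chief.
import json
import os
import re
import socket

WORKERS = ["10.100.100.130:2222", "ml1:2222", "ml2:2222",
           "ml3:2222", "ml4:2222", "ml5:2222"]

match = re.match(r"ml(\d+)", socket.gethostname())
index = int(match.group(1)) if match else 0   # non-ml host takes index 0

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {"worker": WORKERS},
    "task": {"type": "worker", "index": index},
})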

View File

@@ -32,19 +32,19 @@ IMG_WIDTH= 804
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
def get_uncompiled_model():
inputs = Input(shape=(IMG_HEIGHT, IMG_WIDTH ,3))
x = Conv2D(16, 3, padding='same', activation='relu')(inputs)
x = MaxPooling2D()(x)
x = Conv2D(32, 3, padding='same', activation='relu')(x)
x = MaxPooling2D()(x)
x = Conv2D(64, 3, padding='same', activation='relu')(x)
x = MaxPooling2D()(x)
x = Flatten()(x)
x = Dense(512, activation='relu')(x)
x = Dense(1, activation='sigmoid')(x)
outputs = Dense(10, activation='softmax', name='predictions')(x)
model = Model(inputs=inputs, outputs=outputs)
model = Sequential([
Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),
MaxPooling2D(),
Conv2D(32, 3, padding='same', activation='relu'),
MaxPooling2D(),
Conv2D(64, 3, padding='same', activation='relu'),
MaxPooling2D(),
Flatten(),
Dense(512, activation='relu'),
Dense(1, activation='sigmoid')
])
return model
def get_compiled_model():
@@ -57,15 +57,21 @@ def get_compiled_model():
def get_fit_model():
model = get_compiled_model()
model.fit(
train_data_gen,
steps_per_epoch=total_train // batch_size,
epochs=epochs,
validation_data=val_data_gen,
validation_steps=total_val // batch_size,
verbose=2
)
return model
#def get_fit_model():
# model = get_compiled_model()
# model.fit(
# train_data_gen,
# steps_per_epoch=total_train // batch_size,
# epochs=epochs,
# validation_data=val_data_gen,
# validation_steps=total_val // batch_size,
# verbose=2
# )
# return model
with strategy.scope():
get_uncompiled_model()
get_compiled_model()
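Reviewer note: the worker script now stops after building the models under the strategy and discards the return values. A sketch of where it seems to be heading, mirroring the commented-out get_fit_model above:

# Sketch only (not part of this commit): keep the model handle from the
# scope and train it, as the commented-out helper did.
with strategy.scope():
    model = get_compiled_model()

history = model.fit(
    train_data_gen,
    steps_per_epoch=total_train // batch_size,
    epochs=epochs,
    validation_data=val_data_gen,
    validation_steps=total_val // batch_size,
    verbose=2,
)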