still meh fit() dist

2020-01-18 21:19:56 -07:00 · 2020-01-18 21:19:56 -07:00 · 68b2fc2730
parent cc31af1062
commit 68b2fc2730
2 changed files with 57 additions and 36 deletions
--- a/jupyter/wut-train-cluster-fn.ipynb
+++ b/jupyter/wut-train-cluster-fn.ipynb
@ -117,6 +117,15 @@
    "tf.keras.backend.clear_session()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "options = tf.data.Options()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -210,17 +219,29 @@
    "val_image_generator = ImageDataGenerator(\n",
    "    rescale=1./255\n",
    ")\n",
-    "train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
+    "#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
    "train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
    "                                                           directory=train_dir,\n",
    "                                                           shuffle=True,\n",
    "                                                           target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
    "                                                           class_mode='binary')\n",
-    "val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
+    "#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
    "val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
    "                                                       directory=val_dir,\n",
    "                                                       target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
    "                                                       class_mode='binary')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train_dist_dataset = strategy.experimental_distribute_dataset()\n",
    "#val_dist_dataset = strategy.experimental_distribute_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -264,16 +285,24 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "#strategy.num_replicas_in_sync\n",
+    "strategy.num_replicas_in_sync"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Compute global batch size using number of replicas.\n",
-    "#BATCH_SIZE_PER_REPLICA = 5\n",
+    "BATCH_SIZE_PER_REPLICA = 5\n",
-    "#print(BATCH_SIZE_PER_REPLICA)\n",
+    "print(BATCH_SIZE_PER_REPLICA)\n",
-    "#global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
+    "global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
-    "#                     strategy.num_replicas_in_sync)\n",
+    "                     strategy.num_replicas_in_sync)\n",
-    "#print(global_batch_size)\n",
+    "print(global_batch_size)\n",
-    "#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
+    "dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
-    "#dataset = dataset.batch(global_batch_size)\n",
+    "dataset = dataset.batch(global_batch_size)\n",
-    "#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
+    "LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
   ]
  },
  {
@ -338,6 +367,17 @@
    "#model = get_compiled_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a checkpoint directory to store the checkpoints.\n",
    "checkpoint_dir = './training_checkpoints'\n",
    "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/wut-train-cluster-fn.py
+++ b/wut-train-cluster-fn.py
@ -41,9 +41,15 @@ IMG_WIDTH= 804
 batch_size = 32
 epochs = 4
 BUFFER_SIZE = 10000
 NUM_WORKERS = 6
 GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
 # XXX
 #tf.keras.backend.clear_session()
 options = tf.data.Options()
 strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.RING)
@ -112,31 +118,6 @@ def get_compiled_model():
              metrics=['accuracy'])
  return model
 #def get_fit_model():
 #    model = get_compiled_model()
 #    model.fit(
 #        train_data_gen,
 #        steps_per_epoch=total_train // batch_size,
 #        epochs=epochs,
 #        validation_data=val_data_gen,
 #        validation_steps=total_val // batch_size,
 #        verbose=2
 #        )
 #    return model
 #with strategy.scope():
 #   get_uncompiled_model()
 #with strategy.scope():
 #    get_compiled_model()
 #with strategy.scope():
 #    get_fit_model()
 #multi_worker_model = get_compiled_model()
 #multi_worker_model.fit(
 #    x=train_data_gen,
 #    epochs=epochs,
 #    steps_per_epoch=total_train // batch_size
 #    )
 with strategy.scope():
    model = get_compiled_model()