wut-train-cluster-fn.py from jupyter export...

Branch: master
Author: ml server, 2020-01-21 15:55:13 -07:00
Commit: 9fa145ed19 (parent: b945d4ab06)
2 changed files with 43 additions and 26 deletions

wut-train-cluster-fn.ipynb

@@ -48,7 +48,8 @@
 "from tensorflow.python.keras.preprocessing import image\n",
 "from tensorflow.python.keras.preprocessing.image import img_to_array\n",
 "from tensorflow.python.keras.preprocessing.image import ImageDataGenerator\n",
-"from tensorflow.python.keras.preprocessing.image import load_img"
+"from tensorflow.python.keras.preprocessing.image import load_img\n",
+"from tensorflow.python.data.experimental.ops.distribute_options import AutoShardPolicy"
 ]
 },
 {
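A side note on the new import: it reaches into TensorFlow's private tensorflow.python namespace, which was common in the TF 2.0/2.1 era this commit targets. Recent releases expose the same enum publicly; a minimal sketch of the public spelling (an assumption about versions newer than this commit):

import tensorflow as tf

# Public alias for the same enum the diff pulls from the private path.
AutoShardPolicy = tf.data.experimental.AutoShardPolicy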
@@ -60,10 +61,10 @@
 "%matplotlib inline\n",
 "import matplotlib.pyplot as plt\n",
 "import seaborn as sns\n",
-"from sklearn.decomposition import PCA\n",
-"from ipywidgets import interact, interactive, fixed, interact_manual\n",
-"import ipywidgets as widgets\n",
-"from IPython.display import display, Image"
+"#from sklearn.decomposition import PCA\n",
+"#from ipywidgets import interact, interactive, fixed, interact_manual\n",
+"#import ipywidgets as widgets\n",
+"#from IPython.display import display, Image"
 ]
 },
 {
@@ -86,7 +87,7 @@
 "  \"worker\": [ \"ml0-int:2222\", \"ml1-int:2222\", \"ml2-int:2222\", \"ml3-int:2222\", \"ml4-int:2222\", \"ml5-int:2222\" ]\n",
 "  },\n",
 "  \"task\": {\"type\": \"worker\", \"index\": 0 },\n",
-"  \"num_workers\": 40\n",
+"  \"num_workers\": 6\n",
 "})"
 ]
 },
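Each of the six ml*-int hosts runs this same code; TF_CONFIG carries an identical cluster spec everywhere, and only task.index differs per machine. The os/json plumbing sits above the hunk's context, so here is a minimal sketch of how the full cell presumably reads (note that TensorFlow itself only consumes the "cluster" and "task" keys; "num_workers" looks like the notebook's own bookkeeping):

import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    "cluster": {
        # One entry per training machine; same list on every worker.
        "worker": ["ml0-int:2222", "ml1-int:2222", "ml2-int:2222",
                   "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]
    },
    # index must be unique per host: 0 on ml0-int, 1 on ml1-int, and so on.
    "task": {"type": "worker", "index": 0},
    # Not a key TensorFlow reads from TF_CONFIG; kept to match the committed cell.
    "num_workers": 6
})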
@@ -99,7 +100,7 @@
 "IMG_HEIGHT = 416\n",
 "IMG_WIDTH= 804\n",
 "batch_size = 32\n",
-"epochs = 4\n",
+"epochs = 1\n",
 "# Full size, machine barfs probably needs more RAM\n",
 "#IMG_HEIGHT = 832\n",
 "#IMG_WIDTH = 1606\n",
@@ -123,7 +124,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"options = tf.data.Options()"
+"options = tf.data.Options()\n",
+"#options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF\n",
+"options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA\n",
+"# XXX\n",
+"#dataset = dataset.with_options(options)"
 ]
 },
 {
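The "# XXX" marker and the commented-out with_options line suggest the sharding options are not yet wired into the input pipeline at this point (the notebook's training inputs come from ImageDataGenerator, not a tf.data.Dataset). A sketch of how the options would attach to a dataset, using a hypothetical dataset for illustration:

import tensorflow as tf
from tensorflow.python.data.experimental.ops.distribute_options import AutoShardPolicy

options = tf.data.Options()
# DATA shards individual elements across workers; FILE assigns whole input
# files per worker; OFF disables automatic sharding entirely.
options.experimental_distribute.auto_shard_policy = AutoShardPolicy.DATA

# Hypothetical stand-in dataset; the notebook's real inputs are generators.
dataset = tf.data.Dataset.range(100).batch(10)
dataset = dataset.with_options(options)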
@@ -133,7 +138,10 @@
 "outputs": [],
 "source": [
 "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n",
-" tf.distribute.experimental.CollectiveCommunication.RING)"
+" tf.distribute.experimental.CollectiveCommunication.RING)\n",
+"\n",
+"#mirrored_strategy = tf.distribute.MirroredStrategy(\n",
+"# cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())"
 ]
 },
 {
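MultiWorkerMirroredStrategy must be constructed after TF_CONFIG is set and early in the program, before any collective ops run. The tf.distribute.experimental spelling with CollectiveCommunication.RING matches the TF 2.0/2.1 API; in TF 2.4 and later the stable equivalent would be roughly as follows (an assumption about versions newer than this commit):

import tensorflow as tf

# TF 2.4+ spelling of the same ring-allreduce configuration.
communication_options = tf.distribute.experimental.CommunicationOptions(
    implementation=tf.distribute.experimental.CommunicationImplementation.RING)
strategy = tf.distribute.MultiWorkerMirroredStrategy(
    communication_options=communication_options)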
@@ -142,8 +150,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"train_dir = os.path.join('data/', 'train')\n",
-"val_dir = os.path.join('data/', 'val')\n",
+"root_data_dir = ('/srv/satnogs')\n",
+"train_dir = os.path.join(root_data_dir, 'data/', 'train')\n",
+"val_dir = os.path.join(root_data_dir,'data/', 'val')\n",
 "train_good_dir = os.path.join(train_dir, 'good')\n",
 "train_bad_dir = os.path.join(train_dir, 'bad')\n",
 "val_good_dir = os.path.join(val_dir, 'good')\n",
@@ -180,8 +189,8 @@
 "source": [
 "print(\"--\")\n",
 "print(\"Reduce training and validation set when testing\")\n",
-"#total_train = 16\n",
-"#total_val = 16\n",
+"total_train = 100\n",
+"total_val = 100\n",
 "print(\"Reduced training images:\", total_train)\n",
 "print(\"Reduced validation images:\", total_val)"
 ]
@@ -255,7 +264,7 @@
 "log_dir=\"clusterlogs\"\n",
 "#tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)\n",
 "tensorboard_callback = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)\n",
-"%tensorboard --logdir clusterlogs --port 6006"
+"#%tensorboard --logdir clusterlogs --port 6006"
 ]
 },
 {
@@ -274,14 +283,22 @@
 "outputs": [],
 "source": [
 "## Compute global batch size using number of replicas.\n",
-"BATCH_SIZE_PER_REPLICA = 5\n",
-"print(BATCH_SIZE_PER_REPLICA)\n",
+"#GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS\n",
+"BATCH_SIZE_PER_REPLICA = 8\n",
+"print(\"BATCH_SIZE_PER_REPLICA\", BATCH_SIZE_PER_REPLICA)\n",
+"print(\"strategy.num_replicas_in_sync\", strategy.num_replicas_in_sync)\n",
 "global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
 " strategy.num_replicas_in_sync)\n",
-"print(global_batch_size)\n",
-"dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
-"dataset = dataset.batch(global_batch_size)\n",
-"LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
+"print(\"global_batch_size\", global_batch_size)\n",
+"print(\"total_train\", total_train)\n",
+"print(\"total_val \", total_val)\n",
+"print(\"batch_size\", batch_size)\n",
+"print(\"total_train // batch_size\", total_train // batch_size)\n",
+"print(\"total_val // batch_size\", total_val // batch_size)\n",
+"#.batch(global_batch_size)\n",
+"#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
+"#dataset = dataset.batch(global_batch_size)\n",
+"#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
 ]
 },
 {
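Worked through with this cluster's numbers, assuming one replica per worker (six in sync), the prints come out as below. Note that the step counts divide by the earlier batch_size = 32 rather than by global_batch_size, a mismatch worth flagging:

BATCH_SIZE_PER_REPLICA = 8
num_replicas_in_sync = 6                  # one replica on each of the six workers
global_batch_size = BATCH_SIZE_PER_REPLICA * num_replicas_in_sync  # 48

total_train = 100
total_val = 100
batch_size = 32                           # set near the top of the notebook
print(total_train // batch_size)          # 3 steps per epoch
print(total_val // batch_size)            # 3 validation steps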
@@ -385,14 +402,14 @@
 "source": [
 "with strategy.scope():\n",
 " model = get_compiled_model()\n",
-" model.fit(\n",
+" history = model.fit(\n",
 " train_data_gen,\n",
 " steps_per_epoch=total_train // batch_size,\n",
 " epochs=epochs,\n",
 " validation_data=val_data_gen,\n",
 " validation_steps=total_val // batch_size,\n",
 " verbose=2\n",
-" )"
+" ).batch(global_batch_size)"
 ]
 },
 {
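One likely bug in the new closing line: .batch(global_batch_size) is chained onto the return value of model.fit(), which is a History object with no .batch method, so the cell would raise AttributeError once training finishes. .batch belongs on a tf.data.Dataset, and given the stray #.batch(global_batch_size) comment in the earlier hunk, this looks like a misplaced paste. A sketch of the presumably intended cell:

with strategy.scope():
    model = get_compiled_model()
    # fit() returns a History object; batching applies to datasets, not here.
    history = model.fit(
        train_data_gen,
        steps_per_epoch=total_train // batch_size,
        epochs=epochs,
        validation_data=val_data_gen,
        validation_steps=total_val // batch_size,
        verbose=2,
    )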

wut-train-cluster-fn.py

@@ -50,10 +50,10 @@ from tensorflow.python.data.experimental.ops.distribute_options import AutoShard
 get_ipython().run_line_magic('matplotlib', 'inline')
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.decomposition import PCA
-from ipywidgets import interact, interactive, fixed, interact_manual
-import ipywidgets as widgets
-from IPython.display import display, Image
+#from sklearn.decomposition import PCA
+#from ipywidgets import interact, interactive, fixed, interact_manual
+#import ipywidgets as widgets
+#from IPython.display import display, Image
 
 
 # In[ ]: