Merge branch 'master' of spacecruft.org:spacecruft/satnogs-wut

2020-01-20 13:47:55 -07:00 · 2020-01-20 13:47:55 -07:00 · f213d4da15
parent 58930d85aa eaf2785986
commit f213d4da15
6 changed files with 445 additions and 151 deletions
--- a/jupyter/wut-train-cluster-fn.ipynb
+++ b/jupyter/wut-train-cluster-fn.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -17,7 +17,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -31,7 +31,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -53,7 +53,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -68,24 +68,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf 2.1.0\n"
     ]
    }
   ],
   "source": [
    "print('tf {}'.format(tf.__version__))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -100,7 +92,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -118,7 +110,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -127,7 +119,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "options = tf.data.Options()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -137,19 +138,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n"
     ]
    }
   ],
   "source": [
    "strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n",
    "    tf.distribute.experimental.CollectiveCommunication.RING)\n",
@ -168,7 +159,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -188,24 +179,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total training good images: 3291\n",
      "total training bad images: 609\n",
      "--\n",
      "Total training images: 3900\n",
      "total validation good images: 3361\n",
      "total validation bad images: 601\n",
      "--\n",
      "Total validation images: 3962\n"
     ]
    }
   ],
   "source": [
    "print('total training good images:', num_train_good)\n",
    "print('total training bad images:', num_train_bad)\n",
@ -219,20 +195,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--\n",
      "Reduce training and validation set when testing\n",
      "Reduced training images: 3900\n",
      "Reduced validation images: 3962\n"
     ]
    }
   ],
   "source": [
    "print(\"--\")\n",
    "print(\"Reduce training and validation set when testing\")\n",
@ -244,18 +209,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 3900 images belonging to 2 classes.\n",
      "Found 3962 images belonging to 2 classes.\n"
     ]
    }
   ],
   "source": [
    "train_image_generator = ImageDataGenerator(\n",
    "    rescale=1./255\n",
@ -263,12 +219,14 @@
    "val_image_generator = ImageDataGenerator(\n",
    "    rescale=1./255\n",
    ")\n",
-    "train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
+    "#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
    "train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
    "                                                           directory=train_dir,\n",
    "                                                           shuffle=True,\n",
    "                                                           target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
    "                                                           class_mode='binary')\n",
-    "val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
+    "#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
    "val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
    "                                                       directory=val_dir,\n",
    "                                                       target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
    "                                                       class_mode='binary')"
@ -276,7 +234,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#train_dist_dataset = strategy.experimental_distribute_dataset()\n",
    "#val_dist_dataset = strategy.experimental_distribute_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -298,7 +266,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -313,25 +281,33 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "#strategy.num_replicas_in_sync\n",
+    "strategy.num_replicas_in_sync"
    "## Compute global batch size using number of replicas.\n",
    "#BATCH_SIZE_PER_REPLICA = 5\n",
    "#print(BATCH_SIZE_PER_REPLICA)\n",
    "#global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
    "#                     strategy.num_replicas_in_sync)\n",
    "#print(global_batch_size)\n",
    "#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
    "#dataset = dataset.batch(global_batch_size)\n",
    "#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Compute global batch size using number of replicas.\n",
    "BATCH_SIZE_PER_REPLICA = 5\n",
    "print(BATCH_SIZE_PER_REPLICA)\n",
    "global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
    "                     strategy.num_replicas_in_sync)\n",
    "print(global_batch_size)\n",
    "dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
    "dataset = dataset.batch(global_batch_size)\n",
    "LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -340,7 +316,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -361,7 +337,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -370,7 +346,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -384,7 +360,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -393,7 +369,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a checkpoint directory to store the checkpoints.\n",
    "checkpoint_dir = './training_checkpoints'\n",
    "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -403,7 +390,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -422,7 +409,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -432,7 +419,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -442,7 +429,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -452,7 +439,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@ -468,34 +455,7 @@
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
      "INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
      "INFO:tensorflow:Running Distribute Coordinator with mode = 'independent_worker', cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, environment = None, rpc_layer = 'grpc'\n",
      "WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an \"evaluator\" task exists in the cluster.\n",
      "WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.\n",
      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
      "INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
      "INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
      "WARNING:tensorflow:ModelCheckpoint callback is not provided. Workers will need to restart training if any fails.\n",
      "WARNING:tensorflow:sample_weight modes were coerced from\n",
      "  ...\n",
      "    to  \n",
      "  ['...']\n",
      "WARNING:tensorflow:sample_weight modes were coerced from\n",
      "  ...\n",
      "    to  \n",
      "  ['...']\n",
      "Train for 121 steps, validate for 123 steps\n",
      "Epoch 1/4\n"
     ]
    }
   ],
   "source": [
    "with strategy.scope():\n",
    "    model = get_compiled_model()\n",
--- a/25
+++ b/25
@ -0,0 +1,25 @@
 #!/bin/bash
 # wut-tf
 #
 # Launch wut-tf.py with a TF_CONFIG cluster description in the environment.
 #
 # Usage:
 #   wut-tf
 #
 # Notes:
 # - Each node needs a unique index number.
 # - The node number is derived from the hostname; the hosts are ml0
 #   through ml5.
 # - HOSTNUM is only referenced by the commented-out TF_CONFIG below: the
 #   active TF_CONFIG has no "task" entry and HOSTNUM is not exported.
 HOSTNUM=$(hostname | sed -e 's/ml//g')
 #export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
 TF_CONFIG='{"cluster": {"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}}'
 export TF_CONFIG
 # Show the configuration that the Python process will inherit.
 echo $TF_CONFIG
 python3 wut-tf.py
--- a/wut-tf.py
+++ b/wut-tf.py
@ -0,0 +1,60 @@
 #!/usr/bin/env python3
 #
 # wut-tf.py
 #
 # https://spacecruft.org/spacecruft/satnogs-wut
 #
 # Distributed Learning
 from __future__ import absolute_import, division, print_function, unicode_literals
 from __future__ import print_function
 import os
 import json
 import numpy as np
 import datetime
 import tensorflow as tf
 import tensorflow.python.keras
 from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras import Sequential
 from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense
 from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
 from tensorflow.python.keras.layers import Input, concatenate
 from tensorflow.python.keras.models import load_model
 from tensorflow.python.keras.models import Model
 from tensorflow.python.keras.preprocessing import image
 from tensorflow.python.keras.preprocessing.image import img_to_array
 from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
 from tensorflow.python.keras.preprocessing.image import load_img
 # Publish the cluster layout to TensorFlow through TF_CONFIG.  Only the
 # worker list is set; the task entry stays disabled:
 #   "task": {"type": "worker", "index": 0 },
 tf_config = {
     "cluster": {
         "worker": [
             "ml1-int:2222",
             "ml2-int:2222",
             "ml3-int:2222",
             "ml4-int:2222",
             "ml5-int:2222",
         ],
     },
 }
 os.environ["TF_CONFIG"] = json.dumps(tf_config)
 # Report the TensorFlow version and the devices it can see.
 print("Tensorflow Version: ", tf.__version__)
 print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
 print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
 print(tf.config.experimental.list_physical_devices())
 #with tf.device("GPU:0"):
 #  tf.ones(())  # Make sure we can run on GPU
 # XLA_FLAGS is only displayed here (formatted, then raw); nothing sets it.
 xla_flags = os.getenv("XLA_FLAGS")
 print("XLA_FLAGS='{}'".format(xla_flags))
 print(xla_flags)
 tf.keras.backend.clear_session()
 # Image geometry and training hyper-parameters.
 IMG_HEIGHT = 416
 IMG_WIDTH = 804
 batch_size = 32
 epochs = 4
 BUFFER_SIZE = 10000
 # NOTE(review): NUM_WORKERS is 6 while the TF_CONFIG above lists five
 # worker addresses -- confirm which one is intended.
 NUM_WORKERS = 6
 GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
 #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
 #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
 #    tf.distribute.experimental.CollectiveCommunication.RING)
 AUTOTUNE = tf.data.experimental.AUTOTUNE
 NUM_TOTAL_IMAGES = 100
 # Turn on XLA JIT compilation in the graph optimizer.
 tf.config.optimizer.set_jit(True)
 #tf.summary.trace_on(profiler=True)
 #tf.summary.trace_export(name=trace-export,profiler_outdir=logs)
 options = tf.data.Options()
--- a/wut-train-cluster-fn.py
+++ b/wut-train-cluster-fn.py
@ -41,9 +41,15 @@ IMG_WIDTH= 804
 batch_size = 32
 epochs = 4
 BUFFER_SIZE = 10000
 NUM_WORKERS = 6
 GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
 # XXX
 #tf.keras.backend.clear_session()
 options = tf.data.Options()
 strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
    tf.distribute.experimental.CollectiveCommunication.RING)
@ -112,31 +118,6 @@ def get_compiled_model():
              metrics=['accuracy'])
  return model
 #def get_fit_model():
 #    model = get_compiled_model()
 #    model.fit(
 #        train_data_gen,
 #        steps_per_epoch=total_train // batch_size,
 #        epochs=epochs,
 #        validation_data=val_data_gen,
 #        validation_steps=total_val // batch_size,
 #        verbose=2
 #        )
 #    return model
 #with strategy.scope():
 #   get_uncompiled_model()
 #with strategy.scope():
 #    get_compiled_model()
 #with strategy.scope():
 #    get_fit_model()
 #multi_worker_model = get_compiled_model()
 #multi_worker_model.fit(
 #    x=train_data_gen,
 #    epochs=epochs,
 #    steps_per_epoch=total_train // batch_size
 #    )
 with strategy.scope():
    model = get_compiled_model()
--- a/26
+++ b/26
@ -0,0 +1,26 @@
 #!/bin/bash
 # wut-worker-mas
 #
 # Starts worker client.
 #
 # Usage:
 # wut-worker-mas
 # Example:
 # wut-worker-mas
 #
 # Note:
 # Each node needs a unique index number.
 #
 # NOTE!
 # This generates the node number based off the hostname.
 # The hosts are ml0 through ml5.
 #
 # A TF_CONFIG task index is a position inside the address list of the
 # task's own type, so the hostname number cannot be used directly:
 # ml0 is entry 0 of "chief" and ml1..ml5 are entries 0..4 of "worker".
 #
 # NOTE(review): wut-worker-mas.py assigns os.environ["TF_CONFIG"] itself,
 # which replaces the value exported here inside the Python process.
 HOSTNUM=$(hostname | sed -e 's/ml//g')
 # Refuse to start when the hostname does not map to one of the six nodes;
 # otherwise the JSON below would be malformed or point at a missing entry.
 case "$HOSTNUM" in
   [0-5]) ;;
   *)
     echo "wut-worker-mas: hostname '$(hostname)' does not map to a node number 0-5" >&2
     exit 1
     ;;
 esac
 if [ "$HOSTNUM" -eq 0 ]; then
   TASK_TYPE=chief
   TASK_INDEX=0
 else
   TASK_TYPE=worker
   TASK_INDEX=$((HOSTNUM - 1))
 fi
 #export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
 #export TF_CONFIG='{"cluster": {"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}}'
 export TF_CONFIG='{"cluster": {"chief": [ "ml0-int:2222" ], "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$TASK_INDEX', "type": "'$TASK_TYPE'"}}'
 echo $TF_CONFIG
 python3 wut-worker-mas.py
--- a/wut-worker-mas.py
+++ b/wut-worker-mas.py
@ -0,0 +1,242 @@
 #!/usr/bin/env python3
 #
 # wut-worker-mas.py
 #
 # https://spacecruft.org/spacecruft/satnogs-wut
 #
 # Distributed Learning
 from __future__ import absolute_import, division, print_function, unicode_literals
 from __future__ import print_function
 import os
 import json
 import numpy as np
 import datetime
 import tensorflow as tf
 import tensorflow.python.keras
 from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras import Sequential
 from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense
 from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
 from tensorflow.python.keras.layers import Input, concatenate
 from tensorflow.python.keras.models import load_model
 from tensorflow.python.keras.models import Model
 from tensorflow.python.keras.preprocessing import image
 from tensorflow.python.keras.preprocessing.image import img_to_array
 from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
 from tensorflow.python.keras.preprocessing.image import load_img
 #import tensorflow.python.distribute.cluster_resolver
 #from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
 #from tensorflow.python.distribute.cluster_resolver.TFConfigClusterResolver
 # Reset global Keras state before anything is built.
 tf.keras.backend.clear_session()
 options = tf.data.Options()
 # Cluster layout: ml0 is listed as chief, ml1..ml5 as workers, and this
 # process declares itself to be chief 0.
 # NOTE: this assignment replaces any TF_CONFIG value already present in
 # the environment of this process.
 tf_config = {
     "cluster": {
         "chief": ["ml0-int:2222"],
         "worker": [
             "ml1-int:2222",
             "ml2-int:2222",
             "ml3-int:2222",
             "ml4-int:2222",
             "ml5-int:2222",
         ],
     },
     "task": {"type": "chief", "index": 0},
 }
 os.environ["TF_CONFIG"] = json.dumps(tf_config)
 # Worker-only variant without a task entry, kept for reference:
 #os.environ["TF_CONFIG"] = json.dumps({
 #    "cluster": {
 #        "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
 #    }#,
 #   #"task": {"type": "worker", "index": 0 },
 #})
 # Report the TensorFlow version and the number of visible devices.
 print("Tensorflow Version: ", tf.__version__)
 print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
 print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
 #with tf.device("GPU:0"):
 #  tf.ones(())  # Make sure we can run on GPU
 # NOTE: this only prints XLA_FLAGS; nothing here sets it.  (The flag is
 # meant to make XLA and ptxas work well together and to help with scaling.)
 print("XLA_FLAGS='{}'".format(os.getenv("XLA_FLAGS")))
 # Image geometry and training hyper-parameters.
 IMG_HEIGHT = 416
 IMG_WIDTH = 804
 batch_size = 32
 epochs = 4
 BUFFER_SIZE = 10000
 NUM_WORKERS = 6
 GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
 # XXX
 POSITIVE_DIRECTORY = '/home/jebba/devel/spacecruft/satnogs-wut/data/pos'
 pos_dir = '/home/jebba/devel/spacecruft/satnogs-wut/data/posdir'
 from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 # Synchronous multi-worker training using ring collectives.
 strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
     tf.distribute.experimental.CollectiveCommunication.RING
 )
 def get_bytes_and_label(filepath):
  """Read one file and derive its label from where the file lives.
  Args:
    filepath: path of the file to read; passed to tf.io.read_file and
      matched against the positive-directory pattern.
  Returns:
    A (raw_bytes, label) tuple: the file contents, and the boolean result
    of fully matching filepath against pos_dir followed by at least one
    more character.
  """
  raw_bytes = tf.io.read_file(filepath)
  # Match the path of *this* file.  The previous code matched the constant
  # POSITIVE_DIRECTORY instead, so every file received the same label.
  # NOTE(review): both POSITIVE_DIRECTORY and pos_dir exist with different
  # values; the pattern keeps using pos_dir -- confirm which one is real.
  label = tf.strings.regex_full_match(
      filepath, pos_dir + ".+")
  return raw_bytes, label
 def uncompiled_model():
  """Assemble the convolutional binary classifier without compiling it.
  The stack is three Conv2D + MaxPooling2D stages (16, 32 and 64 filters,
  3x3 kernels, 'same' padding, ReLU), then Flatten, a 512-unit ReLU Dense
  layer and a single sigmoid output unit.
  Returns:
    The Sequential model built from those layers.
  """
  # The first stage carries the input shape (height, width, 3 channels).
  stack = [
    Conv2D(16, 3, padding='same', activation='relu',
           input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    MaxPooling2D(),
  ]
  # Remaining convolution stages, created in the same order as listed.
  for filters in (32, 64):
    stack.append(Conv2D(filters, 3, padding='same', activation='relu'))
    stack.append(MaxPooling2D())
  # Classifier head.
  stack.append(Flatten())
  stack.append(Dense(512, activation='relu'))
  stack.append(Dense(1, activation='sigmoid'))
  return Sequential(stack)
 # Static shape declared on every decoded image: height, width, 3 channels.
 input_shape = (IMG_HEIGHT, IMG_WIDTH, 3)
 def process_image(image_bytes, label):
  """Decode PNG bytes and cast the label to float32.
  Args:
    image_bytes: encoded PNG contents, passed to tf.io.decode_png.
    label: value to be cast to tf.float32.
  Returns:
    A (image, label) tuple: the decoded image with its static shape set to
    input_shape, and the label as float32.
  """
  decoded = tf.io.decode_png(image_bytes)
  # Resizing is disabled, so the shape is only declared, not changed:
  #image = tf.image.resize(image, resolution)
  decoded.set_shape(input_shape)
  # Disabled normalisation / augmentation, kept for reference:
  #image = image / 255. - 0.5
  #image = tf.image.random_flip_left_right(image)
  #image = tf.image.random_flip_up_down(image)
  #image += tf.random.normal(
  #    image.shape, mean=0, stddev=0.1)
  float_label = tf.cast(label, tf.float32)
  return decoded, float_label
 AUTOTUNE = tf.data.experimental.AUTOTUNE
 NUM_TOTAL_IMAGES = 100
 data_root = "/home/jebba/devel/spacecruft/satnogs-wut/data"
 profile_dir = os.path.join(data_root, "profiles")
 # Input pipeline: list -> shuffle -> read + label -> decode -> batch -> prefetch.
 # NOTE(review): list_files receives the bare data_root path rather than a
 # glob for the image files -- confirm that this is the intended pattern.
 dataset = (
     tf.data.Dataset.list_files(data_root)
     .shuffle(NUM_TOTAL_IMAGES)
     .map(get_bytes_and_label, num_parallel_calls=AUTOTUNE)
     .map(process_image, num_parallel_calls=AUTOTUNE)
     .batch(batch_size=32)
     .prefetch(buffer_size=AUTOTUNE)
 )
 # Make sure the profile output directory exists.
 os.makedirs(profile_dir, exist_ok=True)
 # tf.data.Dataset.from_generator
 # Turn on XLA JIT compilation in the graph optimizer.
 tf.config.optimizer.set_jit(True)
 #tf.summary.trace_on(profiler=True)
 def compiled_model():
  """Build the classifier and compile it for binary classification.
  Returns:
    The model from uncompiled_model(), compiled with the Adam optimizer,
    binary cross-entropy loss and the accuracy metric.
  """
  compile_options = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
  }
  net = uncompiled_model()
  net.compile(**compile_options)
  return net
 with strategy.scope():
  # Build, compile and fit the model under the distribution strategy.
  #model = tf.keras.applications.mobilenet_v2.MobileNetV2(...)
  #optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
  #loss_fn = tf.nn.sigmoid_cross_entropy_with_logits
  #model.compile(..., optimizer=optimizer)
  # compiled_model() already calls uncompiled_model() itself, so the extra
  # uncompiled_model() call that used to precede it only built a model that
  # was thrown away by this assignment; it has been removed.
  model = compiled_model()
  #model.fit(train_dataset, epochs=10)
  # NOTE(review): in the visible code total_train / total_val are assigned
  # only further down the file, and train_data_gen / val_data_gen appear
  # only in commented-out code -- confirm they exist before this call runs.
  model.fit(
    train_data_gen,
    steps_per_epoch=total_train // batch_size,
    epochs=epochs,
    validation_data=val_data_gen,
    validation_steps=total_val // batch_size,
    verbose=2
    )
 #tf.summary.trace_export(name=trace-export,profiler_outdir=logs)
 with strategy.scope():
  #model, loss_fn, optimizer = ...
  # NOTE(review): replicated_step returns on its first statement, so every
  # line after that `return` is unreachable as written.  The remainder looks
  # like the body of a separate per-replica `step` function followed by a
  # training loop that were merged into this function -- confirm and split.
  @tf.function
  def replicated_step(features, labels):
    # Runs `step` once per replica with this batch.
    # NOTE(review): `step` is not defined anywhere in the visible code.
    return strategy.experimental_run_v2(step, (features, labels))
    # --- unreachable from here on ---
    with tf.GradientTape() as tape:
      logits = model(features, training=True)
      # NOTE(review): `loss` is read here before anything assigns it, and
      # `logits` is never used; a per-example loss computation appears to
      # be missing.  `global_batch_size` is not defined in the visible code
      # (GLOBAL_BATCH_SIZE is).
      loss = tf.nn.compute_average_loss(
          loss, global_batch_size=global_batch_size)
    grads = tape.gradient(loss, model.trainable_variables)
    # NOTE(review): `optimizer` is not defined in the visible code.
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss
    # NOTE(review): `data` is not defined in the visible code (`dataset`
    # is); this loop calls replicated_step and presumably belongs at module
    # level rather than inside the function.
    data = strategy.experimental_distribute_dataset(data)
    for features, labels in data:
      loss = replicated_step(features, labels)
 def data_generator():
  """Yield batches of (image, label) pairs loaded from ``data``.
  ``data`` is shuffled in place, each image is loaded with ``imread`` and
  collected together with its label; every time ``batch_size`` pairs have
  been collected, ``concat(batch)`` is yielded and a new batch is started.
  A trailing partial batch is not emitted.
  NOTE(review): ``shuffle``, ``data``, ``imread`` and ``concat`` are not
  defined in the visible part of this file; they must be provided before
  this generator is iterated.
  Yields:
    The result of ``concat`` applied to each full batch.
  """
  batch = []
  shuffle(data)
  for image_path, label in data:
    # Load from disk
    image = imread(image_path)
    # Resize
    # image = resize(image, resolution)
    # Horizontal and vertical flip
    #image = random_flip(image)
    # Normalize and add Gaussian noise
    #image = normalize_and_add_noise(image)
    batch.append((image, label))
    # This used to be the bare name `handle_batching`, an expression that
    # does nothing, so no batch was ever produced and the list grew without
    # bound.  Emit the batch here once it is full and start a fresh one.
    if len(batch) == batch_size:
      yield concat(batch)
      batch = []
 # XXX ?
 def handle_batching():
  """Yield concat(batch) and reset the batch once it holds batch_size items.
  This is a generator: calling it returns a generator object and the body
  only runs when that object is iterated.
  NOTE(review): `batch` and `concat` are not defined at module level in the
  visible code, so iterating the generator presumably raises NameError --
  confirm.  If `batch` is meant to be a list, note that list has no reset()
  method (list.clear() empties a list in place).
  """
  # Only a full batch is emitted; a shorter batch yields nothing.
  if len(batch) == batch_size:
    yield concat(batch)
    batch.reset()
 # Directory layout: data/{train,val}/{good,bad}
 train_dir, val_dir = (
     os.path.join('data/', split) for split in ('train', 'val')
 )
 train_good_dir, train_bad_dir = (
     os.path.join(train_dir, quality) for quality in ('good', 'bad')
 )
 val_good_dir, val_bad_dir = (
     os.path.join(val_dir, quality) for quality in ('good', 'bad')
 )
 # Count the directory entries of each class, in the same order as before.
 num_train_good, num_train_bad, num_val_good, num_val_bad = (
     len(os.listdir(directory))
     for directory in (train_good_dir, train_bad_dir, val_good_dir, val_bad_dir)
 )
 total_train = sum((num_train_good, num_train_bad))
 total_val = sum((num_val_good, num_val_bad))
 # Report the per-class and total counts for the training set ...
 print('total training good images:', num_train_good)
 print('total training bad images:', num_train_bad)
 print("--")
 print("Total training images:", total_train)
 # ... and for the validation set.
 print('total validation good images:', num_val_good)
 print('total validation bad images:', num_val_bad)
 print("--")
 print("Total validation images:", total_val)
 print("--")
 print("Reduce training and validation set when testing")
 # The overrides are disabled, so the "reduced" totals equal the full ones.
 #total_train = 16
 #total_val = 16
 print("Reduced training images:", total_train)
 print("Reduced validation images:", total_val)
 #train_image_generator = ImageDataGenerator(
 #    rescale=1./255
 #)
 #val_image_generator = ImageDataGenerator(
 #    rescale=1./255
 #)
 #train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,
 #                                                           directory=train_dir,
 #                                                           shuffle=True,
 #                                                           target_size=(IMG_HEIGHT, IMG_WIDTH),
 #                                                           class_mode='binary')
 #val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,
 #                                                       directory=val_dir,
 #                                                       target_size=(IMG_HEIGHT, IMG_WIDTH),
 #                                                       class_mode='binary')