Merge branch 'master' of spacecruft.org:spacecruft/satnogs-wut

master
ml server 2020-01-20 13:47:55 -07:00
commit f213d4da15
6 changed files with 445 additions and 151 deletions

View File

@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -31,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -53,7 +53,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -68,24 +68,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf 2.1.0\n"
]
}
],
"outputs": [],
"source": [
"print('tf {}'.format(tf.__version__))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -100,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -118,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -127,7 +119,16 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"options = tf.data.Options()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -137,19 +138,9 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n"
]
}
],
"outputs": [],
"source": [
"strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n",
" tf.distribute.experimental.CollectiveCommunication.RING)\n",
@ -168,7 +159,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -188,24 +179,9 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total training good images: 3291\n",
"total training bad images: 609\n",
"--\n",
"Total training images: 3900\n",
"total validation good images: 3361\n",
"total validation bad images: 601\n",
"--\n",
"Total validation images: 3962\n"
]
}
],
"outputs": [],
"source": [
"print('total training good images:', num_train_good)\n",
"print('total training bad images:', num_train_bad)\n",
@ -219,20 +195,9 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--\n",
"Reduce training and validation set when testing\n",
"Reduced training images: 3900\n",
"Reduced validation images: 3962\n"
]
}
],
"outputs": [],
"source": [
"print(\"--\")\n",
"print(\"Reduce training and validation set when testing\")\n",
@ -244,18 +209,9 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3900 images belonging to 2 classes.\n",
"Found 3962 images belonging to 2 classes.\n"
]
}
],
"outputs": [],
"source": [
"train_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
@ -263,12 +219,14 @@
"val_image_generator = ImageDataGenerator(\n",
" rescale=1./255\n",
")\n",
"train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
"#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
"train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
" directory=train_dir,\n",
" shuffle=True,\n",
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
" class_mode='binary')\n",
"val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
"#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
"val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
" directory=val_dir,\n",
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
" class_mode='binary')"
@ -276,7 +234,17 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#train_dist_dataset = strategy.experimental_distribute_dataset()\n",
"#val_dist_dataset = strategy.experimental_distribute_dataset()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -298,7 +266,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -313,25 +281,33 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#strategy.num_replicas_in_sync\n",
"## Compute global batch size using number of replicas.\n",
"#BATCH_SIZE_PER_REPLICA = 5\n",
"#print(BATCH_SIZE_PER_REPLICA)\n",
"#global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
"# strategy.num_replicas_in_sync)\n",
"#print(global_batch_size)\n",
"#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
"#dataset = dataset.batch(global_batch_size)\n",
"#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
"strategy.num_replicas_in_sync"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Compute global batch size using number of replicas.\n",
"BATCH_SIZE_PER_REPLICA = 5\n",
"print(BATCH_SIZE_PER_REPLICA)\n",
"global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
" strategy.num_replicas_in_sync)\n",
"print(global_batch_size)\n",
"dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
"dataset = dataset.batch(global_batch_size)\n",
"LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -340,7 +316,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -361,7 +337,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -370,7 +346,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -384,7 +360,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -393,7 +369,18 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a checkpoint directory to store the checkpoints.\n",
"checkpoint_dir = './training_checkpoints'\n",
"checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -403,7 +390,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -422,7 +409,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -432,7 +419,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -442,7 +429,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -452,7 +439,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@ -468,34 +455,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
"INFO:tensorflow:Running Distribute Coordinator with mode = 'independent_worker', cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, environment = None, rpc_layer = 'grpc'\n",
"WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an \"evaluator\" task exists in the cluster.\n",
"WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.\n",
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
"WARNING:tensorflow:ModelCheckpoint callback is not provided. Workers will need to restart training if any fails.\n",
"WARNING:tensorflow:sample_weight modes were coerced from\n",
" ...\n",
" to \n",
" ['...']\n",
"WARNING:tensorflow:sample_weight modes were coerced from\n",
" ...\n",
" to \n",
" ['...']\n",
"Train for 121 steps, validate for 123 steps\n",
"Epoch 1/4\n"
]
}
],
"outputs": [],
"source": [
"with strategy.scope():\n",
" model = get_compiled_model()\n",

25
wut-tf 100755
View File

@ -0,0 +1,25 @@
#!/bin/bash
# wut-tf
#
# Starts worker client.
#
# Usage:
# wut-tf
# Example:
# wut-tf
#
# Note:
# Each node needs a unique index number.
#
# NOTE!
# This generates the node number based off the hostname.
# The hosts are ml0 through ml5.

# Strip the "ml" prefix from the hostname to obtain the node number.
# NOTE(review): HOSTNUM is only referenced by the commented-out TF_CONFIG
# below; the active TF_CONFIG carries no "task" entry, so it is unused.
HOSTNUM=$(hostname | sed -e 's/ml//g')
#export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
export TF_CONFIG='{"cluster": {"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}}'
# Quote the expansion so the JSON is printed verbatim, without word
# splitting or pathname expansion.
echo "$TF_CONFIG"
python3 wut-tf.py

60
wut-tf.py 100644
View File

@ -0,0 +1,60 @@
#!/usr/bin/env python3
#
# wut-tf.py
#
# https://spacecruft.org/spacecruft/satnogs-wut
#
# Distributed Learning
from __future__ import absolute_import, division, print_function, unicode_literals
from __future__ import print_function
import os
import json
import numpy as np
import datetime
import tensorflow as tf
import tensorflow.python.keras
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
from tensorflow.python.keras.layers import Input, concatenate
from tensorflow.python.keras.models import load_model
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing import image
from tensorflow.python.keras.preprocessing.image import img_to_array
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.preprocessing.image import load_img
# Describe the training cluster through the TF_CONFIG environment variable:
# five worker addresses and, with the "task" entry commented out, no
# per-node task.
# NOTE(review): this assignment replaces any TF_CONFIG value already present
# in the environment, including the one exported by the wut-tf wrapper.
os.environ["TF_CONFIG"] = json.dumps({
"cluster": {
"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
}#,
#"task": {"type": "worker", "index": 0 },
})
# Report the TensorFlow version and the physical devices it can see.
print("Tensorflow Version: ", tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
print(tf.config.experimental.list_physical_devices())
#with tf.device("GPU:0"):
# tf.ones(()) # Make sure we can run on GPU
# Print the XLA_FLAGS environment variable (shown twice: formatted, then raw).
print("XLA_FLAGS='{}'".format(os.getenv("XLA_FLAGS")))
print(os.getenv("XLA_FLAGS"))
tf.keras.backend.clear_session()
# Image size and training hyper-parameters.
IMG_HEIGHT = 416
IMG_WIDTH= 804
batch_size = 32
epochs = 4
BUFFER_SIZE = 10000
# NOTE(review): the cluster above lists five worker addresses while
# NUM_WORKERS is 6 -- confirm which one is intended.
NUM_WORKERS = 6
GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
# No distribution strategy is created in this part of the script; the
# candidate constructors are left commented out.
#strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
#strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
# tf.distribute.experimental.CollectiveCommunication.RING)
AUTOTUNE = tf.data.experimental.AUTOTUNE
NUM_TOTAL_IMAGES=100
# Turn on the optimizer's JIT setting (XLA, per the TensorFlow docs).
tf.config.optimizer.set_jit(True)
#tf.summary.trace_on(profiler=True)
#tf.summary.trace_export(name=trace-export,profiler_outdir=logs)
# Default tf.data options object; not used in the visible part of the script.
options = tf.data.Options()

View File

@ -41,9 +41,15 @@ IMG_WIDTH= 804
batch_size = 32
epochs = 4
BUFFER_SIZE = 10000
NUM_WORKERS = 6
GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
# XXX
#tf.keras.backend.clear_session()
options = tf.data.Options()
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
tf.distribute.experimental.CollectiveCommunication.RING)
@ -112,31 +118,6 @@ def get_compiled_model():
metrics=['accuracy'])
return model
#def get_fit_model():
# model = get_compiled_model()
# model.fit(
# train_data_gen,
# steps_per_epoch=total_train // batch_size,
# epochs=epochs,
# validation_data=val_data_gen,
# validation_steps=total_val // batch_size,
# verbose=2
# )
# return model
#with strategy.scope():
# get_uncompiled_model()
#with strategy.scope():
# get_compiled_model()
#with strategy.scope():
# get_fit_model()
#multi_worker_model = get_compiled_model()
#multi_worker_model.fit(
# x=train_data_gen,
# epochs=epochs,
# steps_per_epoch=total_train // batch_size
# )
with strategy.scope():
model = get_compiled_model()

26
wut-worker-mas 100755
View File

@ -0,0 +1,26 @@
#!/bin/bash
# wut-worker-mas
#
# Starts worker client.
#
# Usage:
# wut-worker-mas
# Example:
# wut-worker-mas
#
# Note:
# Each node needs a unique index number.
#
# NOTE!
# This generates the node number based off the hostname.
# The hosts are ml0 through ml5.

# Strip the "ml" prefix from the hostname to obtain the node number.
HOSTNUM=$(hostname | sed -e 's/ml//g')

# Refuse to continue when the hostname does not reduce to a plain number.
case "$HOSTNUM" in
  ''|*[!0-9]*)
    echo "wut-worker-mas: cannot derive node number from hostname '$(hostname)'" >&2
    exit 1
    ;;
esac

# The task index is the position inside the list for that task type.
# ml0 is the only "chief" entry (index 0); the "worker" list starts at ml1,
# so mlN maps to worker index N-1.
if [ "$HOSTNUM" -eq 0 ]; then
  TASK_TYPE="chief"
  TASK_INDEX=0
else
  TASK_TYPE="worker"
  TASK_INDEX=$((HOSTNUM - 1))
fi

#export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
#export TF_CONFIG='{"cluster": {"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}}'
export TF_CONFIG='{"cluster": {"chief": [ "ml0-int:2222" ], "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '"$TASK_INDEX"', "type": "'"$TASK_TYPE"'"}}'
# Quote the expansion so the JSON is printed verbatim.
echo "$TF_CONFIG"
python3 wut-worker-mas.py

242
wut-worker-mas.py 100644
View File

@ -0,0 +1,242 @@
#!/usr/bin/env python3
#
# wut-worker-mas.py
#
# https://spacecruft.org/spacecruft/satnogs-wut
#
# Distributed Learning
from __future__ import absolute_import, division, print_function, unicode_literals
from __future__ import print_function
import os
import json
import numpy as np
import datetime
import tensorflow as tf
import tensorflow.python.keras
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.python.keras import optimizers
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
from tensorflow.python.keras.layers import Input, concatenate
from tensorflow.python.keras.models import load_model
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing import image
from tensorflow.python.keras.preprocessing.image import img_to_array
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.python.keras.preprocessing.image import load_img
#import tensorflow.python.distribute.cluster_resolver
#from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
#from tensorflow.python.distribute.cluster_resolver.TFConfigClusterResolver
# Reset Keras global state before anything is built.
tf.keras.backend.clear_session()
# Default tf.data options object; not used in the visible part of the script.
options = tf.data.Options()
# Describe the cluster through TF_CONFIG: one chief (ml0) plus five workers
# (ml1..ml5), with this process declared as the chief, index 0.
# NOTE(review): this assignment replaces any TF_CONFIG already present in the
# environment, including the one exported by the wut-worker-mas wrapper, so
# every node running this script declares itself chief 0 -- confirm intent.
os.environ["TF_CONFIG"] = json.dumps({
"cluster": {
"chief": [ "ml0-int:2222" ],
"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
},
"task": {"type": "chief", "index": 0 },
})
# Earlier worker-only variant of the configuration, kept for reference.
#os.environ["TF_CONFIG"] = json.dumps({
# "cluster": {
# "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
# }#,
# #"task": {"type": "worker", "index": 0 },
#})
# Report the TensorFlow version and how many GPUs/CPUs it can see.
print("Tensorflow Version: ", tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
#with tf.device("GPU:0"):
# tf.ones(()) # Make sure we can run on GPU
# Print the XLA_FLAGS environment variable. This line only reports the value;
# nothing here sets or changes XLA flags.
print("XLA_FLAGS='{}'".format(os.getenv("XLA_FLAGS")))
# Image size and training hyper-parameters.
IMG_HEIGHT = 416
IMG_WIDTH= 804
batch_size = 32
epochs = 4
BUFFER_SIZE = 10000
# Six nodes in total: the chief plus the five workers listed above.
NUM_WORKERS = 6
GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
# XXX
# NOTE(review): two different "positive" directories are defined
# (.../data/pos and .../data/posdir); get_bytes_and_label reads both names --
# confirm which one actually holds the positive samples.
POSITIVE_DIRECTORY = '/home/jebba/devel/spacecruft/satnogs-wut/data/pos'
pos_dir = '/home/jebba/devel/spacecruft/satnogs-wut/data/posdir'
# SimpleClusterResolver is imported but not referenced in the visible file.
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
# Multi-worker mirrored strategy using RING collective communication; it is
# created after TF_CONFIG has been written to the environment above.
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
tf.distribute.experimental.CollectiveCommunication.RING)
def get_bytes_and_label(filepath):
    """Read one file and derive its label from its own path.

    Args:
        filepath: path of the file to read; it is handed to
            ``tf.io.read_file`` and matched against the positive directory.

    Returns:
        A ``(raw_bytes, label)`` tuple: the file contents and the result of
        ``tf.strings.regex_full_match``, which is true when ``filepath``
        fully matches ``pos_dir + ".+"``.
    """
    raw_bytes = tf.io.read_file(filepath)
    # Match the path of *this* file. Matching the POSITIVE_DIRECTORY constant
    # (as before) produced the same label for every element.
    # NOTE(review): pos_dir is used verbatim as a regular expression and
    # differs from POSITIVE_DIRECTORY -- confirm which directory is intended.
    label = tf.strings.regex_full_match(filepath, pos_dir + ".+")
    return raw_bytes, label
def uncompiled_model():
    """Build the convolutional classifier without compiling it.

    Returns:
        A ``Sequential`` model made of three Conv2D/MaxPooling2D stages
        (16, 32 and 64 filters, 3x3 kernels, 'same' padding, ReLU), followed
        by Flatten, Dense(512, ReLU) and a single sigmoid output unit.
    """
    image_shape = (IMG_HEIGHT, IMG_WIDTH, 3)

    # The first stage carries the input shape; the later stages only differ in
    # their filter count.
    stack = [
        Conv2D(16, 3, padding='same', activation='relu', input_shape=image_shape),
        MaxPooling2D(),
    ]
    for filters in (32, 64):
        stack.append(Conv2D(filters, 3, padding='same', activation='relu'))
        stack.append(MaxPooling2D())

    # Classification head.
    stack.append(Flatten())
    stack.append(Dense(512, activation='relu'))
    stack.append(Dense(1, activation='sigmoid'))

    return Sequential(stack)
# Static (height, width, channels) shape assigned to every decoded image.
input_shape = (IMG_HEIGHT, IMG_WIDTH, 3)


def process_image(image_bytes, label):
    """Decode PNG bytes and convert the label to float32.

    Args:
        image_bytes: encoded PNG data passed to ``tf.io.decode_png``.
        label: label value; it is cast to ``tf.float32``.

    Returns:
        A ``(image, label)`` tuple: the decoded image with its static shape
        set to ``input_shape`` and the label cast to ``tf.float32``.
    """
    decoded = tf.io.decode_png(image_bytes)
    # Only the static shape is set; no resizing, scaling or augmentation is
    # applied. The disabled steps are kept below for reference.
    #decoded = tf.image.resize(decoded, resolution)
    decoded.set_shape(input_shape)
    #decoded = decoded / 255. - 0.5
    #decoded = tf.image.random_flip_left_right(decoded)
    #decoded = tf.image.random_flip_up_down(decoded)
    #decoded += tf.random.normal(
    #    decoded.shape, mean=0, stddev=0.1)
    float_label = tf.cast(label, tf.float32)
    return decoded, float_label
AUTOTUNE = tf.data.experimental.AUTOTUNE
NUM_TOTAL_IMAGES = 100
data_root = "/home/jebba/devel/spacecruft/satnogs-wut/data"
profile_dir = os.path.join(data_root, "profiles")

# Input pipeline: list -> shuffle -> read + label -> decode -> batch -> prefetch.
# NOTE(review): list_files expects a file pattern and data_root contains no
# wildcard -- confirm this actually yields the image files.
# NOTE(review): the batch size is the literal 32 here, not the batch_size or
# GLOBAL_BATCH_SIZE variables.
dataset = (
    tf.data.Dataset.list_files(data_root)
    .shuffle(NUM_TOTAL_IMAGES)
    .map(get_bytes_and_label, num_parallel_calls=AUTOTUNE)
    .map(process_image, num_parallel_calls=AUTOTUNE)
    .batch(batch_size=32)
    .prefetch(buffer_size=AUTOTUNE)
)

# Make sure the profile output directory exists.
os.makedirs(profile_dir, exist_ok=True)
# tf.data.Dataset.from_generator
# Turn on the optimizer's JIT setting.
tf.config.optimizer.set_jit(True)
#tf.summary.trace_on(profiler=True)
def compiled_model():
    """Build the classifier and compile it for binary classification.

    Returns:
        The model from ``uncompiled_model()`` after ``compile`` was called
        with the Adam optimizer, binary cross-entropy loss and the accuracy
        metric.
    """
    compile_settings = {
        'optimizer': 'adam',
        'loss': 'binary_crossentropy',
        'metrics': ['accuracy'],
    }
    net = uncompiled_model()
    net.compile(**compile_settings)
    return net
# Build the model under the distribution strategy and train it.
# NOTE(review): indentation is not visible here, so which statements belong to
# the `with` body cannot be determined from this view.
with strategy.scope():
#model = tf.keras.applications.mobilenet_v2.MobileNetV2(...)
#optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
#loss_fn = tf.nn.sigmoid_cross_entropy_with_logits
#model.compile(..., optimizer=optimizer)
# NOTE(review): the model built on the next line is replaced immediately by
# compiled_model(), which builds its own model.
model = uncompiled_model()
model = compiled_model()
#model.fit(train_dataset, epochs=10)
# NOTE(review): train_data_gen and val_data_gen are only assigned in
# commented-out code further down, and total_train / total_val are assigned
# after this call -- as written these names are undefined at this point.
model.fit(
train_data_gen,
steps_per_epoch=total_train // batch_size,
epochs=epochs,
validation_data=val_data_gen,
validation_steps=total_val // batch_size,
verbose=2
)
#tf.summary.trace_export(name=trace-export,profiler_outdir=logs)
# Custom training loop sketch: run one training step on every replica for
# each batch of the distributed dataset.
# NOTE(review): this section reads like unfinished pseudo-code; `step`,
# `optimizer`, `global_batch_size` and `data` are not defined anywhere in the
# visible file (only GLOBAL_BATCH_SIZE and `dataset` are).
with strategy.scope():
#model, loss_fn, optimizer = ...
@tf.function
def replicated_step(features, labels):
# Dispatch `step` to the replicas through the strategy.
return strategy.experimental_run_v2(step, (features, labels))
# NOTE(review): the lines from here to `return loss` look like the body of the
# missing `step(features, labels)` function -- confirm.
with tf.GradientTape() as tape:
logits = model(features, training=True)
# NOTE(review): `loss` is passed in before it is assigned, and `logits` is
# never used; a per-example loss computation appears to be missing.
loss = tf.nn.compute_average_loss(
loss, global_batch_size=global_batch_size)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return loss
# Distribute the dataset across replicas and iterate over it once.
data = strategy.experimental_distribute_dataset(data)
for features, labels in data:
loss = replicated_step(features, labels)
# Sketch of a Python-side loader: shuffle `data`, load each image and collect
# (image, label) pairs in `batch`.
# NOTE(review): `shuffle`, `imread` and `data` are not imported or defined in
# the visible file, and this function contains no `yield` of its own.
def data_generator():
batch = []
shuffle(data)
for image_path, label in data:
# Load from disk
image = imread(image_path)
# Resize
# image = resize(image, resolution)
# Horizontal and vertical flip
#image = random_flip(image)
# Normalize and add Gaussian noise
#image = normalize_and_add_noise(image)
batch.append((image, label))
# NOTE(review): the next line only names the function; it does not call it.
handle_batching
# XXX ?
# Yield the concatenated batch once it holds batch_size items, then reset it.
# NOTE(review): `batch` is neither a parameter nor a module-level name in the
# visible file (it is local to data_generator), and `concat` is not defined.
def handle_batching():
if len(batch) == batch_size:
yield concat(batch)
# NOTE(review): if `batch` is a plain list this fails -- lists have clear(),
# not reset().
batch.reset()
# Dataset layout on disk: data/<split>/<class>, relative to the working directory.
train_dir, val_dir = (
    os.path.join('data/', split) for split in ('train', 'val')
)
train_good_dir, train_bad_dir = (
    os.path.join(train_dir, kind) for kind in ('good', 'bad')
)
val_good_dir, val_bad_dir = (
    os.path.join(val_dir, kind) for kind in ('good', 'bad')
)

# Number of directory entries per class folder, listed in the order
# train/good, train/bad, val/good, val/bad.
num_train_good, num_train_bad, num_val_good, num_val_bad = (
    len(os.listdir(folder))
    for folder in (train_good_dir, train_bad_dir, val_good_dir, val_bad_dir)
)
total_train = sum((num_train_good, num_train_bad))
total_val = sum((num_val_good, num_val_bad))

# Report the per-class and total counts.
print('total training good images:', num_train_good)
print('total training bad images:', num_train_bad)
print("--")
print("Total training images:", total_train)
print('total validation good images:', num_val_good)
print('total validation bad images:', num_val_bad)
print("--")
print("Total validation images:", total_val)

# The reduction overrides below are disabled, so the "reduced" totals printed
# here equal the full totals.
print("--")
print("Reduce training and validation set when testing")
#total_train = 16
#total_val = 16
print("Reduced training images:", total_train)
print("Reduced validation images:", total_val)
#train_image_generator = ImageDataGenerator(
# rescale=1./255
#)
#val_image_generator = ImageDataGenerator(
# rescale=1./255
#)
#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,
# directory=train_dir,
# shuffle=True,
# target_size=(IMG_HEIGHT, IMG_WIDTH),
# class_mode='binary')
#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,
# directory=val_dir,
# target_size=(IMG_HEIGHT, IMG_WIDTH),
# class_mode='binary')