Merge branch 'master' of spacecruft.org:spacecruft/satnogs-wut
commit
f213d4da15
|
@ -2,7 +2,7 @@
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -17,7 +17,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -31,7 +31,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -53,7 +53,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -68,24 +68,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"tf 2.1.0\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"print('tf {}'.format(tf.__version__))"
|
"print('tf {}'.format(tf.__version__))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -100,7 +92,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -118,7 +110,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -127,7 +119,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"options = tf.data.Options()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -137,19 +138,9 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:0/device:XLA_CPU:0']\n",
|
|
||||||
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
|
|
||||||
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n",
|
"strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(\n",
|
||||||
" tf.distribute.experimental.CollectiveCommunication.RING)\n",
|
" tf.distribute.experimental.CollectiveCommunication.RING)\n",
|
||||||
|
@ -168,7 +159,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -188,24 +179,9 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"total training good images: 3291\n",
|
|
||||||
"total training bad images: 609\n",
|
|
||||||
"--\n",
|
|
||||||
"Total training images: 3900\n",
|
|
||||||
"total validation good images: 3361\n",
|
|
||||||
"total validation bad images: 601\n",
|
|
||||||
"--\n",
|
|
||||||
"Total validation images: 3962\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"print('total training good images:', num_train_good)\n",
|
"print('total training good images:', num_train_good)\n",
|
||||||
"print('total training bad images:', num_train_bad)\n",
|
"print('total training bad images:', num_train_bad)\n",
|
||||||
|
@ -219,20 +195,9 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"--\n",
|
|
||||||
"Reduce training and validation set when testing\n",
|
|
||||||
"Reduced training images: 3900\n",
|
|
||||||
"Reduced validation images: 3962\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"print(\"--\")\n",
|
"print(\"--\")\n",
|
||||||
"print(\"Reduce training and validation set when testing\")\n",
|
"print(\"Reduce training and validation set when testing\")\n",
|
||||||
|
@ -244,18 +209,9 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Found 3900 images belonging to 2 classes.\n",
|
|
||||||
"Found 3962 images belonging to 2 classes.\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"train_image_generator = ImageDataGenerator(\n",
|
"train_image_generator = ImageDataGenerator(\n",
|
||||||
" rescale=1./255\n",
|
" rescale=1./255\n",
|
||||||
|
@ -263,12 +219,14 @@
|
||||||
"val_image_generator = ImageDataGenerator(\n",
|
"val_image_generator = ImageDataGenerator(\n",
|
||||||
" rescale=1./255\n",
|
" rescale=1./255\n",
|
||||||
")\n",
|
")\n",
|
||||||
"train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
"#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
||||||
|
"train_data_gen = train_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
|
||||||
" directory=train_dir,\n",
|
" directory=train_dir,\n",
|
||||||
" shuffle=True,\n",
|
" shuffle=True,\n",
|
||||||
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
|
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
|
||||||
" class_mode='binary')\n",
|
" class_mode='binary')\n",
|
||||||
"val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
"#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,\n",
|
||||||
|
"val_data_gen = val_image_generator.flow_from_directory(batch_size=GLOBAL_BATCH_SIZE,\n",
|
||||||
" directory=val_dir,\n",
|
" directory=val_dir,\n",
|
||||||
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
|
" target_size=(IMG_HEIGHT, IMG_WIDTH),\n",
|
||||||
" class_mode='binary')"
|
" class_mode='binary')"
|
||||||
|
@ -276,7 +234,17 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#train_dist_dataset = strategy.experimental_distribute_dataset()\n",
|
||||||
|
"#val_dist_dataset = strategy.experimental_distribute_dataset()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -298,7 +266,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -313,25 +281,33 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#strategy.num_replicas_in_sync\n",
|
"strategy.num_replicas_in_sync"
|
||||||
"## Compute global batch size using number of replicas.\n",
|
|
||||||
"#BATCH_SIZE_PER_REPLICA = 5\n",
|
|
||||||
"#print(BATCH_SIZE_PER_REPLICA)\n",
|
|
||||||
"#global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
|
|
||||||
"# strategy.num_replicas_in_sync)\n",
|
|
||||||
"#print(global_batch_size)\n",
|
|
||||||
"#dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
|
|
||||||
"#dataset = dataset.batch(global_batch_size)\n",
|
|
||||||
"#LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"## Compute global batch size using number of replicas.\n",
|
||||||
|
"BATCH_SIZE_PER_REPLICA = 5\n",
|
||||||
|
"print(BATCH_SIZE_PER_REPLICA)\n",
|
||||||
|
"global_batch_size = (BATCH_SIZE_PER_REPLICA *\n",
|
||||||
|
" strategy.num_replicas_in_sync)\n",
|
||||||
|
"print(global_batch_size)\n",
|
||||||
|
"dataset = tf.data.Dataset.from_tensors(([1.], [1.])).repeat(100)\n",
|
||||||
|
"dataset = dataset.batch(global_batch_size)\n",
|
||||||
|
"LEARNING_RATES_BY_BATCH_SIZE = {5: 0.1, 10: 0.15}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -340,7 +316,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -361,7 +337,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -370,7 +346,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 21,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -384,7 +360,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -393,7 +369,18 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create a checkpoint directory to store the checkpoints.\n",
|
||||||
|
"checkpoint_dir = './training_checkpoints'\n",
|
||||||
|
"checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -403,7 +390,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 24,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -422,7 +409,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 25,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -432,7 +419,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 26,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -442,7 +429,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 27,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -452,7 +439,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 28,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -468,34 +455,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
|
|
||||||
"INFO:tensorflow:Collective batch_all_reduce: 1 all-reduces, num_workers = 6, communication_hint = RING\n",
|
|
||||||
"INFO:tensorflow:Running Distribute Coordinator with mode = 'independent_worker', cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, environment = None, rpc_layer = 'grpc'\n",
|
|
||||||
"WARNING:tensorflow:`eval_fn` is not passed in. The `worker_fn` will be used if an \"evaluator\" task exists in the cluster.\n",
|
|
||||||
"WARNING:tensorflow:`eval_strategy` is not passed in. No distribution strategy will be used for evaluation.\n",
|
|
||||||
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
|
|
||||||
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
|
|
||||||
"INFO:tensorflow:Using MirroredStrategy with devices ('/job:worker/task:0',)\n",
|
|
||||||
"INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['10.100.100.130:2222', 'ml1:2222', 'ml2:2222', 'ml3:2222', 'ml4:2222', 'ml5:2222']}, task_type = 'worker', task_id = 0, num_workers = 6, local_devices = ('/job:worker/task:0',), communication = CollectiveCommunication.RING\n",
|
|
||||||
"WARNING:tensorflow:ModelCheckpoint callback is not provided. Workers will need to restart training if any fails.\n",
|
|
||||||
"WARNING:tensorflow:sample_weight modes were coerced from\n",
|
|
||||||
" ...\n",
|
|
||||||
" to \n",
|
|
||||||
" ['...']\n",
|
|
||||||
"WARNING:tensorflow:sample_weight modes were coerced from\n",
|
|
||||||
" ...\n",
|
|
||||||
" to \n",
|
|
||||||
" ['...']\n",
|
|
||||||
"Train for 121 steps, validate for 123 steps\n",
|
|
||||||
"Epoch 1/4\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"with strategy.scope():\n",
|
"with strategy.scope():\n",
|
||||||
" model = get_compiled_model()\n",
|
" model = get_compiled_model()\n",
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# wut-tf
|
||||||
|
#
|
||||||
|
# Starts worker client.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# wut-tf
|
||||||
|
# Example:
|
||||||
|
# wut-tf
|
||||||
|
#
|
||||||
|
# Note:
|
||||||
|
# Each node needs a unique index number.
|
||||||
|
#
|
||||||
|
# NOTE!
|
||||||
|
# This generates the node number based off the hostname.
|
||||||
|
# The hosts are ml0 through ml5.
|
||||||
|
|
||||||
|
HOSTNUM=`hostname | sed -e 's/ml//g'`
|
||||||
|
|
||||||
|
#export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
|
||||||
|
export TF_CONFIG='{"cluster": {"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}}'
|
||||||
|
|
||||||
|
echo $TF_CONFIG
|
||||||
|
python3 wut-tf.py
|
||||||
|
|
|
@ -0,0 +1,60 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# wut-tf.py
|
||||||
|
#
|
||||||
|
# https://spacecruft.org/spacecruft/satnogs-wut
|
||||||
|
#
|
||||||
|
# Distributed Learning
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
from __future__ import print_function
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import numpy as np
|
||||||
|
import datetime
|
||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow.python.keras
|
||||||
|
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
|
||||||
|
from tensorflow.python.keras import optimizers
|
||||||
|
from tensorflow.python.keras import Sequential
|
||||||
|
from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense
|
||||||
|
from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
|
||||||
|
from tensorflow.python.keras.layers import Input, concatenate
|
||||||
|
from tensorflow.python.keras.models import load_model
|
||||||
|
from tensorflow.python.keras.models import Model
|
||||||
|
from tensorflow.python.keras.preprocessing import image
|
||||||
|
from tensorflow.python.keras.preprocessing.image import img_to_array
|
||||||
|
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from tensorflow.python.keras.preprocessing.image import load_img
|
||||||
|
os.environ["TF_CONFIG"] = json.dumps({
|
||||||
|
"cluster": {
|
||||||
|
"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
|
||||||
|
}#,
|
||||||
|
#"task": {"type": "worker", "index": 0 },
|
||||||
|
})
|
||||||
|
print("Tensorflow Version: ", tf.__version__)
|
||||||
|
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
|
||||||
|
print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
|
||||||
|
print(tf.config.experimental.list_physical_devices())
|
||||||
|
#with tf.device("GPU:0"):
|
||||||
|
# tf.ones(()) # Make sure we can run on GPU
|
||||||
|
print("XLA_FLAGS='{}'".format(os.getenv("XLA_FLAGS")))
|
||||||
|
print(os.getenv("XLA_FLAGS"))
|
||||||
|
tf.keras.backend.clear_session()
|
||||||
|
IMG_HEIGHT = 416
|
||||||
|
IMG_WIDTH= 804
|
||||||
|
batch_size = 32
|
||||||
|
epochs = 4
|
||||||
|
BUFFER_SIZE = 10000
|
||||||
|
NUM_WORKERS = 6
|
||||||
|
GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
|
||||||
|
#strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
|
||||||
|
#strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
|
||||||
|
# tf.distribute.experimental.CollectiveCommunication.RING)
|
||||||
|
AUTOTUNE = tf.data.experimental.AUTOTUNE
|
||||||
|
NUM_TOTAL_IMAGES=100
|
||||||
|
tf.config.optimizer.set_jit(True)
|
||||||
|
#tf.summary.trace_on(profiler=True)
|
||||||
|
#tf.summary.trace_export(name=trace-export,profiler_outdir=logs)
|
||||||
|
options = tf.data.Options()
|
||||||
|
|
|
@ -41,9 +41,15 @@ IMG_WIDTH= 804
|
||||||
batch_size = 32
|
batch_size = 32
|
||||||
epochs = 4
|
epochs = 4
|
||||||
|
|
||||||
|
BUFFER_SIZE = 10000
|
||||||
|
NUM_WORKERS = 6
|
||||||
|
GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
|
||||||
|
|
||||||
# XXX
|
# XXX
|
||||||
#tf.keras.backend.clear_session()
|
#tf.keras.backend.clear_session()
|
||||||
|
|
||||||
|
options = tf.data.Options()
|
||||||
|
|
||||||
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
|
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
|
||||||
tf.distribute.experimental.CollectiveCommunication.RING)
|
tf.distribute.experimental.CollectiveCommunication.RING)
|
||||||
|
|
||||||
|
@ -112,31 +118,6 @@ def get_compiled_model():
|
||||||
metrics=['accuracy'])
|
metrics=['accuracy'])
|
||||||
return model
|
return model
|
||||||
|
|
||||||
#def get_fit_model():
|
|
||||||
# model = get_compiled_model()
|
|
||||||
# model.fit(
|
|
||||||
# train_data_gen,
|
|
||||||
# steps_per_epoch=total_train // batch_size,
|
|
||||||
# epochs=epochs,
|
|
||||||
# validation_data=val_data_gen,
|
|
||||||
# validation_steps=total_val // batch_size,
|
|
||||||
# verbose=2
|
|
||||||
# )
|
|
||||||
# return model
|
|
||||||
|
|
||||||
#with strategy.scope():
|
|
||||||
# get_uncompiled_model()
|
|
||||||
#with strategy.scope():
|
|
||||||
# get_compiled_model()
|
|
||||||
#with strategy.scope():
|
|
||||||
# get_fit_model()
|
|
||||||
|
|
||||||
#multi_worker_model = get_compiled_model()
|
|
||||||
#multi_worker_model.fit(
|
|
||||||
# x=train_data_gen,
|
|
||||||
# epochs=epochs,
|
|
||||||
# steps_per_epoch=total_train // batch_size
|
|
||||||
# )
|
|
||||||
|
|
||||||
with strategy.scope():
|
with strategy.scope():
|
||||||
model = get_compiled_model()
|
model = get_compiled_model()
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# wut-worker-mas
|
||||||
|
#
|
||||||
|
# Starts worker client.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# wut-worker-mas
|
||||||
|
# Example:
|
||||||
|
# wut-worker-mas
|
||||||
|
#
|
||||||
|
# Note:
|
||||||
|
# Each node needs a unique index number.
|
||||||
|
#
|
||||||
|
# NOTE!
|
||||||
|
# This generates the node number based off the hostname.
|
||||||
|
# The hosts are ml0 through ml5.
|
||||||
|
|
||||||
|
HOSTNUM=`hostname | sed -e 's/ml//g'`
|
||||||
|
|
||||||
|
#export TF_CONFIG='{"cluster": {"worker": [ "ml0-int:2222", "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
|
||||||
|
#export TF_CONFIG='{"cluster": {"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}}'
|
||||||
|
export TF_CONFIG='{"cluster": {"chief": [ "ml0-int:2222" ], "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222"]}, "task": {"index": '$HOSTNUM', "type": "worker"}}'
|
||||||
|
|
||||||
|
echo $TF_CONFIG
|
||||||
|
python3 wut-worker-mas.py
|
||||||
|
|
|
@ -0,0 +1,242 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# wut-worker-mas.py
|
||||||
|
#
|
||||||
|
# https://spacecruft.org/spacecruft/satnogs-wut
|
||||||
|
#
|
||||||
|
# Distributed Learning
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
|
from __future__ import print_function
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import numpy as np
|
||||||
|
import datetime
|
||||||
|
import tensorflow as tf
|
||||||
|
import tensorflow.python.keras
|
||||||
|
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
|
||||||
|
from tensorflow.python.keras import optimizers
|
||||||
|
from tensorflow.python.keras import Sequential
|
||||||
|
from tensorflow.python.keras.layers import Activation, Dropout, Flatten, Dense
|
||||||
|
from tensorflow.python.keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
|
||||||
|
from tensorflow.python.keras.layers import Input, concatenate
|
||||||
|
from tensorflow.python.keras.models import load_model
|
||||||
|
from tensorflow.python.keras.models import Model
|
||||||
|
from tensorflow.python.keras.preprocessing import image
|
||||||
|
from tensorflow.python.keras.preprocessing.image import img_to_array
|
||||||
|
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator
|
||||||
|
from tensorflow.python.keras.preprocessing.image import load_img
|
||||||
|
#import tensorflow.python.distribute.cluster_resolver
|
||||||
|
#from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
|
||||||
|
#from tensorflow.python.distribute.cluster_resolver.TFConfigClusterResolver
|
||||||
|
|
||||||
|
tf.keras.backend.clear_session()
|
||||||
|
options = tf.data.Options()
|
||||||
|
os.environ["TF_CONFIG"] = json.dumps({
|
||||||
|
"cluster": {
|
||||||
|
"chief": [ "ml0-int:2222" ],
|
||||||
|
"worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
|
||||||
|
},
|
||||||
|
"task": {"type": "chief", "index": 0 },
|
||||||
|
})
|
||||||
|
#os.environ["TF_CONFIG"] = json.dumps({
|
||||||
|
# "cluster": {
|
||||||
|
# "worker": [ "ml1-int:2222", "ml2-int:2222", "ml3-int:2222", "ml4-int:2222", "ml5-int:2222" ]
|
||||||
|
# }#,
|
||||||
|
# #"task": {"type": "worker", "index": 0 },
|
||||||
|
#})
|
||||||
|
|
||||||
|
print("Tensorflow Version: ", tf.__version__)
|
||||||
|
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
|
||||||
|
print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
|
||||||
|
#with tf.device("GPU:0"):
|
||||||
|
# tf.ones(()) # Make sure we can run on GPU
|
||||||
|
|
||||||
|
# This ensures that XLA and ptxas work well together, and helps with scaling.
|
||||||
|
print("XLA_FLAGS='{}'".format(os.getenv("XLA_FLAGS")))
|
||||||
|
|
||||||
|
IMG_HEIGHT = 416
|
||||||
|
IMG_WIDTH= 804
|
||||||
|
batch_size = 32
|
||||||
|
epochs = 4
|
||||||
|
|
||||||
|
BUFFER_SIZE = 10000
|
||||||
|
NUM_WORKERS = 6
|
||||||
|
GLOBAL_BATCH_SIZE = 64 * NUM_WORKERS
|
||||||
|
|
||||||
|
# XXX
|
||||||
|
POSITIVE_DIRECTORY = '/home/jebba/devel/spacecruft/satnogs-wut/data/pos'
|
||||||
|
pos_dir = '/home/jebba/devel/spacecruft/satnogs-wut/data/posdir'
|
||||||
|
|
||||||
|
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
|
||||||
|
|
||||||
|
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy(
|
||||||
|
tf.distribute.experimental.CollectiveCommunication.RING)
|
||||||
|
|
||||||
|
|
||||||
|
def get_bytes_and_label(filepath):
|
||||||
|
raw_bytes = tf.io.read_file(filepath)
|
||||||
|
label = tf.strings.regex_full_match(
|
||||||
|
POSITIVE_DIRECTORY, pos_dir + ".+")
|
||||||
|
return raw_bytes, label
|
||||||
|
|
||||||
|
def uncompiled_model():
|
||||||
|
model = Sequential([
|
||||||
|
Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),
|
||||||
|
MaxPooling2D(),
|
||||||
|
Conv2D(32, 3, padding='same', activation='relu'),
|
||||||
|
MaxPooling2D(),
|
||||||
|
Conv2D(64, 3, padding='same', activation='relu'),
|
||||||
|
MaxPooling2D(),
|
||||||
|
Flatten(),
|
||||||
|
Dense(512, activation='relu'),
|
||||||
|
Dense(1, activation='sigmoid')
|
||||||
|
])
|
||||||
|
return model
|
||||||
|
|
||||||
|
input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)
|
||||||
|
def process_image(image_bytes, label):
|
||||||
|
image = tf.io.decode_png(image_bytes)
|
||||||
|
#image = tf.image.resize(image, resolution)
|
||||||
|
image.set_shape(input_shape)
|
||||||
|
#image = image / 255. - 0.5
|
||||||
|
#image = tf.image.random_flip_left_right(image)
|
||||||
|
#image = tf.image.random_flip_up_down(image)
|
||||||
|
#image += tf.random.normal(
|
||||||
|
# image.shape, mean=0, steddev=0.1)
|
||||||
|
return image, tf.cast(label, tf.float32)
|
||||||
|
|
||||||
|
AUTOTUNE = tf.data.experimental.AUTOTUNE
|
||||||
|
NUM_TOTAL_IMAGES=100
|
||||||
|
data_root = "/home/jebba/devel/spacecruft/satnogs-wut/data"
|
||||||
|
profile_dir = os.path.join(data_root, "profiles")
|
||||||
|
dataset = tf.data.Dataset.list_files(data_root)
|
||||||
|
dataset = dataset.shuffle(NUM_TOTAL_IMAGES)
|
||||||
|
dataset = dataset.map(get_bytes_and_label, num_parallel_calls=AUTOTUNE)
|
||||||
|
dataset = dataset.map(process_image, num_parallel_calls=AUTOTUNE)
|
||||||
|
dataset = dataset.batch(batch_size=32)
|
||||||
|
dataset = dataset.prefetch(buffer_size=AUTOTUNE)
|
||||||
|
|
||||||
|
|
||||||
|
os.makedirs(profile_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# tf.data.Dataset.from_generator
|
||||||
|
|
||||||
|
tf.config.optimizer.set_jit(True)
|
||||||
|
|
||||||
|
#tf.summary.trace_on(profiler=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def compiled_model():
|
||||||
|
model = uncompiled_model()
|
||||||
|
model.compile(optimizer='adam',
|
||||||
|
loss='binary_crossentropy',
|
||||||
|
metrics=['accuracy'])
|
||||||
|
return model
|
||||||
|
|
||||||
|
with strategy.scope():
|
||||||
|
#model = tf.keras.applications.mobilenet_v2.MobileNetV2(...)
|
||||||
|
#optimizer = tf.keras.optimzers.SGD(learning_rate=0.01)
|
||||||
|
#loss_fn = tf.nn.sigmoid_cross_entropy_with_logits
|
||||||
|
#model.compile(..., optimizer=optimizer)
|
||||||
|
model = uncompiled_model()
|
||||||
|
model = compiled_model()
|
||||||
|
#model.fit(train_dataset, epochs=10)
|
||||||
|
model.fit(
|
||||||
|
train_data_gen,
|
||||||
|
steps_per_epoch=total_train // batch_size,
|
||||||
|
epochs=epochs,
|
||||||
|
validation_data=val_data_gen,
|
||||||
|
validation_steps=total_val // batch_size,
|
||||||
|
verbose=2
|
||||||
|
)
|
||||||
|
|
||||||
|
#tf.summary.trace_export(name=trace-export,profiler_outdir=logs)
|
||||||
|
|
||||||
|
with strategy.scope():
|
||||||
|
#model, loss_fn, optimzer = ...
|
||||||
|
@tf.function
|
||||||
|
def replicated_step(features, labels):
|
||||||
|
return strategy.experimental_run_v2(step, (features, labels))
|
||||||
|
with tf.GradientTape() as tape:
|
||||||
|
logits = model(features, training=True)
|
||||||
|
loss = tf.nn.compute_average_loss(
|
||||||
|
loss, global_batch_size=global_batch_size)
|
||||||
|
|
||||||
|
grads = tape.gradient(loss, model.trainable_variables)
|
||||||
|
optimizer.apply_gradients(zip(grads, model.trainable_variables))
|
||||||
|
return loss
|
||||||
|
|
||||||
|
data = strategy.experimental_distribute_dataset(data)
|
||||||
|
|
||||||
|
for features, labels in data:
|
||||||
|
loss = replicated_step(features, labels)
|
||||||
|
|
||||||
|
def data_generator():
|
||||||
|
batch = []
|
||||||
|
shuffle(data)
|
||||||
|
for image_path, label in data:
|
||||||
|
# Load from disk
|
||||||
|
image = imread(image_path)
|
||||||
|
# Resize
|
||||||
|
# image = resize(image, resolution)
|
||||||
|
# Horizontal and vertical flip
|
||||||
|
#image = random_flip(image)
|
||||||
|
# Normalize and add Gaussian noise
|
||||||
|
#image = normalize_and_add_noise(image)
|
||||||
|
batch.append((image, label))
|
||||||
|
handle_batching
|
||||||
|
|
||||||
|
# XXX ?
|
||||||
|
def handle_batching():
|
||||||
|
if len(batch) == batch_size:
|
||||||
|
yield concat(batch)
|
||||||
|
batch.reset()
|
||||||
|
|
||||||
|
train_dir = os.path.join('data/', 'train')
|
||||||
|
val_dir = os.path.join('data/', 'val')
|
||||||
|
train_good_dir = os.path.join(train_dir, 'good')
|
||||||
|
train_bad_dir = os.path.join(train_dir, 'bad')
|
||||||
|
val_good_dir = os.path.join(val_dir, 'good')
|
||||||
|
val_bad_dir = os.path.join(val_dir, 'bad')
|
||||||
|
num_train_good = len(os.listdir(train_good_dir))
|
||||||
|
num_train_bad = len(os.listdir(train_bad_dir))
|
||||||
|
num_val_good = len(os.listdir(val_good_dir))
|
||||||
|
num_val_bad = len(os.listdir(val_bad_dir))
|
||||||
|
total_train = num_train_good + num_train_bad
|
||||||
|
total_val = num_val_good + num_val_bad
|
||||||
|
|
||||||
|
print('total training good images:', num_train_good)
|
||||||
|
print('total training bad images:', num_train_bad)
|
||||||
|
print("--")
|
||||||
|
print("Total training images:", total_train)
|
||||||
|
print('total validation good images:', num_val_good)
|
||||||
|
print('total validation bad images:', num_val_bad)
|
||||||
|
print("--")
|
||||||
|
print("Total validation images:", total_val)
|
||||||
|
print("--")
|
||||||
|
print("Reduce training and validation set when testing")
|
||||||
|
#total_train = 16
|
||||||
|
#total_val = 16
|
||||||
|
print("Reduced training images:", total_train)
|
||||||
|
print("Reduced validation images:", total_val)
|
||||||
|
|
||||||
|
|
||||||
|
#train_image_generator = ImageDataGenerator(
|
||||||
|
# rescale=1./255
|
||||||
|
#)
|
||||||
|
#val_image_generator = ImageDataGenerator(
|
||||||
|
# rescale=1./255
|
||||||
|
#)
|
||||||
|
|
||||||
|
#train_data_gen = train_image_generator.flow_from_directory(batch_size=batch_size,
|
||||||
|
# directory=train_dir,
|
||||||
|
# shuffle=True,
|
||||||
|
# target_size=(IMG_HEIGHT, IMG_WIDTH),
|
||||||
|
# class_mode='binary')
|
||||||
|
#val_data_gen = val_image_generator.flow_from_directory(batch_size=batch_size,
|
||||||
|
# directory=val_dir,
|
||||||
|
# target_size=(IMG_HEIGHT, IMG_WIDTH),
|
||||||
|
# class_mode='binary')
|
||||||
|
|
Loading…
Reference in New Issue