more setup for cluster

master
ml server 2020-01-17 17:46:23 -07:00
parent 1770f749b2
commit 7efde346fe
3 changed files with 313 additions and 79 deletions

View File

@ -93,16 +93,6 @@
"from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.python.keras.models import Model\n",
"from tensorflow.python.keras.layers import Input, concatenate"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -158,6 +148,24 @@
"print(\"Python import done\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Load HDF file\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = load_model('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -268,25 +276,7 @@
"metadata": {},
"outputs": [],
"source": [
"plotImages(sample_test_images[0:3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load .h5 file here plz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = load_model('data/wut.h5')"
"plotImages(sample_test_images[0:1])"
]
},
{
@ -318,7 +308,7 @@
"source": [
"prediction = model.predict(\n",
" x=test_data_gen,\n",
" verbose=2\n",
" verbose=1\n",
")\n",
"print(\"end predict\")"
]
@ -377,32 +367,6 @@
"print('Observation: %s' % (rating))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if prediction_bool[1] == False:\n",
" rating = 'bad'\n",
"else:\n",
" rating = 'good'\n",
"print('Observation: %s' % (rating))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if prediction_bool[2] == False:\n",
" rating = 'bad'\n",
"else:\n",
" rating = 'good'\n",
"print('Observation: %s' % (rating))"
]
},
{
"cell_type": "code",
"execution_count": null,

View File

@ -6,11 +6,11 @@
"metadata": {},
"outputs": [],
"source": [
"# wut-train --- What U Think? SatNOGS Observation AI, training application.\n",
"# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.\n",
"#\n",
"# https://spacecruft.org/spacecruft/satnogs-wut\n",
"#\n",
"# Based on data/train and data/val directories builds a wut.h5 file."
"# Based on data/train and data/val directories builds a wut.tf file."
]
},
{
@ -149,6 +149,18 @@
"from IPython.display import display, Image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cluster\n",
"from __future__ import absolute_import, division, print_function, unicode_literals\n",
"import tensorflow as tf\n",
"import simplejson as json"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -158,6 +170,141 @@
"print(\"Python import done\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# CLUSTER"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Valid roles: \"chief\", \"worker\", \"ps\" and \"evaluator\".\n",
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
" \"cluster\": {\n",
" \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
" },\n",
" \"task\": {\"type\": \"worker\", \"index\": 1},\n",
" \"num_workers\": 5\n",
"})\n",
"# \"worker\": [\"ml1:port\", \"ml2:port\", \"ml3:port\", \"ml4:port\", \"ml5:port\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
"multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#os.environ[\"TF_CONFIG\"] = json.dumps({\n",
"# \"cluster\": {\n",
"# \"worker\": [\"host1:port\", \"host2:port\", \"host3:port\"]\n",
"# },\n",
"# \"task\": {\"type\": \"worker\", \"index\": 1}\n",
"#})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pick one Strategy Below\n",
"# moved further down above Sequence()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Central Storage Strategy\n",
"#central_storage_strategy = tf.distribute.experimental.CentralStorageStrategy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ParameterServerStrategy needs TF_CONFIG\n",
"#ps_strategy = tf.distribute.experimental.ParameterServerStrategy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# OneDeviceStrategy No cluster\n",
"#strategy = tf.distribute.OneDeviceStrategy(device=\"/CPU:0\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mirrored Strategy\n",
"#mirrored_strategy = tf.distribute.MirroredStrategy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Mirrored Strategy\n",
"#mirrored_strategy = tf.distribute.MirroredStrategy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# END CLUSTER"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -259,8 +406,8 @@
"outputs": [],
"source": [
"print(\"Reduce training and validation set when testing\")\n",
"#total_train = 1\n",
"#total_val = 1\n",
"total_train = 100\n",
"total_val = 100\n",
"print(\"Train =\")\n",
"print(total_train)\n",
"print(\"Validation =\")\n",
@ -284,8 +431,8 @@
"#epochs = 16 # BEST SO FAR\n",
"#\n",
"# Fast, but reasonable answers\n",
"batch_size = 64\n",
"epochs = 4\n",
"#batch_size = 64\n",
"#epochs = 4\n",
"# Faster, but reasonable answers ?\n",
"#batch_size = 32\n",
"#epochs = 2\n",
@ -295,8 +442,8 @@
"#epochs = 3\n",
"#\n",
"# Smallest set for testing\n",
"#batch_size = 1\n",
"#epochs = 1"
"batch_size = 8\n",
"epochs = 4"
]
},
{
@ -401,7 +548,7 @@
"metadata": {},
"outputs": [],
"source": [
"plotImages(sample_train_images[0:3])"
"#plotImages(sample_train_images[0:3])"
]
},
{
@ -410,7 +557,7 @@
"metadata": {},
"outputs": [],
"source": [
"plotImages(sample_val_images[0:3])"
"#plotImages(sample_val_images[0:3])"
]
},
{
@ -419,7 +566,47 @@
"metadata": {},
"outputs": [],
"source": [
"model = Sequential([\n",
"# CLUSTER"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# USE CPU only (doesn't work)\n",
"#import tensorflow as tf\n",
"#sess = Session(config=tf.ConfigProto(device_count={'GPU': 0}))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#with mirrored_strategy.scope():\n",
"# model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])\n",
"# model.compile(loss='mse', optimizer='sgd')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#with mirrored_strategy.scope():\n",
"with multiworker_strategy.scope():\n",
" model = Sequential([\n",
" Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
" MaxPooling2D(),\n",
" Conv2D(32, 3, padding='same', activation='relu'),\n",
@ -429,7 +616,10 @@
" Flatten(),\n",
" Dense(512, activation='relu'),\n",
" Dense(1, activation='sigmoid')\n",
"])"
" ])\n",
" model.compile(optimizer='adam',\n",
" loss='binary_crossentropy',\n",
" metrics=['accuracy'])"
]
},
{
@ -438,9 +628,7 @@
"metadata": {},
"outputs": [],
"source": [
"model.compile(optimizer='adam',\n",
" loss='binary_crossentropy',\n",
" metrics=['accuracy'])"
"strategy.num_replicas_in_sync"
]
},
{
@ -576,7 +764,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Save .h5 data here"
"model.to_json()"
]
},
{
@ -585,9 +773,80 @@
"metadata": {},
"outputs": [],
"source": [
"model.save('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
"# Save .tf model data here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save('data/models/DUV/wut-train-cluster.tf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save('data/models/DUV/wut-train-cluster.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save_weights('data/models/DUV/wut-weights-train-cluster.tf')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save_weights('data/models/DUV/wut-weights-train-cluster.h5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,

View File

@ -259,8 +259,8 @@
"outputs": [],
"source": [
"print(\"Reduce training and validation set when testing\")\n",
"#total_train = 100\n",
"#total_val = 100\n",
"#total_train = 1\n",
"#total_val = 1\n",
"print(\"Train =\")\n",
"print(total_train)\n",
"print(\"Validation =\")\n",
@ -280,12 +280,23 @@
"# Large Test\n",
"#batch_size = 512 # FAIL\n",
"#batch_size = 256 # FAIL\n",
"batch_size = 192\n",
"epochs = 16\n",
"#batch_size = 192 # BEST SO FAR\n",
"#epochs = 16 # BEST SO FAR\n",
"#\n",
"# Fast, but reasonable answers\n",
"batch_size = 64\n",
"epochs = 4\n",
"# Faster, but reasonable answers ?\n",
"#batch_size = 32\n",
"#epochs = 2\n",
"#\n",
"# Testing, faster more inaccurate results\n",
"#batch_size = 16\n",
"#epochs = 3"
"#epochs = 3\n",
"#\n",
"# Smallest set for testing\n",
"#batch_size = 1\n",
"#epochs = 1"
]
},
{
@ -574,7 +585,7 @@
"metadata": {},
"outputs": [],
"source": [
"model.save('data/hdf/wut-train.h5')"
"model.save('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
]
},
{