more setup for cluster
parent
1770f749b2
commit
7efde346fe
|
@ -93,16 +93,6 @@
|
|||
"from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tensorflow.python.keras.models import Model\n",
|
||||
"from tensorflow.python.keras.layers import Input, concatenate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -158,6 +148,24 @@
|
|||
"print(\"Python import done\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Load HDF file\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = load_model('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -268,25 +276,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plotImages(sample_test_images[0:3])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load .h5 file here plz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = load_model('data/wut.h5')"
|
||||
"plotImages(sample_test_images[0:1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -318,7 +308,7 @@
|
|||
"source": [
|
||||
"prediction = model.predict(\n",
|
||||
" x=test_data_gen,\n",
|
||||
" verbose=2\n",
|
||||
" verbose=1\n",
|
||||
")\n",
|
||||
"print(\"end predict\")"
|
||||
]
|
||||
|
@ -377,32 +367,6 @@
|
|||
"print('Observation: %s' % (rating))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if prediction_bool[1] == False:\n",
|
||||
" rating = 'bad'\n",
|
||||
"else:\n",
|
||||
" rating = 'good'\n",
|
||||
"print('Observation: %s' % (rating))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if prediction_bool[2] == False:\n",
|
||||
" rating = 'bad'\n",
|
||||
"else:\n",
|
||||
" rating = 'good'\n",
|
||||
"print('Observation: %s' % (rating))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
|
@ -6,11 +6,11 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# wut-train --- What U Think? SatNOGS Observation AI, training application.\n",
|
||||
"# wut-train-cluster --- What U Think? SatNOGS Observation AI, training application cluster edition.\n",
|
||||
"#\n",
|
||||
"# https://spacecruft.org/spacecruft/satnogs-wut\n",
|
||||
"#\n",
|
||||
"# Based on data/train and data/val directories builds a wut.h5 file."
|
||||
"# Based on data/train and data/val directories builds a wut.tf file."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -149,6 +149,18 @@
|
|||
"from IPython.display import display, Image"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Cluster\n",
|
||||
"from __future__ import absolute_import, division, print_function, unicode_literals\n",
|
||||
"import tensorflow as tf\n",
|
||||
"import simplejson as json"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -158,6 +170,141 @@
|
|||
"print(\"Python import done\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# CLUSTER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Valid roles: \"chief\", \"worker\", \"ps\" and \"evaluator\".\n",
|
||||
"os.environ[\"TF_CONFIG\"] = json.dumps({\n",
|
||||
" \"cluster\": {\n",
|
||||
" \"worker\": [ \"ml1:2222\", \"ml2:2222\", \"ml3:2222\", \"ml4:2222\", \"ml5:2222\" ]\n",
|
||||
" },\n",
|
||||
" \"task\": {\"type\": \"worker\", \"index\": 1},\n",
|
||||
" \"num_workers\": 5\n",
|
||||
"})\n",
|
||||
"# \"worker\": [\"ml1:port\", \"ml2:port\", \"ml3:port\", \"ml4:port\", \"ml5:port\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# MultiWorkerMirroredStrategy needs TF_CONFIG\n",
|
||||
"multiworker_strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#os.environ[\"TF_CONFIG\"] = json.dumps({\n",
|
||||
"# \"cluster\": {\n",
|
||||
"# \"worker\": [\"host1:port\", \"host2:port\", \"host3:port\"]\n",
|
||||
"# },\n",
|
||||
"# \"task\": {\"type\": \"worker\", \"index\": 1}\n",
|
||||
"#})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Pick one Strategy Below\n",
|
||||
"# moved further down above Sequence()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Central Storage Strategy\n",
|
||||
"#central_storage_strategy = tf.distribute.experimental.CentralStorageStrategy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ParameterServerStrategy needs TF_CONFIG\n",
|
||||
"#ps_strategy = tf.distribute.experimental.ParameterServerStrategy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# OneDeviceStrategy No cluster\n",
|
||||
"#strategy = tf.distribute.OneDeviceStrategy(device=\"/CPU:0\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Mirrored Strategy\n",
|
||||
"#mirrored_strategy = tf.distribute.MirroredStrategy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Mirrored Strategy\n",
|
||||
"#mirrored_strategy = tf.distribute.MirroredStrategy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# END CLUSTER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -259,8 +406,8 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Reduce training and validation set when testing\")\n",
|
||||
"#total_train = 1\n",
|
||||
"#total_val = 1\n",
|
||||
"total_train = 100\n",
|
||||
"total_val = 100\n",
|
||||
"print(\"Train =\")\n",
|
||||
"print(total_train)\n",
|
||||
"print(\"Validation =\")\n",
|
||||
|
@ -284,8 +431,8 @@
|
|||
"#epochs = 16 # BEST SO FAR\n",
|
||||
"#\n",
|
||||
"# Fast, but reasonable answers\n",
|
||||
"batch_size = 64\n",
|
||||
"epochs = 4\n",
|
||||
"#batch_size = 64\n",
|
||||
"#epochs = 4\n",
|
||||
"# Faster, but reasonable answers ?\n",
|
||||
"#batch_size = 32\n",
|
||||
"#epochs = 2\n",
|
||||
|
@ -295,8 +442,8 @@
|
|||
"#epochs = 3\n",
|
||||
"#\n",
|
||||
"# Smallest set for testing\n",
|
||||
"#batch_size = 1\n",
|
||||
"#epochs = 1"
|
||||
"batch_size = 8\n",
|
||||
"epochs = 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -401,7 +548,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plotImages(sample_train_images[0:3])"
|
||||
"#plotImages(sample_train_images[0:3])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -410,7 +557,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"plotImages(sample_val_images[0:3])"
|
||||
"#plotImages(sample_val_images[0:3])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -419,7 +566,47 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = Sequential([\n",
|
||||
"# CLUSTER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# USE CPU only (doesn't work)\n",
|
||||
"#import tensorflow as tf\n",
|
||||
"#sess = Session(config=tf.ConfigProto(device_count={'GPU': 0}))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#with mirrored_strategy.scope():\n",
|
||||
"# model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])\n",
|
||||
"# model.compile(loss='mse', optimizer='sgd')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#with mirrored_strategy.scope():\n",
|
||||
"with multiworker_strategy.scope():\n",
|
||||
" model = Sequential([\n",
|
||||
" Conv2D(16, 3, padding='same', activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH ,3)),\n",
|
||||
" MaxPooling2D(),\n",
|
||||
" Conv2D(32, 3, padding='same', activation='relu'),\n",
|
||||
|
@ -429,7 +616,10 @@
|
|||
" Flatten(),\n",
|
||||
" Dense(512, activation='relu'),\n",
|
||||
" Dense(1, activation='sigmoid')\n",
|
||||
"])"
|
||||
" ])\n",
|
||||
" model.compile(optimizer='adam',\n",
|
||||
" loss='binary_crossentropy',\n",
|
||||
" metrics=['accuracy'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -438,9 +628,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.compile(optimizer='adam',\n",
|
||||
" loss='binary_crossentropy',\n",
|
||||
" metrics=['accuracy'])"
|
||||
"strategy.num_replicas_in_sync"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -576,7 +764,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Save .h5 data here"
|
||||
"model.to_json()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -585,9 +773,80 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
|
||||
"# Save .tf model data here"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save('data/models/DUV/wut-train-cluster.tf')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save('data/models/DUV/wut-train-cluster.h5')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save_weights('data/models/DUV/wut-weights-train-cluster.tf')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save_weights('data/models/DUV/wut-weights-train-cluster.h5')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
|
|
@ -259,8 +259,8 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Reduce training and validation set when testing\")\n",
|
||||
"#total_train = 100\n",
|
||||
"#total_val = 100\n",
|
||||
"#total_train = 1\n",
|
||||
"#total_val = 1\n",
|
||||
"print(\"Train =\")\n",
|
||||
"print(total_train)\n",
|
||||
"print(\"Validation =\")\n",
|
||||
|
@ -280,12 +280,23 @@
|
|||
"# Large Test\n",
|
||||
"#batch_size = 512 # FAIL\n",
|
||||
"#batch_size = 256 # FAIL\n",
|
||||
"batch_size = 192\n",
|
||||
"epochs = 16\n",
|
||||
"#batch_size = 192 # BEST SO FAR\n",
|
||||
"#epochs = 16 # BEST SO FAR\n",
|
||||
"#\n",
|
||||
"# Fast, but reasonable answers\n",
|
||||
"batch_size = 64\n",
|
||||
"epochs = 4\n",
|
||||
"# Faster, but reasonable answers ?\n",
|
||||
"#batch_size = 32\n",
|
||||
"#epochs = 2\n",
|
||||
"#\n",
|
||||
"# Testing, faster more inaccurate results\n",
|
||||
"#batch_size = 16\n",
|
||||
"#epochs = 3"
|
||||
"#epochs = 3\n",
|
||||
"#\n",
|
||||
"# Smallest set for testing\n",
|
||||
"#batch_size = 1\n",
|
||||
"#epochs = 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -574,7 +585,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.save('data/hdf/wut-train.h5')"
|
||||
"model.save('data/hdf/wut-KgazZMKEa74VnquqXLwAvD.h5')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue