wut-dl-sort script to organize downloads into data/
parent
1e4d21dc1b
commit
0ab498a879
|
@ -28,11 +28,12 @@ Learning/Testing, results are inaccurate.
|
|||
The following scripts are in the repo:
|
||||
|
||||
* `wut` --- Feed it an observation ID and it returns if it is a "good", "bad", or "failed" observation.
|
||||
* `wut-dl-sort` --- Populate `data/` dir with waterfalls from `download/`.
|
||||
* `wut-ml` --- Main machine learning Python script using Tensorflow and Keras.
|
||||
* `wut-obs` --- Download the JSON for an observation ID.
|
||||
* `wut-review-staging` --- Review all images in `data/staging`.
|
||||
* `wut-water` --- Download waterfall for an observation ID to `download/[ID]`.
|
||||
* `wut-water-range` --- Download waterfalls for a range of observation IDs to `download/[ID]`.
|
||||
* `wut-ml` --- Main machine learning Python script using Tensorflow and Keras.
|
||||
* `wut-review-staging` --- Review all images in `data/staging`.
|
||||
|
||||
|
||||
# Installation
|
||||
|
@ -99,8 +100,8 @@ The following steps need to be performed:
|
|||
These get put in the `download/[ID]/` directories.
|
||||
|
||||
1. Organize downloaded waterfalls into categories (e.g. "good", "bad", "failed").
|
||||
Note: this needs a script written.
|
||||
Put them into their respective directories under:
|
||||
Use `wut-dl-sort` script.
|
||||
The script puts them into their respective directories under:
|
||||
* `data/train/good/`
|
||||
* `data/train/bad/`
|
||||
* `data/train/failed/`
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
#!/bin/bash
# wut-dl-sort
#
# Populates the data/ directory from the download/ dir.
#
# XXX This script removes directories in data/ !!! XXX
#
# Usage:
# wut-dl-sort [Minimum Observation ID] [Maximum Observation ID]
# Example:
# wut-dl-sort 1292441 1470517
#
# * Takes the files in the download/ dir.
# * Looks at the JSON files to see if it is "good", "bad", or "failed".
# * Hard link it in the appropriate data/ directory.
# * File is randomly linked into either the data/train or data/val directory.
#
# Possible vetted_status: bad, failed, good, null, unknown.

# Abort on use of an unset variable (e.g. missing ID arguments).
set -u

OBSIDMIN="$1"
OBSIDMAX="$2"
OBSID="$OBSIDMIN"

# First make sure we have the range all downloaded:
echo "Downloading Waterfalls"
./wut-water-range "$OBSIDMIN" "$OBSIDMAX"

# XXX remove data/train and data/val directories XXX
echo "Removing data/ subdirectories"
rm -rf data/train data/val
# Create new empty dirs
mkdir -p data/train/good data/train/bad data/train/failed
mkdir -p data/val/good data/val/bad data/val/failed

# Then parse each file and link appropriately
echo "Parsing download/ directory for observation IDs $OBSIDMIN to $OBSIDMAX"
cd download/ || exit

# -le (not -lt): the usage/example treat OBSIDMAX as inclusive.
while [ "$OBSID" -le "$OBSIDMAX" ]; do
  echo "ID: $OBSID "

  # Skip IDs whose download directory is missing, so the cd/cd.. pairing
  # below can never drift out of sync.
  if [ ! -d "$OBSID" ]; then
    OBSID=$((OBSID + 1))
    continue
  fi
  cd "$OBSID" || exit

  # vetted_status straight from the observation JSON; --raw-output
  # replaces the fragile cat | jq | cut | sed pipeline.
  VET=$(jq --raw-output '.[0].vetted_status' "$OBSID.json")

  # 50/50 random split between training and validation sets.
  if [ $((RANDOM % 2)) -eq 1 ]; then
    CLASS_DIR="train"
  else
    CLASS_DIR="val"
  fi

  case "$VET" in
    # ${OBSID} braces are required: waterfall_$OBSID_*.png would expand
    # the (undefined) variable OBSID_ and the glob would never match.
    bad|good|failed)
      ln waterfall_"${OBSID}"_*.png "../../data/$CLASS_DIR/$VET/"
      ;;
    null)
      echo "null, not copying"
      ;;
    unknown)
      echo "unknown, not copying"
      ;;
  esac

  OBSID=$((OBSID + 1))
  cd ..
done
|
Loading…
Reference in New Issue