wut-ogg2wav, misc

master 0.20
ml server 2020-01-06 15:35:22 -07:00
parent bdeb891fee
commit da2c7d2c49
4 changed files with 82 additions and 1 deletions

View File

@ -19,6 +19,8 @@
#
# So to get mostly all of the observations in December, 2019, run:
# wut-audio-archive 1292461 1470525
# Archive.org doesn't have everything from December, 2019 yet. Run:
# wut-audio-archive 1292461 1333333
#
# XXX Should check input is sane...
@ -32,7 +34,7 @@ cd $DOWNDIR || exit
# Download JSON
while [ $OBSID -lt $OBSIDMAX ]
do echo "ID: $OBSID"
do echo "Audio. ID: $OBSID"
mkdir -p $OBSID
cd $OBSID
# Download if it isn't there already

View File

@ -62,6 +62,7 @@ for i in */satnogs_*.ogg
echo "Re-download is good"
else
echo "Still bad after re-downloading"
rm "$AUDIOFILE"
fi
fi
cd ..

1
wut-ml
View File

@ -100,6 +100,7 @@ print("add")
# * JSON metadata
# * TLE
# * Audio File (ogg)
# https://www.tensorflow.org/io/api_docs/python/tfio/ffmpeg/AudioDataset
# * Decoded Data (HEX, ASCII, PNG)
# Data from external sources to consider adding:
# * Weather

77
wut-ogg2wav 100755
View File

@ -0,0 +1,77 @@
#!/bin/bash
# ogg2wav
#
# Convert .ogg files in download/ to .wav files.
#
# For every download/<OBSID>/satnogs_*.ogg the sha1 of the audio file named
# in <OBSID>.json is compared with the sha1 listed in the observation's
# archive.org "_files.xml". On a match the file is decoded with oggdec.
# On a mismatch the file is downloaded again and, if it still does not
# match, removed. Observations whose audio name or reference checksum
# cannot be determined are skipped and left untouched.
#
# Dependencies: vorbis-tools (oggdec), curl, jq, sgrep, sha1sum
#
# Usage:
# ogg2wav
# Example:
# ogg2wav

# Extra command line options handed to oggdec, one array element per word.
OGGDECOPT=()
# Upper bound (exclusive, seconds) of the random pause after a download.
# Must be >= 1. With 1 the pause is always 0 seconds.
MAXSLEEP=1

# Download URL $1 to file $2, then pause briefly if the download worked.
# --fail keeps HTTP error pages from being saved as if they were the file.
fetch() {
  curl \
    --fail \
    --location \
    --silent \
    --http2 --ipv4 \
    --remote-time \
    --output "$2" \
    "$1" \
    && sleep "$((RANDOM % MAXSLEEP))"
}

# Print the sha1 of file $1; prints nothing if the file cannot be read.
sha1_of() {
  sha1sum -- "$1" | cut -f 1 -d " "
}

# Verify and decode the audio of observation $1.
# The body is a subshell, so the cd never leaks into the caller and every
# early return leaves the main loop in download/.
process_observation() (
  obsid=$1
  cd "$obsid" || exit

  # See if there is an archive.org XML file, if not, download it.
  xml_url="https://archive.org/download/satnogs-observation-${obsid}/satnogs-observation-${obsid}_files.xml"
  xml_file=$(basename "$xml_url")
  [[ -f "$xml_file" ]] || fetch "$xml_url" "$xml_file"

  # Get name of audio file.
  audio_url=$(jq -r '.[0].archive_url // empty' "${obsid}.json" \
    | grep ogg \
    | sed -e 's/http:/https:/g')
  if [[ -z "$audio_url" ]]; then
    # Without a name sha1sum would sit waiting on stdin.
    echo "No archive.org audio URL in ${obsid}.json, skipping" >&2
    return 1
  fi
  audio_file=$(basename "$audio_url")

  if [[ ! -f "$xml_file" ]]; then
    echo "No $xml_file available, skipping" >&2
    return 1
  fi

  # Get sha1 for audio file. XXX sgrep dependency XXX
  xml_sha1=$(sgrep -g xml \
    '"<file name=\"'"$audio_file"'\" source=\"original\"" .. "/file>"' \
    "$xml_file" \
    | grep "<sha1>" \
    | sed -e 's/.*<sha1>//' -e 's/<\/sha1>//')
  if [[ -z "$xml_sha1" ]]; then
    # Nothing to compare against: the file can neither be called good nor bad.
    echo "No sha1 for $audio_file in $xml_file, skipping" >&2
    return 1
  fi

  file_sha1=$(sha1_of "$audio_file")
  printf 'XML: %s\nFile: %s ' "$xml_sha1" "$file_sha1"
  sleep 1

  if [[ "$xml_sha1" == "$file_sha1" ]]; then
    echo "Encode"
    echo "oggdec ${OGGDECOPT[*]} $audio_file"
    # Informational listing; complains on stderr while no .wav exists yet.
    ls -hl -- "$audio_file" *wav
    nice oggdec "${OGGDECOPT[@]}" "$audio_file"
    return
  fi

  echo "Bad, re-downloading $audio_url"
  rm -f -- "$audio_file"
  fetch "$audio_url" "$audio_file"
  file_sha1=$(sha1_of "$audio_file")
  printf 'XML: %s\nFile: %s ' "$xml_sha1" "$file_sha1"

  if [[ "$xml_sha1" == "$file_sha1" ]]; then
    echo "Re-download is good, encode"
    echo "oggdec ${OGGDECOPT[*]} $audio_file"
    nice oggdec "${OGGDECOPT[@]}" "$audio_file"
  else
    echo "Still bad after re-downloading, remove"
    rm -f -- "$audio_file"
  fi
)

cd download/ || exit

# Compile a list of ogg files. The glob is expanded by the shell itself, so
# no external command has to take the whole list as arguments.
shopt -s nullglob
OGGFILES=(*/satnogs_*.ogg)
shopt -u nullglob
echo "Total audio files: ${#OGGFILES[@]}"

for i in "${OGGFILES[@]}"; do
  OBSID=$(dirname -- "$i")
  # Go into directories with audiofiles
  echo "$OBSID"
  process_observation "$OBSID"
done