#!/bin/bash
# 4chan thread downloader.
#
# Usage: <script> <4chan thread url>
#
# This first part validates the thread URL passed as $1 and derives:
#   LOC - the thread number; used as the name of the saved HTML file and of
#         the directory that receives the downloaded files
#   ST  - host name of the static-asset server

# One anchored pattern both validates the URL and captures the thread number.
# Capturing only the digits keeps anything that follows the number in the URL
# (a "#p123" fragment, a trailing path, ...) out of LOC, which is later used
# in file names and inside "_"-delimited sed expressions.
thread_url_re='^https?://boards\.4chan\.org/[a-z0-9]+/res/([0-9]+)'
if [[ ! "${1:-}" =~ $thread_url_re ]]; then
  echo "Usage: ${0##*/} <4chan thread url>" >&2
  exit 1
fi
# The capture group requires at least one digit, so LOC is never empty here.
LOC=${BASH_REMATCH[1]}
ST="static.4chan.org"

echo "4chan downloader"
echo ""
echo "Downloading to $LOC"
echo ""
# Main polling loop. Every 30 seconds it re-fetches the thread page, collects
# the URLs of the images and page assets it references, downloads the ones not
# yet present, and rewrites the saved HTML/CSS to point at the local copies.
# The loop ends when the page download comes back empty or a local step fails.
#
# Reads: $1  - thread URL
#        LOC - thread number (names the HTML file and the download directory)
#        ST  - static-asset host name
# Sets:  CSS - file name of the downloaded stylesheet (may be empty)

# Remember where we started so the directory changes below can always be
# undone, independent of how deep the download directory is.
start_dir=$PWD

while true; do
  echo "------------------------------"
  echo ""
  echo "Downloading thread's HTML"
  if [[ -s "${LOC}.html" ]]; then
    # A saved copy exists: download next to it so a failed or shorter
    # download never replaces the page we already have.
    new_html="${LOC}.html.new"
    wget -np -nd -nH -q -erobots=off "$1" -O "$new_html"
    if [[ ! -s "$new_html" ]]; then
      echo "Thread has 404'd. Stopping script."
      rm -f -- "$new_html"
      exit 0
    fi
    # Reading from stdin makes wc print only the count (no file name), and
    # arithmetic evaluation tolerates any padding around the number.
    new_size=$(wc -c < "$new_html")
    old_size=$(wc -c < "${LOC}.html")
    # NOTE(review): the saved page has already had its URLs rewritten to
    # local paths, so this compares a raw download against a rewritten file.
    if (( new_size > old_size )); then
      mv -- "$new_html" "${LOC}.html"
    else
      rm -f -- "$new_html"
    fi
  else
    wget -np -nd -nH -q -erobots=off "$1" -O "${LOC}.html"
    if [[ ! -s "${LOC}.html" ]]; then
      echo "Thread doesn't exist. Stopping script."
      rm -f -- "${LOC}.html"
      exit 0
    fi
  fi

  if ! mkdir -p -- "${LOC}/misc"; then
    echo "Can't create directory ${LOC}/misc" >&2
    exit 1
  fi

  echo "Parsing HTML"
  # URL list of page assets: thumbnails, spoiler image, board title image,
  # favicon and, as the LAST line, the first stylesheet found.
  {
    grep -Eo "//[0-9].thumbs.4chan.org/[a-z0-9]+/thumb/([0-9]*)s.jpg" "${LOC}.html" \
      | sed 's_^//_http://_g'
    grep -Eo "//${ST}/image/spoiler.png" "${LOC}.html" \
      | sed 's_^//_http://_g' \
      | head -n1
    grep -Eo "//${ST}/image/title/[a-z]+/([0-9]*).(jpg|png|gif)" "${LOC}.html" \
      | sed 's_^//_http://_g'
    grep -Eo "//${ST}/image/favicon(.*).ico" "${LOC}.html" \
      | sed 's_^//_http://_g'
    grep -Eo "//${ST}/css/(.+)\.css" "${LOC}.html" \
      | sed -e 's_\.css_\.css\n_g' -e 's_//stat_\nhttp://stat_g' \
      | grep /css/ \
      | head -n1
  } > "${LOC}/misc/misc"
  # URL list of the full-size images.
  grep -Eo "//images.4chan.org/[a-z0-9]+/src/([0-9]*).(jpg|png|gif)" "${LOC}.html" \
    | sed 's_^//_http://_g' > "${LOC}/images"
  echo "Resources' lists created and filled with URLs"
  echo ""

  # Point every remote reference in the saved page at its local copy. The
  # result only replaces the page if sed succeeded.
  if sed \
    -e "s_//.\.thumbs\.4chan\.org/[a-z0-9]\+/thumb/\([0-9]\+\)s\.jpg_${LOC}/misc/\1s.jpg_g" \
    -e "s_//images\.4chan\.org/[a-z0-9]\+/src/\([0-9]\+\)\.\(jpg\|gif\|png\)_${LOC}/\1.\2_g" \
    -e "s_//${ST}/image/title/[a-z]\+/\([0-9]\+\)\.\(jpg\|gif\|png\)_${LOC}/misc/\1.\2_g" \
    -e "s_//${ST}/image/spoiler\.png_${LOC}/misc/spoiler.png_g" \
    -e "s_//${ST}/image/favicon\(.*\)\.ico_${LOC}/misc/favicon\1.ico_g" \
    -e "s_//${ST}/css/\(.\+\)\.css_${LOC}/misc/\1.css_g" \
    "${LOC}.html" > "${LOC}.html.new"; then
    mv -- "${LOC}.html.new" "${LOC}.html"
  else
    rm -f -- "${LOC}.html.new"
    echo "Can't rewrite ${LOC}.html" >&2
    exit 1
  fi

  if ! cd -- "${start_dir}/${LOC}"; then
    echo "Can't enter directory ${LOC}" >&2
    exit 1
  fi
  echo "Downloading full images"
  # -nc: files that already exist locally are not fetched again.
  wget -nc -q -i images
  rm -f -- images

  if ! cd -- "${start_dir}/${LOC}/misc"; then
    echo "Can't enter directory ${LOC}/misc" >&2
    exit 1
  fi
  echo "Downloading miscellaneous files"
  wget -nc -q -i misc

  # The stylesheet URL, when one was found, is the last line of the list;
  # its final path component is the name wget saved it under.
  CSS=$(tail -n1 misc)
  CSS=${CSS##*/}
  # Only touch the stylesheet if the last entry really is a downloaded CSS
  # file; with an empty name sed would otherwise wait on standard input.
  if [[ "$CSS" == *.css && -f "$CSS" ]]; then
    # Collect just the fade images the stylesheet references (-n ... p keeps
    # unrelated CSS lines out of the URL list), fetch them, then make the
    # stylesheet refer to the local copies.
    sed -n "s_.*fade\(.*\)\.png.*_http://${ST}/image/fade\1.png_gp" "$CSS" > misc
    wget -nc -q -i misc
    if sed 's_/image/fade\(.*\)\.png_fade\1.png_g' "$CSS" > "${CSS}.new"; then
      mv -- "${CSS}.new" "$CSS"
    else
      rm -f -- "${CSS}.new"
    fi
  fi
  rm -f -- misc
  touch .nomedia

  if ! cd -- "$start_dir"; then
    echo "Can't return to ${start_dir}" >&2
    exit 1
  fi
  echo "Session completed and temporary files've been cleaned up"
  echo "Waiting 30 seconds before next run"
  sleep 30
  echo ""
done