#! /bin/bash
#
# This script may be used to download many SPSS and CPSS dissections from
# www.squaring.net. Needs modification if website contents change. 
#

# Root of the web site; every download URL below is built from it.
URLROOT='http://www.squaring.net'

#
# 1) Determine what to download: SPSS or CPSS (default to SPSS).
#
# The first command line argument selects the type; when the script is
# called without any argument the type is spss. LIST receives the numbers
# (one per order) that the download loop iterates over. Any other type
# ends the script with exit status 1.
#
TYPE=${1-spss}
if [[ "$TYPE" == "spss" ]]; then
  LIST=$(seq 21 75)
elif [[ "$TYPE" == "cpss" ]]; then
  LIST=$(seq 24 86)
else
  echo "Download type must be spss or cpss, sorry."
  exit 1
fi

#
# 2) Go to a special working directory.
#
# The directory ./<type>_htmlx is created when it does not exist yet and
# becomes the current directory; *.htmlx files left there by an earlier
# run are then removed.
#
# -p makes an already existing directory a non-error, so real mkdir
# failures are no longer hidden.
mkdir -p "./${TYPE}_htmlx"

# Stop here if the directory cannot be entered: otherwise the rm below
# and all later steps would operate on the caller's directory instead.
cd "./${TYPE}_htmlx" || {
  echo "Cannot enter working directory ./${TYPE}_htmlx, sorry." >&2
  exit 1
}

# The ./ prefix keeps a file name starting with '-' from being taken as an
# option by rm.
rm -f ./*.htmlx

#
# 3) Download the primary HTML files.
#
# For every number n in LIST the page
#   $URLROOT/sq/ss/$TYPE/o<n>/<TYPE>o<n>.html
# is fetched into <n>.htmlx in the current directory. The URL is printed
# followed by " got" or " NOT got" depending on the wget exit status.
# RECOVERY ends up as 1 only when the alternative URL for spss order 31
# had to be used and that download succeeded; otherwise it stays 0.
#
RECOVERY=0
for order in $LIST; do
  page_url="${URLROOT}/sq/ss/${TYPE}/o${order}/${TYPE}o${order}.html"
  echo -n "$page_url"
  if wget -q "$page_url" -O "${order}.htmlx"; then
    echo " got"
    continue
  fi

  #
  # recovery hack for order 31 (to be removed in the future)
  #
  # Everything except spss order 31 is simply reported as failed.
  if [[ "$TYPE" != "spss" || "$order" != 31 ]]; then
    echo " NOT got"
    continue
  fi

  # Second attempt for spss order 31: same page below the "o31+" directory.
  echo " not got, trying to recover..."
  page_url="${URLROOT}/sq/ss/spss/o${order}+/spsso${order}.html"
  echo -n "$page_url"
  if wget -q "$page_url" -O "${order}.htmlx"; then
    echo " got"
    RECOVERY=1
  else
    echo " NOT got"
  fi
done

#
# 4) Append contents of secondary HTML files to their primary file if needed.
#
# Each primary file <n>.htmlx in the current directory is searched for
# lines that contain $TYPE, "li", "Order" and "-". The text between the
# first pair of double quotes on such a line (presumably the href of a
# link to a secondary page -- verify against the site) is taken as a file
# name relative to $URLROOT/sq/ss/$TYPE/o<n>/, downloaded, and appended to
# the primary file.
#
for i in *.htmlx
do
  # When no file matches, the pattern is left as the literal "*.htmlx";
  # skip it instead of handing a non-existent file name to grep.
  [ -e "$i" ] || continue

  I=$(basename "$i" .htmlx)

  #
  # part 2 of recovery hack for order 31 (to be removed in the future)
  #
  # If the primary page of order 31 came from the "o31+" directory, its
  # secondary pages are requested from there as well.
  if [ "$I" = 31 ] && [ "$RECOVERY" = 1 ]
  then
    I=31+
  fi

  # The link list is computed completely before the loop body runs, so
  # appending to "$i" below does not influence which pages are fetched.
  for j in $(grep -- "$TYPE" "$i" | grep -e li | grep Order | grep -- - | awk -F\" '{print $2}')
  do
    URL=$URLROOT/sq/ss/${TYPE}/o$I/$j
    echo -n "$URL"
    if wget -q "$URL" -O - >> "$i"
    then
      echo " got"
    else
      echo " NOT got"
    fi
  done
done

#
# 5) Finally, make the list spss.txt or cpss.txt from download files.
#
# Pipeline stages, in order:
#   - select the lines containing "option value" from all *.htmlx files,
#   - drop those that contain "image",
#   - keep the text between the first pair of double quotes,
#   - turn parentheses, tabs and commas into spaces (intent of the tr sets).
# The result is written one directory above the working directory.
#
grep "option value" *.htmlx \
  | grep -v image \
  | awk -F'"' '{print $2}' \
  | tr '\(\)\t,' '    ' \
  > ../"$TYPE".txt
