#!/bin/bash
#
# Misc httracker functions.
#
# Loading this file runs httracker_initialize (see the last line), which reads
# the configuration, checks that httrack is installed, creates the base
# folders and takes the lockfile.
#
# Variables such as DEPTH, EXT_DEPTH, MIRRORS, MIRRORDIR, TMP, URLS, USER,
# GROUP, DIRNAME and the DB* settings are not assigned in this file;
# presumably they come from config.default / config -- verify there.
#

# Set common httrack options
#
# Globals:
#   DEPTH, EXT_DEPTH, TARGET (read)
#   OPTS (written)
#
# OPTS is a plain string which callers expand unquoted on purpose, so that it
# splits into separate httrack arguments. Values containing whitespace are
# therefore not supported.
function httracker_opts {
  OPTS=" --mirror                 \
         --continue               \
         --depth=${DEPTH}         \
         --ext-depth ${EXT_DEPTH} \
         --near                   \
         --purge-old=0            \
         --index                  \
         --cookies=1              \
         --path ${TARGET}"
}

# Download URLs, mirror mode
#
# Arguments:
#   $1 - URL to mirror
# Globals:
#   MIRRORS, USER, GROUP, DIRNAME, WKHTMLTOPDF, WKHTMLTOIMAGE, WEBKIT2PDF,
#   WEBKIT2PNG (read); TARGET, OPTS (written)
# Returns:
#   0, also when the download fails (the failure is only reported on stdout).
function httracker_get {
  # Options
  local url="$1"
  local hash first second me url_lower url_base

  # The hash input is kept exactly as "URL plus trailing newline" so that
  # directory names stay stable for already downloaded mirrors.
  hash="$(echo "$url" | sha1sum | cut -d ' ' -f 1)"
  first="${hash:0:2}"
  second="${hash:2:2}"
  me="$(whoami)"

  echo -n "Processing $url..."

  # Set target and make sure it exists
  # We use two levels of directories used for hashing,
  # to prevent too many things ending up in any one directory.
  # See https://git-annex.branchable.com/internals/
  TARGET="$MIRRORS/$first/$second/$hash"
  mkdir -p "$TARGET"

  # We already got this one
  if [[ -e "$TARGET/httracker-ok" ]]; then
    echo " skipping as it's already downloaded as $hash..."
    return
  else
    echo ""
  fi

  # Basic options
  httracker_opts

  # Additional options
  if [[ "$me" == "root" ]]; then
    OPTS="$OPTS --user $USER"
  fi

  # Fix permissions
  if [[ "$me" != "$USER" && "$me" == "root" ]]; then
    echo "Fixing $TARGET permissions..."
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  # Get each URL
  # shellcheck disable=SC2086 -- OPTS is deliberately word-split
  if httrack ${OPTS} "${url}"; then
    # Mark as downloaded
    date +%s > "$TARGET/httracker-ok"
  else
    echo "Error fetching $url."
    rm -rf "$TARGET"

    # The target folder is gone, so there is nothing left to symlink, to take
    # screenshots into or to chown: stop here instead of running every
    # remaining step against a missing directory.
    echo ""
    return 0
  fi

  # Add PDF symlink if needed
  url_lower="$(echo "$url" | tr '[:upper:]' '[:lower:]')"
  url_base="$(basename "$url")"

  # Stripping the .pdf suffix changes the name only when the URL ends in .pdf
  if [[ "$(basename "$url_lower")" != "$(basename "$url_lower" .pdf)" ]]; then
    ( cd "$TARGET" && find . -iname '*.pdf' -exec ln -s {} "$url_base" \; )
  fi

  # Save as PDF
  if [[ "$WKHTMLTOPDF" == "1" ]] \
    && command -v xvfb-run &> /dev/null \
    && command -v wkhtmltopdf &> /dev/null; then
    xvfb-run -a wkhtmltopdf "$url" "$TARGET/screenshot.pdf"
  fi

  # Save as PNG
  if [[ "$WKHTMLTOIMAGE" == "1" ]] \
    && command -v xvfb-run &> /dev/null \
    && command -v wkhtmltoimage &> /dev/null; then
    xvfb-run -a wkhtmltoimage "$url" "$TARGET/screenshot.png"
  fi

  # See https://www.insecure.ws/linux/serverless_screenshot.html
  #     http://gfdsa.gfdsa.org/2012/08/15/making-web-pages-screenshots-with-webkit2png-flash-included/
  #
  # Fallback, used only when no screenshot.pdf exists yet. The check is -e and
  # not -d, as screenshot.pdf is a regular file: with -d the check was always
  # true and the fallback ran again on top of an existing screenshot.
  if [[ ! -e "$TARGET/screenshot.pdf" ]]; then
    if [[ "$WEBKIT2PDF" == "1" ]] \
      && command -v xvfb-run &> /dev/null \
      && command -v webkit2pdf &> /dev/null; then
      ( cd "$TARGET" && xvfb-run -a webkit2pdf "$url" && mv 0000.pdf screenshot.pdf )
    fi
  fi

  # Get a screenshot
  # https://github.com/paulhammond/webkit2png/
  # https://github.com/adamn/python-webkit2png
  # https://snippets.aktagon.com/snippets/504-how-to-generate-screenshots-on-debian-linux-with-python-webkit2png
  #
  # Fallback as well, see the note on screenshot.pdf above.
  if [[ ! -e "$TARGET/screenshot.png" ]]; then
    if [[ "$WEBKIT2PNG" == "1" ]] && command -v xvfb-run &> /dev/null; then
      xvfb-run -a "$DIRNAME/webkit2png/webkit2png/webkit2png.py" -o "$TARGET/screenshot.png" "$url"
    fi
  fi

  # Fix permissions again
  if [[ "$me" != "$USER" && "$me" == "root" ]]; then
    echo "Fixing $TARGET permissions..."
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  # Done here
  echo ""
}

# Download URLs, incremental mode
#
# Globals:
#   MIRRORDIR, USER, GROUP, URLS (read); TARGET, OPTS (written)
function httracker_get_incremental {
  local me stamp

  me="$(whoami)"

  # Create TARGET dir
  # Year and month come from a single date call, so both always belong to the
  # same instant.
  stamp="$(date +%Y/%m)"
  TARGET="${MIRRORDIR}/${stamp}"
  sudo -u "${USER}" mkdir -p "${TARGET}"

  # Basic options
  httracker_opts

  # Additional options
  # Append to OPTS: assigning it afresh would throw away the basic options
  # set just above whenever this runs as root.
  if [[ "$me" == "root" ]]; then
    OPTS="$OPTS --user $USER"
  fi

  # Grab URLs from the network
  # shellcheck disable=SC2086 -- OPTS is deliberately word-split
  httrack ${OPTS} --list "${URLS}"

  # Fix permissions
  if [[ "$me" != "$USER" && "$me" == "root" ]]; then
    chown -R "$USER:$GROUP" "$TARGET/"
  fi
}

# Get SemanticScuttle parameter
#
# Arguments:
#   $1 - pattern selecting the line(s) of CONFIGFILE to read
# Outputs:
#   For each matching line, the text between its first single quote and the
#   following single quote.
function httracker_scuttle_config {
  grep -- "${1}" "${CONFIGFILE}" | sed -e "s/[^']*'//" -e "s/'.*\$//"
}

# Query a mysql database
#
# Arguments:
#   $1 - SQL statement to execute
# Globals:
#   DBUSER, DBPASS, DBNAME, DBHOST (read)
# Outputs:
#   Query result in batch format, without column names.
#
# NOTE(review): the password is handed over on the command line and can show
# up in process listings while mysql runs; consider an option file instead.
function httracker_sqlquery {
  mysql --skip-column-names --batch \
        --user="${DBUSER}"          \
        --password="${DBPASS}"      \
        --database="${DBNAME}"      \
        --host="${DBHOST}"          \
        --execute="${1}"
}

# Iterate over all URLs
#
# Globals:
#   URLS (read) - file holding the links
#   BASE (read) - used to locate lib/httracker/html.sed
function httracker_iterate {
  local i=1
  local t link

  t="$(wc -l < "$URLS")"

  # xargs does the tokenizing (whitespace separated, quotes removed) and hands
  # over one link per line. The list is read from file descriptor 3, so that
  # commands started inside the loop cannot swallow it through stdin, and the
  # shell never applies globbing to a link.
  while IFS= read -r link <&3; do
    # xargs may emit a single empty line when there is no input at all
    [[ -n "$link" ]] || continue

    # Fix entities
    link="$(printf '%s\n' "$link" | sed -f "$BASE/lib/httracker/html.sed")"

    echo "Processing item $i from $t total..."
    httracker_get "$link"

    i=$((i + 1))
  done 3< <(xargs -n 1 printf '%s\n' < "$URLS")
}

# Create basic folders
#
# Globals:
#   MIRRORS, TMP, USER, GROUP (read)
function httracker_setup_folders {
  mkdir -p "$MIRRORS" "$TMP"

  if [[ "$(whoami)" == "root" ]]; then
    echo "Fixing $MIRRORS permissions..."
    chown "$USER:$GROUP" "$MIRRORS"
    chown "$USER:$GROUP" "$TMP"
  fi
}

# Set basic environment
#
# Globals:
#   BASE, LOCKFILE (written); TMP (read)
#
# Exits with status 1 when a config file fails to load or httrack is missing.
function httracker_initialize {
  BASE="$(dirname -- "$0")"

  source "$BASE/config.default" || exit 1

  # Load custom config
  if [[ -e "$BASE/config" ]]; then
    source "$BASE/config" || exit 1
  fi

  # Check if httrack is available
  if ! command -v httrack &> /dev/null; then
    echo "error: httrack not avalable"
    exit 1
  fi

  # Create folders
  httracker_setup_folders

  # Lockfile
  LOCKFILE="${TMP}/$(basename -- "$0").lock"
  httracker_check_lockfile
  httracker_set_lockfile

  # Logfile
  #LOG="${TMP}/`basename $0`.log"
}

# Cleanup environment
#
# Globals:
#   URLS (read) - removed, if set
function httracker_teardown {
  httracker_unset_lockfile

  if [[ -n "$URLS" ]]; then
    rm -rf -- "$URLS"
  fi
}

# Create lockfile
#
# Does nothing when LOCKFILE is empty. Exits with status 1 when the lockfile
# cannot be created.
function httracker_set_lockfile {
  if [[ -n "$LOCKFILE" ]]; then
    mkdir -p "$(dirname -- "$LOCKFILE")"

    # With noclobber the redirection fails if the file exists already, so the
    # existence check and the creation are a single step.
    if ( set -o noclobber; echo "$$" > "$LOCKFILE" ) &> /dev/null; then
      trap 'httracker_unset_lockfile' INT TERM EXIT
    else
      echo "Could not create lockfile $LOCKFILE, exiting"
      exit 1
    fi
  fi
}

# Remove lockfile
function httracker_unset_lockfile {
  if [[ -n "$LOCKFILE" ]]; then
    rm -f -- "$LOCKFILE" || echo "Could not remove lockfile $LOCKFILE"
  fi
}

# Check lockfile
#
# Exits (status 0) when the process recorded in LOCKFILE is alive and has the
# same command name as the current shell. Any other lockfile is considered
# stale and removed.
function httracker_check_lockfile {
  local pid process

  if [[ -n "$LOCKFILE" && -f "$LOCKFILE" ]]; then
    pid="$(cat -- "$LOCKFILE")"

    # An empty lockfile names no process and is handled as a stale one. The
    # pid goes through -p, so ps either reports on that process or fails.
    if [[ -n "$pid" ]] \
      && process="$(ps --no-headers -o comm -p "$pid")" \
      && [[ "$(ps --no-headers -o comm -p "$$")" == "$process" ]]; then
      echo "Another program is running for $LOCKFILE, skipping run"
      exit 0
    else
      echo "Found old lockfile $LOCKFILE, removing it"
      httracker_unset_lockfile
    fi
  fi
}

# Initialize
httracker_initialize