diff options
author | Silvio Rhatto <rhatto@riseup.net> | 2013-08-25 16:08:20 -0300 |
---|---|---|
committer | Silvio Rhatto <rhatto@riseup.net> | 2013-08-25 16:08:20 -0300 |
commit | 399d75317c9adf4c22fdbb3b6bf4be579ce1b9f2 (patch) | |
tree | 4871bbae283c4fdda68cf5f08eae725f5e692a30 | |
download | httruta-399d75317c9adf4c22fdbb3b6bf4be579ce1b9f2.tar.gz httruta-399d75317c9adf4c22fdbb3b6bf4be579ce1b9f2.tar.bz2 |
Initial import
-rw-r--r-- | README.mdwn | 7 | ||||
-rw-r--r-- | scuttle.sh | 48 |
2 files changed, 55 insertions, 0 deletions
diff --git a/README.mdwn b/README.mdwn
new file mode 100644
index 0000000..b5aff85
--- /dev/null
+++ b/README.mdwn
@@ -0,0 +1,7 @@
+Feed Crawler
+============
+
+Download all links from a feed using httrack.
+
+    curl $URL > urls.txt
+    httrack -r$LEVEL -e%$EXT_LEVEL -m$FILESIZE -Y -%L urls.txt
diff --git a/scuttle.sh b/scuttle.sh
new file mode 100644
index 0000000..d41405f
--- /dev/null
+++ b/scuttle.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Mirror every bookmark address stored in a SemanticScuttle database using httrack.
+BASEDIR=/var/sites/links
+SCUTTLEDIR=$(basename "$(find "${BASEDIR}" -maxdepth 1 -iname "SemanticScuttle-*" | head -n 1)")
+CONFIGFILE=${BASEDIR}/${SCUTTLEDIR}/data/config.php
+MIRRORDIR=${BASEDIR}/mirrors
+TMPDIR=/tmp
+# getconf KEY: for each config line matching KEY, print the text between its first pair of single quotes.
+getconf() {
+  grep -- "${1}" "${CONFIGFILE}" | sed -e "s/[^']*'//" -e "s/'.*\$//"
+}
+
+dbuser=$(getconf dbuser)
+dbpass=$(getconf dbpass)
+dbname=$(getconf dbname)
+dbhost=$(getconf dbhost)
+# sqlquery SQL: run SQL against the configured database; rows go to stdout without a header line.
+sqlquery() {
+  mysql --skip-column-names --batch \
+    --user="${dbuser}" \
+    --password="${dbpass}" \
+    --database="${dbname}" \
+    --host="${dbhost}" \
+    --execute="${1}"
+}
+
+# grabs URLs from db
+tmpfile=$(mktemp -p "${TMPDIR}") || exit 1
+trap 'rm -f -- "${tmpfile}"' EXIT  # do not leave the URL list behind in ${TMPDIR}
+chown links:links "${tmpfile}"
+chmod 600 "${tmpfile}"
+sqlquery "select bAddress from sc_bookmarks;" > "${tmpfile}"
+# creates target dir
+year=$(date +%Y)
+month=$(date +%m)
+day=$(date +%d)  # unused below: TARGETDIR is built from year and month only
+TARGETDIR=${MIRRORDIR}/${year}/${month}
+sudo -u links mkdir -p "${TARGETDIR}"
+
+# grabs URLs from the network
+httrack --verbose \
+  --user links \
+  --depth=1 \
+  --purge-old=0 \
+  --index \
+  --cookies=1 \
+  --list "${tmpfile}" \
+  --path "${TARGETDIR}"