From 399d75317c9adf4c22fdbb3b6bf4be579ce1b9f2 Mon Sep 17 00:00:00 2001
From: Silvio Rhatto
Date: Sun, 25 Aug 2013 16:08:20 -0300
Subject: Initial import

---
NOTE(review): the scuttle.sh hunk below was revised during review (line
structure restored, invalid "%day=" assignment and dangling "\" removed,
expansions quoted, temp file cleaned up).  The commit id above and the blob
id on the scuttle.sh "index" line predate that revision.

 README.mdwn |  7 +++++++
 scuttle.sh  | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 README.mdwn
 create mode 100644 scuttle.sh

diff --git a/README.mdwn b/README.mdwn
new file mode 100644
index 0000000..b5aff85
--- /dev/null
+++ b/README.mdwn
@@ -0,0 +1,7 @@
+Feed Crawler
+============
+
+Download all links from a feed using httrack.
+
+    curl $URL > urls.txt
+    httrack -r$LEVEL -e%$EXT_LEVEL -m$FILESIZE -Y -%L urls.txt
diff --git a/scuttle.sh b/scuttle.sh
new file mode 100644
index 0000000..d41405f
--- /dev/null
+++ b/scuttle.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+#
+# Mirrors every URL bookmarked in a SemanticScuttle instance using httrack.
+#
+# Database settings are read from the SemanticScuttle config.php; pages are
+# fetched as user "links" into ${MIRRORDIR}/<year>/<month>.
+
+BASEDIR=/var/sites/links
+SCUTTLEDIR=$(basename "$(find "${BASEDIR}" -maxdepth 1 -iname "SemanticScuttle-*" | head -n 1)")
+CONFIGFILE=${BASEDIR}/${SCUTTLEDIR}/data/config.php
+MIRRORDIR=${BASEDIR}/mirrors
+TMPDIR=/tmp
+
+# without the config file there are no credentials to query the db with
+if [[ ! -r "${CONFIGFILE}" ]]; then
+  printf '%s: cannot read %s\n' "${0##*/}" "${CONFIGFILE}" >&2
+  exit 1
+fi
+
+# prints the text between the first pair of single quotes on each
+# line of the config file that matches $1
+getconf() {
+  grep -- "${1}" "${CONFIGFILE}" | sed -e "s/[^']*'//" -e "s/'.*\$//"
+}
+
+dbuser=$(getconf dbuser)
+dbpass=$(getconf dbpass)
+dbname=$(getconf dbname)
+dbhost=$(getconf dbhost)
+
+# runs the SQL statement given in $1 and prints the rows, without headers
+sqlquery() {
+  mysql --skip-column-names --batch \
+    --user="${dbuser}" \
+    --password="${dbpass}" \
+    --database="${dbname}" \
+    --host="${dbhost}" \
+    --execute="${1}"
+}
+
+# grabs URLs from db
+tmpfile=$(mktemp -p "${TMPDIR}") || exit 1
+trap 'rm -f -- "${tmpfile}"' EXIT # do not leave the URL list behind
+chown links:links "${tmpfile}"
+chmod 600 "${tmpfile}"
+sqlquery "select bAddress from sc_bookmarks;" > "${tmpfile}"
+
+# creates target dir; one date call so year and month cannot disagree
+TARGETDIR=${MIRRORDIR}/$(date +%Y/%m)
+sudo -u links mkdir -p "${TARGETDIR}"
+
+# grabs URLs from the network
+httrack --verbose \
+  --user links \
+  --depth=1 \
+  --purge-old=0 \
+  --index \
+  --cookies=1 \
+  --list "${tmpfile}" \
+  --path "${TARGETDIR}"
-- 
cgit v1.2.3