From 2f3b2d3684068987691242da4dbda2f09828a56c Mon Sep 17 00:00:00 2001 From: Silvio Rhatto Date: Sun, 25 Aug 2013 21:52:43 -0300 Subject: Usage and cleaner httrack options --- README.mdwn | 8 ++++++++ config | 5 ++--- lib/httracker/functions | 45 ++++++++++++++++++++++----------------------- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/README.mdwn b/README.mdwn index e9c9d72..3309f60 100644 --- a/README.mdwn +++ b/README.mdwn @@ -4,8 +4,16 @@ Feed Crawler Download all links from a feed using httrack. This is the engine behind the "Cache" feature used by https://links.sarava.org Semantic Scuttle instance. +Usage +----- + +Place this script somewhere and set up a cronjob like this: + +`*/5 * * * * /var/sites/arquivo/httracker/httracker &> /dev/null` + TODO ---- - Include all sites already downloaded by scuttler. - Support for other fetchers like youtube-dl. +- Lockfile support. diff --git a/config b/config index 828cfc1..f692713 100644 --- a/config +++ b/config @@ -4,9 +4,8 @@ FEED="https://links.sarava.org/rss?sort=date_desc&count=100" TMP="/var/sites/arquivo/tmp/httracker" URLS="$TMP/urls-httracker.txt" URLS_SCUTTLER="$TMP/urls-scuttler.txt" -LEVEL="1" -EXT_LEVEL="1" FILESIZE="" USER="arquivo" GROUP="arquivo" -DEPTH="1" +DEPTH="2" +EXT_DEPTH="1" diff --git a/lib/httracker/functions b/lib/httracker/functions index a5144c9..33152b1 100644 --- a/lib/httracker/functions +++ b/lib/httracker/functions @@ -26,19 +26,17 @@ function httracker_get { fi # Get each URL - httrack \ - --mirror \ - --continue \ - --depth=${DEPTH} \ - --near \ - --purge-old=0 \ - --index \ - --cookies=1 \ - --path ${target} \ - -r${LEVEL} ${OPTS} ${url} - #-e%${EXT_LEVEL} \ - #-m$FILESIZE \ - #--verbose + httrack \ + --mirror \ + --continue \ + --depth=${DEPTH} \ + --ext-depth ${EXT_DEPTH} \ + --near \ + --purge-old=0 \ + --index \ + --cookies=1 \ + --path ${target} \ + ${OPTS} ${url} if [ "$?" 
== "0" ]; then # Mark as downloaded @@ -64,16 +62,17 @@ function httracker_get_incremental { fi # Grabs URLs from the network - httrack --verbose \ - --mirror \ - --continue \ - --user links \ - --depth=${DEPTH} \ - --near \ - --purge-old=0 \ - --index \ - --cookies=1 \ - --list ${URLS} \ + httrack \ + --mirror \ + --continue \ + --depth=${DEPTH} \ + --ext-depth ${EXT_DEPTH} \ + --near \ + --purge-old=0 \ + --index \ + --cookies=1 \ + --user links \ + --list ${URLS} \ --path ${target} ${OPTS} } -- cgit v1.2.3