author    Silvio Rhatto <rhatto@riseup.net>  2013-08-25 21:52:43 -0300
committer Silvio Rhatto <rhatto@riseup.net>  2013-08-25 21:52:43 -0300
commit    2f3b2d3684068987691242da4dbda2f09828a56c (patch)
tree      daed8f0d6d16a99e2d8a500606cc5b53484601c7
parent    5ce98951033bc358510026e0cec15463ba59c16e (diff)
Usage and cleaner httrack options
-rw-r--r--  README.mdwn               8
-rw-r--r--  config                    5
-rw-r--r--  lib/httracker/functions  45

3 files changed, 32 insertions(+), 26 deletions(-)
diff --git a/README.mdwn b/README.mdwn
index e9c9d72..3309f60 100644
--- a/README.mdwn
+++ b/README.mdwn
@@ -4,8 +4,16 @@ Feed Crawler
Download all links from a feed using httrack. This is the engine behind the
"Cache" feature used by https://links.sarava.org Semantic Scuttle instance.
+Usage
+-----
+
+Place this script somewhere and set up a cron job like this:
+
+`*/5 * * * * /var/sites/arquivo/httracker/httracker &> /dev/null`
+
TODO
----
- Include all sites already downloaded by scuttler.
- Support for other fetchers like youtube-dl.
+- Lockfile support.
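
The TODO above mentions lockfile support. As a sketch (not part of this commit), the cron entry could wrap the script in flock(1) so that a run is skipped while the previous one still holds the lock; the lock file path here is a hypothetical choice:

    # Skip this run if another instance already holds the lock
    */5 * * * * flock -n /var/lock/httracker.lock /var/sites/arquivo/httracker/httracker &> /dev/null
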
diff --git a/config b/config
index 828cfc1..f692713 100644
--- a/config
+++ b/config
@@ -4,9 +4,8 @@ FEED="https://links.sarava.org/rss?sort=date_desc&count=100"
TMP="/var/sites/arquivo/tmp/httracker"
URLS="$TMP/urls-httracker.txt"
URLS_SCUTTLER="$TMP/urls-scuttler.txt"
-LEVEL="1"
-EXT_LEVEL="1"
FILESIZE=""
USER="arquivo"
GROUP="arquivo"
-DEPTH="1"
+DEPTH="2"
+EXT_DEPTH="1"
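
For reference, after this change the config file reads roughly as follows (reconstructed from the hunk context; any lines before FEED are not shown in the diff and are omitted here):

    FEED="https://links.sarava.org/rss?sort=date_desc&count=100"
    TMP="/var/sites/arquivo/tmp/httracker"
    URLS="$TMP/urls-httracker.txt"
    URLS_SCUTTLER="$TMP/urls-scuttler.txt"
    FILESIZE=""
    USER="arquivo"
    GROUP="arquivo"
    DEPTH="2"
    EXT_DEPTH="1"
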
diff --git a/lib/httracker/functions b/lib/httracker/functions
index a5144c9..33152b1 100644
--- a/lib/httracker/functions
+++ b/lib/httracker/functions
@@ -26,19 +26,17 @@ function httracker_get {
fi
# Get each URL
- httrack \
- --mirror \
- --continue \
- --depth=${DEPTH} \
- --near \
- --purge-old=0 \
- --index \
- --cookies=1 \
- --path ${target} \
- -r${LEVEL} ${OPTS} ${url}
- #-e%${EXT_LEVEL} \
- #-m$FILESIZE \
- #--verbose
+ httrack \
+ --mirror \
+ --continue \
+ --depth=${DEPTH} \
+ --ext-depth ${EXT_DEPTH} \
+ --near \
+ --purge-old=0 \
+ --index \
+ --cookies=1 \
+ --path ${target} \
+ ${OPTS} ${url}
if [ "$?" == "0" ]; then
# Mark as downloaded
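
With the values set in config (DEPTH=2, EXT_DEPTH=1), the rewritten call expands to something like the sketch below; ${target}, ${OPTS} and ${url} are runtime values, so the path and URL shown are placeholders:

    # Mirror one URL two levels deep, following external links one level out
    httrack \
        --mirror \
        --continue \
        --depth=2 \
        --ext-depth 1 \
        --near \
        --purge-old=0 \
        --index \
        --cookies=1 \
        --path /var/sites/arquivo/tmp/httracker/example \
        https://example.org/some-article
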
@@ -64,16 +62,17 @@ function httracker_get_incremental {
fi
# Grabs URLs from the network
- httrack --verbose \
- --mirror \
- --continue \
- --user links \
- --depth=${DEPTH} \
- --near \
- --purge-old=0 \
- --index \
- --cookies=1 \
- --list ${URLS} \
+ httrack \
+ --mirror \
+ --continue \
+ --depth=${DEPTH} \
+ --ext-depth ${EXT_DEPTH} \
+ --near \
+ --purge-old=0 \
+ --index \
+ --cookies=1 \
+ --user links \
+ --list ${URLS} \
--path ${target} ${OPTS}
}
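
Unlike httracker_get, the incremental variant passes no URL on the command line: httrack reads them from the file named by $URLS via --list, one URL per line. A hypothetical list file:

    https://example.org/posts/1
    https://example.org/posts/2
    https://example.net/article.html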