#!/bin/bash
#
# HTTrack feed downloader: mirror every link found in an RSS feed.
#

# Propagate failures through pipelines so a failed curl below is not
# masked by the grep stages that follow it.
set -o pipefail

# Configuration
MIRRORS="/var/cache/sites/arquivo/conteudo/links.sarava.org/assets"
FEED="https://links.sarava.org/rss?sort=date_desc&count=50"
TMP="/tmp/httracker"
URLS="$TMP/urls.txt"
LEVEL="1"
EXT_LEVEL="1"
FILESIZE=""
USER="arquivo"
GROUP="arquivo"

# Fetch one URL with httrack, skipping URLs already marked "ok".
function httracker_get {
  local url="$1"
  local hash="$(echo "$url" | sha1sum | cut -d ' ' -f 1)"
  local target="$MIRRORS/$hash"

  mkdir -p "$target"

  # We already got this one
  if [ -f "$target/ok" ]; then
    return
  fi

  # Get the URL; --depth and --ext-depth bound recursion inside and
  # outside the site (a "-m$FILESIZE" size cap could be appended here,
  # but is left disabled since FILESIZE is empty)
  httrack --verbose \
    --user "$USER" \
    --depth=$LEVEL \
    --ext-depth=$EXT_LEVEL \
    --purge-old=0 \
    --index \
    --cookies=1 \
    --path "$target" \
    "$url"

  if [ "$?" == "0" ]; then
    # Mark as downloaded
    touch "$target/ok"
    chown -R "$USER:$GROUP" "$target/"
  else
    echo "Error fetching $url."
    rm -rf "$target"
  fi
}
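
# Example (hypothetical URL): httracker_get "https://example.org/post"
# would leave the mirror and httrack's top index under
# $MIRRORS/<sha1 of the URL>/ and touch the "ok" marker on success.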

# Create folders
mkdir -p "$MIRRORS" "$TMP"

# Get the feed and extract the <link> element of every item
# Thanks http://stackoverflow.com/questions/443991/how-to-parse-rss-feeds-xml-in-a-shell-script
curl --silent --fail "$FEED" | grep -o '<link>[^<]*' | grep -o '[^>]*$' > "$URLS"
if [ "$?" != "0" ]; then
  echo "Error downloading feed $FEED, aborting."
  exit 1
fi
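
# A minimal illustration of the extraction above, assuming feed items
# of the usual RSS shape (URL is made up):
#
#   <item><title>...</title><link>https://example.org/post</link></item>
#
# The first grep keeps "<link>https://example.org/post" and the second
# keeps everything after the ">", leaving one bare URL per line.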

# Iterate over all URLs
for link in $(cat "$URLS"); do
  httracker_get "$link"
done

# Cleanup
rm -rf "$TMP"
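
# This script is meant to run unattended; a cron entry such as the
# following (path and schedule are illustrative assumptions) would
# refresh the mirror every six hours:
#
#   0 */6 * * * root /usr/local/bin/httracker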