blob: a35f1deba342365718df9a0e0c08c76653ec217e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
|
#!/bin/bash
#
# Misc httracker functions.
#
# Assemble the option string shared by every httrack invocation.
#
# Globals read:    DEPTH, EXT_DEPTH, TARGET
# Globals written: OPTS - one flat, space-separated string with a leading
#                  space; the callers in this file expand it unquoted so it
#                  word-splits into individual arguments.
function httracker_opts {
  local -a pieces=(
    "--mirror"
    "--continue"
    "--depth=${DEPTH}"
    "--ext-depth ${EXT_DEPTH}"
    "--near"
    "--purge-old=0"
    "--index"
    "--cookies=1"
    "--path ${TARGET}"
  )

  # "${pieces[*]}" joins on the first IFS character; pin it to a space
  # for the duration of this function only.
  local IFS=' '
  OPTS=" ${pieces[*]}"
}
# Download one URL in mirror mode into a hash-addressed directory and
# optionally render PDF/PNG snapshots of it.
#
# Arguments:
#   $1 - URL to mirror
# Globals read:
#   MIRRORS, USER, GROUP, DIRNAME,
#   WKHTMLTOPDF, WKHTMLTOIMAGE, WEBKIT2PDF, WEBKIT2PNG ("1" enables a tool)
# Globals written:
#   TARGET (consumed by httracker_opts), OPTS
# Outputs:
#   progress and error messages on stdout
# Returns:
#   0, including when the fetch failed (the failure is reported on stdout
#   and the target directory is removed), as callers have always seen.
function httracker_get {
  local url="$1"
  local hash first second me url_lower url_base

  # printf emits the same bytes `echo` did for an ordinary URL (URL plus a
  # newline), so directories created by earlier runs keep their names.
  # Quoting keeps the shell from splitting or globbing the URL first.
  hash="$(printf '%s\n' "$url" | sha1sum | cut -d ' ' -f 1)"
  first="${hash:0:2}"
  second="${hash:2:2}"

  echo -n "Processing $url..."

  # Set target and make sure it exists.
  # We use two levels of directories used for hashing,
  # to prevent too many things ending up in any one directory.
  # See https://git-annex.branchable.com/internals/
  TARGET="$MIRRORS/$first/$second/$hash"
  mkdir -p "$TARGET"

  # We already got this one
  if [ -e "$TARGET/httracker-ok" ]; then
    echo " skipping as it's already downloaded as $hash..."
    return
  else
    echo ""
  fi

  # Basic options
  httracker_opts

  # Additional options
  me="$(whoami)"
  if [ "$me" == "root" ]; then
    OPTS="$OPTS --user $USER"
  fi

  # Fix permissions
  if [ "$me" != "$USER" ] && [ "$me" == "root" ]; then
    echo "Fixing $TARGET permissions..."
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  # Fetch. OPTS is a flat option string and is split on purpose; the URL
  # must stay a single argument ('?', '*' and '&' are common in URLs).
  # shellcheck disable=SC2086
  if httrack ${OPTS} "$url"; then
    # Mark as downloaded
    date +%s > "$TARGET/httracker-ok"
  else
    echo "Error fetching $url."
    rm -rf -- "$TARGET"
    # The target is gone: there is nothing left to symlink, screenshot
    # or chown, so stop here instead of running tools against it.
    echo ""
    return 0
  fi

  # Add PDF symlink if needed
  url_lower="$(echo "$url" | tr '[:upper:]' '[:lower:]')"
  url_base="$(basename "$url")"
  if [ "$(basename "$url_lower")" != "$(basename "$url_lower" .pdf)" ]; then
    ( cd "$TARGET" && find . -iname '*.pdf' -exec ln -s {} "$url_base" \; )
  fi

  # Save as PDF
  if [ "$WKHTMLTOPDF" == "1" ] && command -v xvfb-run &> /dev/null && command -v wkhtmltopdf &> /dev/null; then
    xvfb-run -a wkhtmltopdf "$url" "$TARGET/screenshot.pdf"
  fi

  # Save as PNG
  if [ "$WKHTMLTOIMAGE" == "1" ] && command -v xvfb-run &> /dev/null && command -v wkhtmltoimage &> /dev/null; then
    xvfb-run -a wkhtmltoimage "$url" "$TARGET/screenshot.png"
  fi

  # Fallback PDF renderer, only when no screenshot.pdf exists yet.
  # (-e, not -d: the screenshot is a regular file, never a directory.)
  # See https://www.insecure.ws/linux/serverless_screenshot.html
  # http://gfdsa.gfdsa.org/2012/08/15/making-web-pages-screenshots-with-webkit2png-flash-included/
  if [ ! -e "$TARGET/screenshot.pdf" ]; then
    if [ "$WEBKIT2PDF" == "1" ] && command -v xvfb-run &> /dev/null && command -v webkit2pdf &> /dev/null; then
      ( cd "$TARGET" && xvfb-run -a webkit2pdf "$url" && mv 0000.pdf screenshot.pdf )
    fi
  fi

  # Fallback PNG renderer, only when no screenshot.png exists yet.
  # https://github.com/paulhammond/webkit2png/
  # https://github.com/adamn/python-webkit2png
  # https://snippets.aktagon.com/snippets/504-how-to-generate-screenshots-on-debian-linux-with-python-webkit2png
  if [ ! -e "$TARGET/screenshot.png" ]; then
    if [ "$WEBKIT2PNG" == "1" ] && command -v xvfb-run &> /dev/null; then
      xvfb-run -a "$DIRNAME/webkit2png/webkit2png/webkit2png.py" -o "$TARGET/screenshot.png" "$url"
    fi
  fi

  # Fix permissions again
  if [ "$me" != "$USER" ] && [ "$me" == "root" ]; then
    echo "Fixing $TARGET permissions..."
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  # Done here
  echo ""
}
# Download the URL list in incremental mode into a per-month directory.
#
# Globals read:
#   MIRRORDIR, USER, GROUP, URLS (file handed to httrack via --list)
# Globals written:
#   year, month, day - left global, as before, in case callers read them
#   TARGET (consumed by httracker_opts), OPTS
# Returns:
#   status of the final permission fix (0 when none is needed)
function httracker_get_incremental {
  # Create TARGET dir
  year="$(date +%Y)"
  month="$(date +%m)"
  day="$(date +%d)"
  TARGET="${MIRRORDIR}/${year}/${month}"
  sudo -u "${USER}" mkdir -p "${TARGET}"

  # Basic options
  httracker_opts

  # Additional options: append to the common options. Assigning OPTS
  # outright here would drop --mirror, --path and the rest when run as root.
  if [ "$(whoami)" == "root" ]; then
    OPTS="$OPTS --user $USER"
  fi

  # Grab URLs from the network. OPTS is a flat option string and is split
  # on purpose; the list file stays a single argument.
  # shellcheck disable=SC2086
  httrack ${OPTS} --list "${URLS}"

  # Fix permissions
  if [ "$(whoami)" != "$USER" ] && [ "$(whoami)" == "root" ]; then
    chown -R "$USER:$GROUP" "$TARGET/"
  fi
}
# Get SemanticScuttle parameter.
#
# For every line of CONFIGFILE matching the pattern, print the text found
# between the first and the second single quote on that line (a line
# without single quotes is printed unchanged).
#
# Arguments:
#   $1 - grep pattern selecting the configuration line(s)
# Globals read:
#   CONFIGFILE
# Outputs:
#   the extracted value(s) on stdout, one per matching line
function httracker_scuttle_config {
  # -e / -- keep a pattern or path starting with '-' from being parsed as
  # an option; quoting keeps both from being split or globbed.
  grep -e "${1}" -- "${CONFIGFILE}" \
    | sed -e "s/[^']*'//" -e "s/'.*\$//"
}
# Query a mysql database.
#
# Arguments:
#   $1 - SQL statement to execute
# Globals read:
#   DBUSER, DBPASS, DBNAME, DBHOST
# Outputs:
#   mysql's batch output, without column names, on stdout
# Returns:
#   mysql's exit status
#
# NOTE(review): the password is passed on the command line and is thus
# visible in the process list while mysql runs; consider an option file.
function httracker_sqlquery {
  # Every value is quoted so that credentials containing spaces or glob
  # characters reach mysql as a single, unmodified argument.
  mysql --skip-column-names --batch \
    --user="${DBUSER}" \
    --password="${DBPASS}" \
    --database="${DBNAME}" \
    --host="${DBHOST}" \
    --execute="${1}"
}
# Iterate over all URLs listed in the URLS file and mirror each one.
#
# The file is tokenised on whitespace, so several URLs per line are still
# accepted, but tokens are taken literally: no glob expansion and no
# quote/backslash processing, so URLs containing ?, *, [ or ' survive.
#
# Globals read:
#   URLS - path of the URL list
#   BASE - installation directory holding lib/httracker/html.sed
# Outputs:
#   one "Processing item I from T total..." line per URL, where T is the
#   number of URLs actually found in the file
function httracker_iterate {
  local i=1
  local total link
  local -a links=()

  # read -d '' slurps the whole file and returns non-zero at end of file
  # even though the array has been filled, hence the "|| true".
  IFS=$' \t\n' read -r -d '' -a links < "$URLS" || true
  total="${#links[@]}"

  for link in "${links[@]}"; do
    # Fix entities
    link="$(printf '%s\n' "$link" | sed -f "$BASE/lib/httracker/html.sed")"

    echo "Processing item $i from $total total..."
    httracker_get "$link"
    i=$((i + 1))
  done
}
# Create basic folders and, when running as root, hand them to USER:GROUP.
#
# Globals read:
#   MIRRORS, TMP - directories to create
#   USER, GROUP  - owner applied when the caller is root
function httracker_setup_folders {
  # Quoted so that paths containing spaces are created as given.
  mkdir -p "$MIRRORS" "$TMP"

  if [ "$(whoami)" == "root" ]; then
    echo "Fixing $MIRRORS permissions..."
    # ':' is the standard owner/group separator; '.' is a legacy form that
    # is ambiguous for user names containing a dot.
    chown "$USER:$GROUP" "$MIRRORS"
    chown "$USER:$GROUP" "$TMP"
  fi
}
# Set basic environment: load the configuration, verify httrack is
# installed, create the working folders and take the lockfile.
#
# Globals written:
#   BASE     - directory of the running script ($0)
#   LOCKFILE - ${TMP}/<script name>.lock
#   plus whatever config.default and config define
# Exits with status 1 when a configuration file fails to load or httrack
# is not installed; the lockfile helpers may exit as well.
function httracker_initialize {
  # Quoted (and guarded with --) so that an install path containing
  # spaces or a $0 starting with '-' still resolves.
  BASE="$(dirname -- "$0")"
  source "$BASE/config.default" || exit 1

  # Load custom config
  if [ -e "$BASE/config" ]; then
    source "$BASE/config" || exit 1
  fi

  # Check if httrack is available
  if ! command -v httrack &> /dev/null; then
    echo "error: httrack not available" >&2
    exit 1
  fi

  # Create folders
  httracker_setup_folders

  # Lockfile
  LOCKFILE="${TMP}/$(basename -- "$0").lock"
  httracker_check_lockfile
  httracker_set_lockfile

  # Logfile
  #LOG="${TMP}/`basename $0`.log"
}
# Cleanup environment: release the lockfile and delete the URL list.
#
# Globals read:
#   URLS - path removed recursively; nothing is removed when it is empty
function httracker_teardown {
  httracker_unset_lockfile

  # Quoted and guarded: an unquoted expansion would split a path with
  # spaces into several unrelated rm -rf targets.
  if [ -n "$URLS" ]; then
    rm -rf -- "$URLS"
  fi
}
# Create lockfile holding this shell's PID and arrange for its removal.
#
# Does nothing when LOCKFILE is empty. Exits with status 1 when the
# lockfile already exists or cannot be written.
#
# Globals read:
#   LOCKFILE
function httracker_set_lockfile {
  if [ -n "$LOCKFILE" ]; then
    mkdir -p "$(dirname -- "$LOCKFILE")"

    # noclobber makes the redirection fail if the file already exists, so
    # testing for the lock and creating it happen in one step.
    if ( set -o noclobber; echo "$$" > "$LOCKFILE" ) &> /dev/null; then
      # Cleanup runs from the EXIT trap. INT/TERM must terminate the
      # script (which then fires EXIT): a handler that merely removes the
      # lock would let the script carry on running without holding it.
      trap 'httracker_unset_lockfile' EXIT
      trap 'exit 130' INT
      trap 'exit 143' TERM
    else
      echo "Could not create lockfile $LOCKFILE, exiting" >&2
      exit 1
    fi
  fi
}
# Remove lockfile; does nothing when LOCKFILE is empty.
#
# Globals read:
#   LOCKFILE
# Outputs:
#   a message on stdout when the file cannot be removed
function httracker_unset_lockfile {
  if [ -n "$LOCKFILE" ]; then
    # Quoted so that a path with spaces removes the lock itself rather
    # than whatever files its fragments happen to name.
    rm -f -- "$LOCKFILE" || echo "Could not remove lockfile $LOCKFILE"
  fi
}
# Check lockfile: stop if another instance holds it, otherwise clear it.
#
# The lock counts as held when the PID stored in LOCKFILE belongs to a
# live process whose command name equals this shell's own command name.
# In that case the script exits with status 0; any other existing
# lockfile is treated as stale and removed.
#
# Globals read:
#   LOCKFILE
function httracker_check_lockfile {
  local pid process

  if [ -n "$LOCKFILE" ] && [ -f "$LOCKFILE" ]; then
    pid="$(cat "$LOCKFILE")"

    # Only a purely numeric PID is handed to ps; an empty or corrupt
    # lockfile falls straight through to the stale branch.
    if [[ "$pid" =~ ^[0-9]+$ ]] \
      && process="$(ps -o comm= -p "$pid")" \
      && [ "$(ps -o comm= -p "$$")" == "$process" ]; then
      echo "Another program is running for $LOCKFILE, skipping run"
      exit 0
    else
      echo "Found old lockfile $LOCKFILE, removing it"
      httracker_unset_lockfile
    fi
  fi
}
# Initialize. This call sits at file level, so it runs as soon as this file
# is sourced or executed: the configuration is loaded, the folders are
# created and the lockfile is taken (see httracker_initialize).
httracker_initialize
|