blob: 5c45f9b10d9c906abd0d8be887b0b067febeff50 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
#!/bin/bash
#
# Miscellaneous httracker helper functions.
#
# Set common httrack options
#
# Globals read:    DEPTH, EXT_DEPTH, TARGET
# Globals written: OPTS - one string; every option is preceded by a single
#                  space. Callers expand it unquoted so it splits into words.
function httracker_opts {
  # The ' %s' format is re-applied to each argument, which yields the
  # space-prefixed, space-separated option string in one assignment.
  printf -v OPTS ' %s' \
    --mirror \
    --continue \
    "--depth=${DEPTH}" \
    --ext-depth "${EXT_DEPTH}" \
    --near \
    --purge-old=0 \
    --index \
    --cookies=1 \
    --path "${TARGET}"
}
# Download URLs, mirror mode
#
# Mirrors one URL into $MIRRORS/<sha1 of the URL>. On success a
# "httracker-ok" marker file is created there so later calls skip the URL;
# on failure the target directory is removed.
#
# Arguments:       $1 - URL to mirror
# Globals read:    MIRRORS, USER, GROUP, LOG
# Globals written: TARGET, OPTS (via httracker_opts)
function httracker_get {
  # Options
  local url="$1"
  local hash me status

  # Emits the same bytes as `echo $1` for ordinary URLs, so directory names
  # of existing mirrors stay valid.
  hash="$(printf '%s\n' "$url" | sha1sum | cut -d ' ' -f 1)"
  me="$(whoami)"

  echo -n "Processing $url..."

  # Set target and make sure it exists
  TARGET="$MIRRORS/$hash"
  mkdir -p -- "$TARGET"

  # We already got this one
  if [ -e "$TARGET/httracker-ok" ]; then
    echo " skipping as it's already downloaded as $hash..."
    return
  else
    echo -e "\n"
  fi

  # Basic options
  httracker_opts

  # Additional options
  if [ "$me" == "root" ]; then
    OPTS="$OPTS --user $USER"
  fi

  # Fix permissions
  if [ "$me" != "$USER" ] && [ "$me" == "root" ]; then
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  # Get each URL. OPTS is a space-separated option string and is split into
  # words on purpose; the URL is quoted so ? and * in it are never globbed.
  httrack ${OPTS} "$url" | tee "$LOG"
  # Keep httrack's own status: $? would be tee's, and any later command
  # (such as the chown below) would overwrite it.
  status="${PIPESTATUS[0]}"

  # Fix permissions again
  if [ "$me" != "$USER" ] && [ "$me" == "root" ]; then
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  if [ "$status" == "0" ]; then
    # Mark as downloaded
    touch "$TARGET/httracker-ok"
  else
    echo "Error fetching $url."
    rm -rf -- "$TARGET"
  fi
}
# Download URLs, incremental mode
#
# Runs httrack once over the URL list file $URLS, storing the result under
# $MIRRORDIR/<year>/<month>.
#
# Globals read:    MIRRORDIR, USER, GROUP, URLS, LOG
# Globals written: TARGET, OPTS (via httracker_opts), year, month, day
# NOTE(review): this function uses MIRRORDIR while the rest of the file uses
# MIRRORS - confirm the config defines both.
function httracker_get_incremental {
  local me
  me="$(whoami)"

  # Create TARGET dir. One date call, so the parts cannot straddle midnight.
  # year/month/day stay global, as before, in case other code reads them.
  IFS=' ' read -r year month day <<<"$(date '+%Y %m %d')"
  TARGET="${MIRRORDIR}/${year}/${month}"
  sudo -u "${USER}" mkdir -p "${TARGET}"

  # Basic options
  httracker_opts

  # Additional options: append, so the basic options (including --path)
  # set just above are kept.
  if [ "$me" == "root" ]; then
    OPTS="$OPTS --user $USER"
  fi

  # Grab URLs from the network. OPTS is split into words on purpose.
  httrack ${OPTS} --list "${URLS}" | tee "$LOG"

  # Fix permissions again
  if [ "$me" != "$USER" ] && [ "$me" == "root" ]; then
    chown -R "$USER:$GROUP" "$TARGET/"
  fi
}
# Get SemanticScuttle parameter
#
# For every line of $CONFIGFILE that matches the grep pattern $1, prints the
# text between the first single quote on the line and the next one.
#
# Arguments:    $1 - grep pattern selecting the config line(s)
# Globals read: CONFIGFILE
# Returns:      1, printing nothing on stdout, when $1 is empty
function httracker_scuttle_config {
  # An empty pattern would match every line of the config file.
  if [ -z "${1:-}" ]; then
    echo "httracker_scuttle_config: missing parameter name" >&2
    return 1
  fi

  # First expression drops everything up to and including the first quote,
  # the second drops the closing quote and whatever follows it.
  grep -- "$1" "$CONFIGFILE" | sed -e "s/[^']*'//" -e "s/'.*\$//"
}
# Query a mysql database
#
# Runs one statement through the mysql client in batch mode without column
# headers; the client's output goes to stdout.
#
# Arguments:    $1 - SQL statement to execute
# Globals read: DBUSER, DBPASS, DBNAME, DBHOST
function httracker_sqlquery {
  # Every value is quoted so credentials containing spaces or glob
  # characters reach mysql as a single, unmodified argument.
  # NOTE(review): the password is passed on the command line, where other
  # local users may see it in process listings - consider an option file.
  mysql --skip-column-names --batch \
    --user="${DBUSER}" \
    --password="${DBPASS}" \
    --database="${DBNAME}" \
    --host="${DBHOST}" \
    --execute="${1}"
}
# Iterate over all URLs
#
# Calls httracker_get for each whitespace-separated URL found in the file
# $URLS, after filtering it through $BASE/lib/httracker/html.sed.
#
# Globals read: URLS, BASE
function httracker_iterate {
  local line link
  local -a links

  # The list is read on fd 3 so that nothing run inside the loop can consume
  # it through stdin.
  while IFS= read -r -u 3 line || [ -n "$line" ]; do
    # Split on whitespace only: no pathname expansion (URLs often contain
    # ? or *) and no quote or backslash interpretation.
    IFS=$' \t\n' read -r -a links <<<"$line"
    for link in "${links[@]}"; do
      # Fix entities
      link="$(printf '%s\n' "$link" | sed -f "$BASE/lib/httracker/html.sed")"
      httracker_get "$link"
    done
  done 3<"$URLS"
}
# Create basic folders
#
# Creates $MIRRORS and $TMP and, when running as root, hands both over to
# $USER:$GROUP.
#
# Globals read: MIRRORS, TMP, USER, GROUP
function httracker_setup_folders {
  mkdir -p -- "$MIRRORS" "$TMP"

  if [ "$(whoami)" == "root" ]; then
    # ':' is the standard owner/group separator; '.' is a legacy form that
    # is ambiguous for user names containing a dot.
    chown -R "$USER:$GROUP" "$MIRRORS"
    chown -R "$USER:$GROUP" "$TMP"
  fi
}
# Set basic environment
#
# Loads the "config" file that sits next to the running script ($0),
# creates the working folders, then checks for and takes the lockfile.
# Exits with status 1 if the config file cannot be sourced.
#
# Globals read:    TMP (expected to be set by the sourced config)
# Globals written: BASE, LOCKFILE, LOG
function httracker_initialize {
  # Quoted throughout so a script path containing spaces still resolves.
  BASE="$(dirname -- "$0")"
  source "$BASE/config" || exit 1

  # Create folders
  httracker_setup_folders

  # Lockfile
  LOCKFILE="${TMP}/$(basename -- "$0").lock"
  httracker_check_lockfile
  httracker_set_lockfile

  # Logfile
  LOG="${TMP}/$(basename -- "$0").log"
}
# Cleanup environment
#
# Releases the lockfile and deletes the URL list at $URLS.
#
# Globals read: URLS
function httracker_teardown {
  httracker_unset_lockfile

  # Quoted and terminated with -- so exactly the path held in URLS is
  # removed, even if it contains spaces or glob characters.
  if [ -n "${URLS:-}" ]; then
    rm -rf -- "$URLS"
  fi
}
# Create lockfile
#
# Writes this shell's PID ($$) to $LOCKFILE, refusing to overwrite an
# existing file, and installs traps that remove the lock when the script
# ends. Does nothing when LOCKFILE is empty.
#
# Globals read: LOCKFILE
# Exits:        1 if the lockfile cannot be created
function httracker_set_lockfile {
  if [ -n "$LOCKFILE" ]; then
    mkdir -p -- "$(dirname -- "$LOCKFILE")"

    # With noclobber the redirection fails if the file already exists, so
    # testing for the lock and creating it happen in one step.
    if ( set -o noclobber; echo "$$" > "$LOCKFILE" ) &> /dev/null; then
      # Cleanup lives in the EXIT trap only. INT and TERM become a real exit
      # (128 + signal number) so an interrupted run stops instead of carrying
      # on without its lock; the EXIT trap then removes the file.
      trap 'httracker_unset_lockfile' EXIT
      trap 'exit 130' INT
      trap 'exit 143' TERM
    else
      echo "Could not create lockfile $LOCKFILE, exiting"
      exit 1
    fi
  fi
}
# Remove lockfile
#
# Deletes $LOCKFILE and prints a message if rm reports a failure. Does
# nothing when LOCKFILE is empty.
#
# Globals read: LOCKFILE
function httracker_unset_lockfile {
  if [ -n "$LOCKFILE" ]; then
    # Quoted so a path with spaces removes the lock itself rather than
    # whatever its individual words happen to name.
    rm -f -- "$LOCKFILE" || echo "Could not remove lockfile $LOCKFILE"
  fi
}
# Check lockfile
#
# If $LOCKFILE exists and the PID stored in it belongs to a live process
# whose command name equals this shell's, prints a notice and exits the
# script. Otherwise the lockfile is reported as old and removed.
#
# Globals read: LOCKFILE
function httracker_check_lockfile {
  local pid process

  if [ -n "$LOCKFILE" ] && [ -f "$LOCKFILE" ]; then
    pid="$(cat -- "$LOCKFILE")"

    # Only hand ps something that looks like a PID: an empty or corrupted
    # lockfile is treated as stale instead of making ps list other processes.
    if [[ "$pid" =~ ^[0-9]+$ ]] \
      && process="$(ps --no-headers -o comm "$pid")" \
      && [ "$(ps --no-headers -o comm $$)" == "$process" ]; then
      echo "Another program is running for $LOCKFILE, skipping run"
      exit
    else
      echo "Found old lockfile $LOCKFILE, removing it"
      httracker_unset_lockfile
    fi
  fi
}
# Initialize
#
# Runs as soon as this file is executed or sourced: httracker_initialize
# sources the config, creates the working folders, and checks/takes the
# lockfile.
httracker_initialize
|