aboutsummaryrefslogtreecommitdiff
path: root/lib/httracker/functions
blob: a35f1deba342365718df9a0e0c08c76653ec217e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#!/bin/bash
#
# Misc httracker functions.
#

# Build the shared httrack option list into the global OPTS string.
# Reads DEPTH, EXT_DEPTH and TARGET; callers expand OPTS unquoted,
# so it must remain a plain whitespace-separated word list.
function httracker_opts {
  OPTS="--mirror"
  OPTS="$OPTS --continue"
  OPTS="$OPTS --depth=${DEPTH}"
  OPTS="$OPTS --ext-depth ${EXT_DEPTH}"
  OPTS="$OPTS --near"
  OPTS="$OPTS --purge-old=0"
  OPTS="$OPTS --index"
  OPTS="$OPTS --cookies=1"
  OPTS="$OPTS --path ${TARGET}"
}

# Download a single URL in mirror mode.
#
# Arguments: $1 - URL to fetch
# Globals:   MIRRORS, USER, GROUP, WKHTMLTOPDF, WKHTMLTOIMAGE,
#            WEBKIT2PDF, WEBKIT2PNG, DIRNAME (read)
#            TARGET, OPTS (written)
# Side effects: populates $MIRRORS/<xx>/<yy>/<sha1>/ with the mirror,
#   optional PDF/PNG snapshots, and an "httracker-ok" marker on success.
function httracker_get {
  local url="$1"
  local hash first second
  hash="$(echo "$url" | sha1sum | cut -d ' ' -f 1)"
  first="$(echo "$hash" | cut -c 1-2)"
  second="$(echo "$hash" | cut -c 3-4)"

  echo -n "Processing $url..."

  # Set target and make sure it exists.
  # Two levels of hashed directories keep any single directory from
  # accumulating too many entries.
  # See https://git-annex.branchable.com/internals/
  TARGET="$MIRRORS/$first/$second/$hash"
  mkdir -p "$TARGET"

  # We already got this one
  if [ -e "$TARGET/httracker-ok" ]; then
    echo " skipping as it's already downloaded as $hash..."
    return
  else
    echo ""
  fi

  # Basic options (sets the global OPTS)
  httracker_opts

  # When running as root, tell httrack to drop privileges to $USER
  if [ "$(whoami)" == "root" ]; then
    OPTS="$OPTS --user $USER"
  fi

  # Fix permissions left over from previous partial runs
  if [ "$(whoami)" != "$USER" ] && [ "$(whoami)" == "root" ]; then
    echo "Fixing $TARGET permissions..."
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  # Fetch the URL. OPTS is expanded unquoted on purpose: it holds a
  # whitespace-separated option list built by httracker_opts.
  if httrack ${OPTS} "$url"; then
    # Mark as downloaded
    date +%s > "$TARGET/httracker-ok"
  else
    echo "Error fetching $url."
    rm -rf "$TARGET"
    # The target is gone; the snapshot steps below would only write
    # into a removed directory (the original fell through here).
    return 1
  fi

  # If the URL itself points at a PDF, symlink the mirrored copy
  # under the URL's basename so it keeps its original file name
  local url_lower url_base
  url_lower="$(echo "$url" | tr '[:upper:]' '[:lower:]')"
  url_base="$(basename "$url")"
  if [ "$(basename "$url_lower")" != "$(basename "$url_lower" .pdf)" ]; then
    ( cd "$TARGET" && find . -iname '*.pdf' -exec ln -s {} "$url_base" \; )
  fi

  # Save as PDF
  if [ "$WKHTMLTOPDF" == "1" ] && command -v xvfb-run &> /dev/null && command -v wkhtmltopdf &> /dev/null; then
    xvfb-run -a wkhtmltopdf "$url" "$TARGET/screenshot.pdf"
  fi

  # Save as PNG
  if [ "$WKHTMLTOIMAGE" == "1" ] && command -v xvfb-run &> /dev/null && command -v wkhtmltoimage &> /dev/null; then
    xvfb-run -a wkhtmltoimage "$url" "$TARGET/screenshot.png"
  fi

  # Fallback PDF via webkit2pdf. The original guard used -d (is a
  # directory), which is never true for a regular file, so the tool
  # re-ran even when the snapshot already existed — test existence.
  # See https://www.insecure.ws/linux/serverless_screenshot.html
  #     http://gfdsa.gfdsa.org/2012/08/15/making-web-pages-screenshots-with-webkit2png-flash-included/
  if [ ! -e "$TARGET/screenshot.pdf" ]; then
    if [ "$WEBKIT2PDF" == "1" ] && command -v xvfb-run &> /dev/null && command -v webkit2pdf &> /dev/null; then
      ( cd "$TARGET" && xvfb-run -a webkit2pdf "$url" && mv 0000.pdf screenshot.pdf )
    fi
  fi

  # Fallback PNG screenshot (same -d → -e fix as above)
  # https://github.com/paulhammond/webkit2png/
  # https://github.com/adamn/python-webkit2png
  # https://snippets.aktagon.com/snippets/504-how-to-generate-screenshots-on-debian-linux-with-python-webkit2png
  if [ ! -e "$TARGET/screenshot.png" ]; then
    if [ "$WEBKIT2PNG" == "1" ] && command -v xvfb-run &> /dev/null; then
      xvfb-run -a "$DIRNAME/webkit2png/webkit2png/webkit2png.py" -o "$TARGET/screenshot.png" "$url"
    fi
  fi

  # Fix permissions again: the snapshot tools may have run as root
  if [ "$(whoami)" != "$USER" ] && [ "$(whoami)" == "root" ]; then
    echo "Fixing $TARGET permissions..."
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  # Done here
  echo ""
}

# Download the URL list into a date-based directory tree
# ($MIRRORDIR/YYYY/MM), resuming previous runs.
# Globals: MIRRORDIR, USER, GROUP, URLS (read); TARGET, OPTS (written)
function httracker_get_incremental {
  # Create TARGET dir (one tree per year/month)
  local year month
  year="$(date +%Y)"
  month="$(date +%m)"
  TARGET="${MIRRORDIR}/${year}/${month}"
  sudo -u "${USER}" mkdir -p "${TARGET}"

  # Basic options (sets the global OPTS)
  httracker_opts

  # When running as root, tell httrack to drop privileges to $USER.
  # NOTE: the original assigned OPTS="--user $USER" here, silently
  # discarding every option built by httracker_opts — append instead,
  # matching httracker_get.
  if [ "$(whoami)" == "root" ]; then
    OPTS="$OPTS --user $USER"
  fi

  # Grab URLs from the network; OPTS is an unquoted word list
  httrack ${OPTS} --list "${URLS}"

  # Hand the mirror tree back to the service user
  if [ "$(whoami)" != "$USER" ] && [ "$(whoami)" == "root" ]; then
    chown -R "$USER:$GROUP" "$TARGET/"
  fi
}

# Fetch a SemanticScuttle configuration value.
# $1 names the parameter; prints whatever sits between the first pair
# of single quotes on the matching line of $CONFIGFILE.
function httracker_scuttle_config {
  # Strip everything up to and including the opening quote, then
  # everything from the closing quote to the end of the line.
  local strip_prefix="s/[^']*'//"
  local strip_suffix="s/'.*\$//"
  grep ${1} ${CONFIGFILE} | sed -e "$strip_prefix" -e "$strip_suffix"
}

# Run a single SQL statement and print the raw result rows
# (tab-separated, no headers).
# Globals:   DBUSER, DBPASS, DBNAME, DBHOST (read)
# Arguments: $1 - SQL statement to execute
# NOTE(review): --password on the command line is visible in `ps`;
# prefer ~/.my.cnf or MYSQL_PWD if the deployment allows it.
function httracker_sqlquery {
  mysql --skip-column-names --batch \
        --user="${DBUSER}"          \
        --password="${DBPASS}"      \
        --database="${DBNAME}"      \
        --host="${DBHOST}"          \
        --execute="${1}"
}

# Walk every URL in $URLS (one per line) and mirror each one with
# httracker_get, printing progress as it goes.
# Globals: URLS, BASE (read)
function httracker_iterate {
  local i=1
  local total link
  total="$(wc -l < "$URLS")"

  # Read line by line instead of the original word-splitting
  # `for link in $(cat $URLS | xargs)`, which also glob-expanded
  # any wildcard characters in a URL. read -r keeps backslashes.
  while IFS= read -r link; do
    [ -n "$link" ] || continue

    # Decode HTML entities left over in the exported URL list
    link="$(echo "$link" | sed -f "$BASE/lib/httracker/html.sed")"

    echo "Processing item $i from $total total..."
    httracker_get "$link"
    i=$((i + 1))
  done < "$URLS"
}

# Create the mirror and tmp directories, handing them over to the
# service user when running as root.
# Globals: MIRRORS, TMP, USER, GROUP (read)
function httracker_setup_folders {
  mkdir -p "$MIRRORS" "$TMP"

  if [ "$(whoami)" == "root" ]; then
    echo "Fixing $MIRRORS permissions..."
    # chown OWNER:GROUP — the historical OWNER.GROUP form is deprecated
    chown "$USER:$GROUP" "$MIRRORS"
    chown "$USER:$GROUP" "$TMP"
  fi
}

# Load configuration, verify dependencies and set up the runtime
# environment (folders + lockfile). Exits on any fatal problem.
# Globals: BASE, LOCKFILE (written); TMP (read, defined by the configs)
function httracker_initialize {
  BASE="$(dirname "$0")"
  source "$BASE/config.default" || exit 1

  # Load custom config (optional local overrides)
  if [ -e "$BASE/config" ]; then
    source "$BASE/config" || exit 1
  fi

  # httrack is the one hard requirement; command -v is the portable
  # replacement for `which` (and the original message had a typo)
  if ! command -v httrack &> /dev/null; then
    echo "error: httrack not available" >&2
    exit 1
  fi

  # Create folders
  httracker_setup_folders

  # Guard against concurrent runs of the same script
  LOCKFILE="${TMP}/$(basename "$0").lock"
  httracker_check_lockfile
  httracker_set_lockfile

  # Logfile
  #LOG="${TMP}/$(basename "$0").log"
}

# Release the lockfile and remove the temporary URL list.
# Globals: URLS (read)
function httracker_teardown {
  httracker_unset_lockfile
  # Quote and guard: an unset/empty URLS must not turn this into a
  # bare `rm -rf`, and `--` protects odd leading-dash names
  if [ -n "$URLS" ]; then
    rm -rf -- "$URLS"
  fi
}

# Atomically create $LOCKFILE containing our PID. With noclobber set,
# the redirect fails if the file already exists, so creation doubles
# as the mutual-exclusion test. Exits if the lock is already taken.
# Globals: LOCKFILE (read)
function httracker_set_lockfile {
  if [ ! -z "$LOCKFILE" ]; then
    mkdir -p "$(dirname "$LOCKFILE")"
    if ( set -o noclobber; echo "$$" > "$LOCKFILE" ) &> /dev/null; then
      # Make sure the lock is released on any exit path
      trap 'httracker_unset_lockfile' INT TERM EXIT
    else
      echo "Could not create lockfile $LOCKFILE, exiting"
      exit 1
    fi
  fi
}

# Remove the lockfile created by httracker_set_lockfile, if any.
# Globals: LOCKFILE (read)
function httracker_unset_lockfile {
  if [ ! -z "$LOCKFILE" ]; then
    rm -f -- "$LOCKFILE" || echo "Could not remove lockfile $LOCKFILE"
  fi
}

# Check for a stale or active lockfile before taking the lock.
# If the PID stored in $LOCKFILE belongs to a live process with the
# same command name as us, another instance is running and we bail
# out; otherwise the lockfile is considered stale and removed.
# Globals: LOCKFILE (read)
function httracker_check_lockfile {
  local pid process

  if [ ! -z "$LOCKFILE" ] && [ -f "$LOCKFILE" ]; then
    pid="`cat $LOCKFILE`"
    process="`ps --no-headers -o comm $pid`"
    # $? here is the exit status of the `ps` command substitution
    # (a plain assignment preserves it): non-zero means no such PID.
    # Comparing command names is a heuristic — PIDs get recycled, so
    # this only assumes a same-named process is a live instance.
    if [ "$?" == "0" ] && [ "`ps --no-headers -o comm $$`" == "$process" ]; then
      echo "Another program is running for $LOCKFILE, skipping run"
      exit
    else
      echo "Found old lockfile $LOCKFILE, removing it"
      httracker_unset_lockfile
    fi
  fi
}

# Initialize as soon as this library is sourced: load config, check
# for httrack, create the folder layout and take the lockfile.
httracker_initialize