aboutsummaryrefslogtreecommitdiff
path: root/lib/httracker/functions
blob: f2dde7f99dc348e85900d54b05f000eb29e6a01d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/bin/bash
#
# Misc httracker functions.
#

# Set common httrack options.
#
# Globals read:    DEPTH, EXT_DEPTH, TARGET
# Globals written: OPTS
#
# OPTS is a single string: one leading space, then the options separated by
# exactly one space each. It is meant to be expanded unquoted (word-split)
# on the httrack command line and may be extended with OPTS="$OPTS --foo".
# Building it piece by piece keeps source-code alignment padding out of the
# value, which a backslash-continued quoted string would otherwise embed.
function httracker_opts {
  local option

  OPTS=""
  for option in                  \
    "--mirror"                   \
    "--continue"                 \
    "--depth=${DEPTH}"           \
    "--ext-depth ${EXT_DEPTH}"   \
    "--near"                     \
    "--purge-old=0"              \
    "--index"                    \
    "--cookies=1"                \
    "--path ${TARGET}"; do
    OPTS="${OPTS} ${option}"
  done
}

# Download a single URL, mirror mode.
#
# Arguments:       $1 - URL to mirror
# Globals read:    MIRRORS, LOG, USER, GROUP
# Globals written: TARGET, OPTS (through httracker_opts)
# Outputs:         progress messages and httrack's output on stdout; the
#                  httrack output is also written to $LOG
#
# The mirror lives in $MIRRORS/<sha1 of the URL plus a newline>. A
# "httracker-ok" marker file inside that folder records a finished download,
# so later runs skip the URL. When httrack fails the folder is removed.
function httracker_get {
  local url="$1"
  local url_hash me status

  # Hash the URL followed by a newline: that is what "echo" fed to sha1sum,
  # so folders created by earlier runs keep their names.
  url_hash="$(printf '%s\n' "$url" | sha1sum | cut -d ' ' -f 1)"
  me="$(whoami)"

  echo -n "Processing $url..."

  # Set target and make sure it exists
  TARGET="$MIRRORS/$url_hash"
  mkdir -p "$TARGET"

  # We already got this one
  if [ -e "$TARGET/httracker-ok" ]; then
    echo " skipping as it's already downloaded as $url_hash..."
    return
  else
    echo -e "\n"
  fi

  # Basic options (needs TARGET, set above)
  httracker_opts

  # Additional options
  if [ "$me" == "root" ]; then
    OPTS="$OPTS --user $USER"
  fi

  # Fix permissions before the download starts
  if [ "$me" != "$USER" ] && [ "$me" == "root" ]; then
    chown -R "$USER:$GROUP" "$TARGET/"
  fi

  # Get the URL. OPTS is an option string and is split on purpose; the URL
  # is quoted so characters such as "?" or "*" are never glob-expanded.
  # shellcheck disable=SC2086
  httrack ${OPTS} "${url}" | tee "$LOG"

  # $? would be tee's status here; PIPESTATUS[0] is httrack's.
  status="${PIPESTATUS[0]}"

  if [ "$status" == "0" ]; then
    # Mark as downloaded
    touch "$TARGET/httracker-ok"
  else
    echo "Error fetching $url."
    rm -rf -- "$TARGET"
  fi
}

# Download URLs, incremental mode.
#
# Globals read:    MIRRORDIR, USER, URLS, LOG
# Globals written: TARGET, OPTS (through httracker_opts), year, month, day
# Outputs:         httrack's output on stdout, also written to $LOG
#
# All URLs listed in the file $URLS are fetched by one httrack run into
# $MIRRORDIR/<year>/<month>.
function httracker_get_incremental {
  # Create TARGET dir. A single date call keeps year/month/day consistent
  # even when the run crosses midnight. The three variables stay global, as
  # they always were.
  read -r year month day <<< "$(date '+%Y %m %d')"
  TARGET="${MIRRORDIR}/${year}/${month}"
  sudo -u "${USER}" mkdir -p "${TARGET}"

  # Basic options
  httracker_opts

  # Additional options: append, so the basic options set above survive
  if [ "$(whoami)" == "root" ]; then
    OPTS="$OPTS --user $USER"
  fi

  # Grab URLs from the network. OPTS is an option string, split on purpose.
  # shellcheck disable=SC2086
  httrack ${OPTS} --list "${URLS}" | tee "$LOG"
}

# Get SemanticScuttle parameter.
#
# Arguments:    $1 - grep pattern (basic regex) selecting the config line(s)
# Globals read: CONFIGFILE
# Outputs:      for every matching line, the text between its first and
#               second single quote (e.g. "$dbuser = 'foo';" gives "foo").
#               A matching line without single quotes is printed unchanged.
function httracker_scuttle_config {
  # First expression drops everything up to and including the first quote,
  # the second drops everything from the next quote to the end of the line.
  grep -e "${1}" -- "${CONFIGFILE}" \
    | sed -e "s/[^']*'//" -e "s/'.*\$//"
}

# Query a mysql database.
#
# Arguments:    $1 - SQL statement to execute
# Globals read: DBUSER, DBPASS, DBNAME, DBHOST
# Outputs:      the mysql client's batch output, without column names
#
# Every value is quoted so credentials containing spaces or glob characters
# reach mysql as a single argument.
#
# NOTE(review): the password is passed on the command line and is therefore
# visible in the process list while mysql runs; consider an option file.
function httracker_sqlquery {
  mysql --skip-column-names --batch \
        --user="${DBUSER}"          \
        --password="${DBPASS}"      \
        --database="${DBNAME}"      \
        --host="${DBHOST}"          \
        --execute="${1}"
}

# Iterate over all URLs.
#
# Globals read: URLS (file with whitespace-separated URLs), BASE
#
# Every URL is piped through $BASE/lib/httracker/html.sed ("fix entities";
# presumably it decodes HTML entities such as &amp; - confirm against that
# script) and then handed to httracker_get.
function httracker_iterate {
  local -a links=()
  local link

  # Read the whole file into an array, splitting on spaces, tabs and
  # newlines. Unlike "cat | xargs" this leaves quotes and backslashes in
  # URLs alone, and the array expansion below never glob-expands "?" or "*".
  # read returns non-zero when it reaches end of file, which is expected.
  read -r -d '' -a links < "$URLS" || true

  for link in "${links[@]}"; do
    # Fix entities
    link="$(printf '%s\n' "$link" | "$BASE/lib/httracker/html.sed")"
    httracker_get "$link"
  done
}

# Create basic folders.
#
# Globals read: MIRRORS, TMP, USER, GROUP
#
# Creates $MIRRORS and $TMP. When running as root, both trees are handed
# over to $USER:$GROUP.
function httracker_setup_folders {
  mkdir -p "$MIRRORS" "$TMP"

  if [ "$(whoami)" == "root" ]; then
    # ":" is the owner/group separator; the "." form is deprecated
    chown -R "$USER:$GROUP" "$MIRRORS" "$TMP"
  fi
}

# Set basic environment.
#
# Globals read:    TMP (expected to come from the sourced config)
# Globals written: BASE, LOCKFILE, LOG
#
# Sources "config" from the directory of the running script ($0) and exits
# with status 1 when that fails. Afterwards the working folders are created
# and the lockfile is checked and taken; both lockfile helpers may exit the
# script themselves.
function httracker_initialize {
  # Quoted so an installation path containing spaces keeps working
  BASE="$(dirname -- "$0")"
  source "$BASE/config" || exit 1

  # Create folders
  httracker_setup_folders

  # Lockfile, named after the running script
  LOCKFILE="${TMP}/$(basename -- "$0").lock"
  httracker_check_lockfile
  httracker_set_lockfile

  # Logfile
  LOG="${TMP}/$(basename -- "$0").log"
}

# Cleanup environment.
#
# Globals read: URLS
#
# Releases the lockfile and removes the file (or folder) $URLS points to.
function httracker_teardown {
  httracker_unset_lockfile

  # Quoted so a path with spaces or glob characters removes exactly that
  # path; skipped entirely when URLS is not set.
  if [ -n "$URLS" ]; then
    rm -rf -- "$URLS"
  fi
}

# Create lockfile.
#
# Globals read: LOCKFILE
#
# Does nothing when LOCKFILE is empty. Otherwise writes this shell's PID to
# $LOCKFILE, creating the parent folder if needed. If the file already
# exists (or cannot be written) a message is printed and the script exits
# with status 1.
#
# On success traps are installed so the lockfile is removed when the script
# exits. INT and TERM turn into a regular exit (status 130 / 143) so that an
# interrupted run actually stops instead of carrying on without its lock;
# the EXIT trap then removes the lockfile.
function httracker_set_lockfile {
  if [ -n "$LOCKFILE" ]; then
    mkdir -p -- "$(dirname -- "$LOCKFILE")"

    # noclobber makes the redirection fail if the file exists, so creating
    # the lock and testing for it is one atomic step.
    if ( set -o noclobber; echo "$$" > "$LOCKFILE" ) &> /dev/null; then
      trap 'httracker_unset_lockfile' EXIT
      trap 'exit 130' INT
      trap 'exit 143' TERM
    else
      echo "Could not create lockfile $LOCKFILE, exiting"
      exit 1
    fi
  fi
}

# Remove lockfile.
#
# Globals read: LOCKFILE
#
# Does nothing when LOCKFILE is empty. A failed removal is reported on
# stdout but is not treated as fatal.
function httracker_unset_lockfile {
  if [ -n "$LOCKFILE" ]; then
    # Quoted so a path with spaces removes exactly this one file
    rm -f -- "$LOCKFILE" || echo "Could not remove lockfile $LOCKFILE"
  fi
}

# Check lockfile.
#
# Globals read: LOCKFILE
#
# Does nothing when LOCKFILE is empty or the file does not exist. Otherwise
# the PID stored in the file is looked up with ps:
#   - if that process exists and has the same command name as this shell,
#     another run is assumed to be active and the script exits (status 0);
#   - in every other case the lockfile is considered stale and is removed.
function httracker_check_lockfile {
  local pid process

  if [ -n "$LOCKFILE" ] && [ -f "$LOCKFILE" ]; then
    pid="$(cat -- "$LOCKFILE")"
    process=""

    # Only ask ps about something that looks like a PID: with an empty or
    # garbled value ps would report unrelated processes instead of failing.
    if [[ "$pid" =~ ^[0-9]+$ ]] \
      && process="$(ps --no-headers -o comm -p "$pid")" \
      && [ "$(ps --no-headers -o comm -p "$$")" == "$process" ]; then
      echo "Another program is running for $LOCKFILE, skipping run"
      exit 0
    else
      echo "Found old lockfile $LOCKFILE, removing it"
      httracker_unset_lockfile
    fi
  fi
}

# Initialize as soon as this file is sourced: httracker_initialize loads the
# configuration, creates the working folders, takes the lockfile and sets
# the log file name. It may exit the calling script (missing config, or a
# lockfile that is already held).
httracker_initialize