diff options
Diffstat (limited to 'vendors/dokuwiki/lib/exe/indexer.php')
-rw-r--r-- | vendors/dokuwiki/lib/exe/indexer.php | 375 |
1 files changed, 375 insertions, 0 deletions
<?php
/**
 * DokuWiki indexer
 *
 * Runs at most one background maintenance job per request (page indexing,
 * metadata rendering, sitemap generation, changelog trimming) and answers
 * the request with a 1x1 transparent GIF. Everything the jobs print is
 * buffered and discarded unless the request carries a "debug" parameter.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
if(!defined('DOKU_INC')) define('DOKU_INC',dirname(__FILE__).'/../../');
define('DOKU_DISABLE_GZIP_OUTPUT',1);
require_once(DOKU_INC.'inc/init.php');
require_once(DOKU_INC.'inc/auth.php');
require_once(DOKU_INC.'inc/events.php');
session_write_close();  //close session
if(!defined('NL')) define('NL',"\n");
global $ID;
// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 2);

// keep running after browser closes connection
@ignore_user_abort(true);

// check if user abort worked, if yes send output early
// (empty() avoids a notice when the 'broken_iua' option is not configured)
$defer = !@ignore_user_abort() || !empty($conf['broken_iua']);
if(!$defer){
    sendGIF(); // send gif
}

// a missing id parameter is treated like an empty one
$ID = cleanID(isset($_REQUEST['id']) ? $_REQUEST['id'] : '');

// Catch any possible output (e.g. errors)
if(!isset($_REQUEST['debug'])) ob_start();

// run one of the jobs
// The "or" chain short-circuits: the first task that returns true ends the
// chain, so a single request performs at most one job. advise_after() sits
// at the end of the same chain and is therefore only reached when every
// task returned false.
$tmp = array(); // No event data
$evt = new Doku_Event('INDEXER_TASKS_RUN', $tmp);
if ($evt->advise_before()) {
    runIndexer() or
    metaUpdate() or
    runSitemapper() or
    runTrimRecentChanges() or
    runTrimRecentChanges(true) or
    $evt->advise_after();
}
if($defer) sendGIF();

if(!isset($_REQUEST['debug'])) ob_end_clean();
exit;

// --------------------------------------------------------------------

/**
 * Trims the recent changes cache (or imports the old changelog) as needed.
 *
 * Entries newer than $conf['recent_days'] days are always kept. If fewer than
 * $conf['recent'] entries remain, the newest of the older entries are added
 * back to reach that minimum.
 *
 * @param bool $media_changes If the media changelog shall be trimmed instead
 *                            of the page changelog
 * @return bool true if the changelog was rewritten, false if nothing was done
 *
 * @author Ben Coburn <btcoburn@silicodon.net>
 */
function runTrimRecentChanges($media_changes = false) {
    global $conf;

    $fn = ($media_changes ? $conf['media_changelog'] : $conf['changelog']);

    // Trim the Recent Changes
    // Trims the recent changes cache to the last $conf['recent_days'] days of
    // recent changes or $conf['recent'] items, whichever is larger.
    // The trimming is only done once a day.
    if (@file_exists($fn) &&
        (@filemtime($fn.'.trimmed')+86400)<time() &&
        !@file_exists($fn.'_tmp')) {
        @touch($fn.'.trimmed');
        io_lock($fn);
        $lines = file($fn);
        if (count($lines)<=$conf['recent']) {
            // nothing to trim
            io_unlock($fn);
            return false;
        }

        io_saveFile($fn.'_tmp', '');          // presave tmp as 2nd lock
        $trim_time = time() - $conf['recent_days']*86400;
        $out_lines = array();
        // Must start out as an array: when no entry is older than $trim_time
        // the loop never assigns it, yet it is still sorted and sliced below.
        $old_lines = array();

        for ($i=0; $i<count($lines); $i++) {
            $log = parseChangelogLine($lines[$i]);
            if ($log === false) continue;                      // discard junk
            if ($log['date'] < $trim_time) {
                $old_lines[$log['date'].".$i"] = $lines[$i];   // keep old lines for now (append .$i to prevent key collisions)
            } else {
                $out_lines[$log['date'].".$i"] = $lines[$i];   // definitely keep these lines
            }
        }

        if (count($lines)==count($out_lines)) {
            // nothing to trim
            @unlink($fn.'_tmp');
            io_unlock($fn);
            return false;
        }

        // sort the final result, it shouldn't be necessary,
        //   however the extra robustness in making the changelog cache self-correcting is worth it
        ksort($out_lines);
        $extra = $conf['recent'] - count($out_lines);        // do we need extra lines to bring us up to minimum
        if ($extra > 0) {
            ksort($old_lines);
            $out_lines = array_merge(array_slice($old_lines,-$extra),$out_lines);
        }

        // save trimmed changelog
        io_saveFile($fn.'_tmp', implode('', $out_lines));
        @unlink($fn);
        if (!rename($fn.'_tmp', $fn)) {
            // rename failed so try another way...
            io_unlock($fn);
            io_saveFile($fn, implode('', $out_lines));
            @unlink($fn.'_tmp');
        } else {
            io_unlock($fn);
        }
        return true;
    }

    // nothing done
    return false;
}

/**
 * Runs the indexer for the current page
 *
 * The page is (re)indexed when its ".indexed" tag file is missing, was written
 * by an older INDEXER_VERSION, or is not newer than the page file itself.
 * A lock directory in $conf['lockdir'] serialises concurrent runs.
 *
 * @return bool true if the page was indexed, false if nothing was done
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function runIndexer(){
    global $ID;
    global $conf;
    print "runIndexer(): started".NL;

    // Move index files (if needed)
    // Uses the importoldindex plugin to upgrade the index automatically.
    // FIXME: Remove this from runIndexer when it is no longer needed.
    if (@file_exists($conf['cachedir'].'/page.idx') &&
        (!@file_exists($conf['indexdir'].'/page.idx') ||
         !filesize($conf['indexdir'].'/page.idx'))  &&
        !@file_exists($conf['indexdir'].'/index_importing')) {
        echo "trigger TEMPORARY_INDEX_UPGRADE_EVENT\n";
        $tmp = array(); // no event data
        trigger_event('TEMPORARY_INDEX_UPGRADE_EVENT', $tmp);
    }

    if(!$ID) return false;

    // check if indexing needed
    $idxtag = metaFN($ID,'.indexed');
    if(@file_exists($idxtag)){
        if(io_readFile($idxtag) >= INDEXER_VERSION){
            $last = @filemtime($idxtag);
            if($last > @filemtime(wikiFN($ID))){
                print "runIndexer(): index for $ID up to date".NL;
                return false;
            }
        }
    }

    // try to acquire a lock
    $lock = $conf['lockdir'].'/_indexer.lock';
    while(!@mkdir($lock,$conf['dmode'])){
        usleep(50);
        if(time()-@filemtime($lock) > 60*5){
            // looks like a stale lock - remove it
            // Give up if it cannot be removed: otherwise mkdir() would keep
            // failing and this loop would never terminate.
            if(!@rmdir($lock)){
                print "runIndexer(): removing the stale lock failed".NL;
                return false;
            }
            print "runIndexer(): stale lock removed".NL;
        }else{
            print "runIndexer(): indexer locked".NL;
            return false;
        }
    }
    if($conf['dperm']) chmod($lock, $conf['dperm']);

    require_once(DOKU_INC.'inc/indexer.php');

    // upgrade to version 2
    if (!@file_exists($conf['indexdir'].'/pageword.idx'))
        idx_upgradePageWords();

    // do the work
    idx_addPage($ID);

    // we're finished - save and free lock
    io_saveFile(metaFN($ID,'.indexed'),INDEXER_VERSION);
    @rmdir($lock);
    print "runIndexer(): finished".NL;
    return true;
}

/**
 * Will render the metadata for the page if not exists yet
 *
 * This makes sure pages which are created from outside DokuWiki will
 * gain their data when viewed for the first time.
 *
 * @return bool true if a metadata file was written, false if nothing was done
 */
function metaUpdate(){
    global $ID;
    print "metaUpdate(): started".NL;

    if(!$ID) return false;
    $file = metaFN($ID, '.meta');
    echo "meta file: $file".NL;

    // rendering needed?
    if (@file_exists($file)) return false;
    if (!@file_exists(wikiFN($ID))) return false;

    require_once(DOKU_INC.'inc/common.php');
    require_once(DOKU_INC.'inc/parserutils.php');
    global $conf;


    // gather some additional info from changelog
    // captured groups: 1 = first numeric field, 2 = dotted-quad address,
    // 3 and 4 = the two tab separated fields following the page id
    $info = io_grep($conf['changelog'],
                    '/^(\d+)\t(\d+\.\d+\.\d+\.\d+)\t'.preg_quote($ID,'/').'\t([^\t]+)\t([^\t\n]+)/',
                    0,true);

    $meta = array();
    if(!empty($info)){
        $meta['date']['created'] = $info[0][1];
        foreach($info as $item){
            if($item[4] != '*'){
                $meta['date']['modified'] = $item[1];
                if($item[3]){
                    $meta['contributor'][$item[3]] = $item[3];
                }
            }
        }
    }

    $meta = p_render_metadata($ID, $meta);
    io_saveFile($file, serialize($meta));

    echo "metaUpdate(): finished".NL;
    return true;
}

/**
 * Builds a Google Sitemap of all public pages known to the indexer
 *
 * The map is placed in the root directory named sitemap.xml.gz - This
 * file needs to be writable!
 *
 * $conf['sitemap'] is the regeneration interval in days; an empty value
 * disables the sitemap.
 *
 * @return bool true if a new sitemap was written, false if nothing was done
 *
 * @author Andreas Gohr
 * @link   https://www.google.com/webmasters/sitemaps/docs/en/about.html
 */
function runSitemapper(){
    global $conf;
    print "runSitemapper(): started".NL;
    if(!$conf['sitemap']) return false;

    if($conf['compression'] == 'bz2' || $conf['compression'] == 'gz'){
        $sitemap = 'sitemap.xml.gz';
    }else{
        $sitemap = 'sitemap.xml';
    }
    print "runSitemapper(): using $sitemap".NL;

    if(@file_exists(DOKU_INC.$sitemap)){
        if(!is_writable(DOKU_INC.$sitemap)) return false;
    }else{
        if(!is_writable(DOKU_INC)) return false;
    }

    if(@filesize(DOKU_INC.$sitemap) &&
       @filemtime(DOKU_INC.$sitemap) > (time()-($conf['sitemap']*60*60*24))){
       print 'runSitemapper(): Sitemap up to date'.NL;
       return false;
    }

    // Without a readable page index there is nothing to list; bail out
    // instead of publishing an empty sitemap and announcing it.
    $pages = @file($conf['indexdir'].'/page.idx');
    if($pages === false){
        print 'runSitemapper(): page index not readable'.NL;
        return false;
    }
    print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL;

    // build the sitemap
    ob_start();
    print '<?xml version="1.0" encoding="UTF-8"?>'.NL;
    print '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'.NL;
    foreach($pages as $id){
        $id = trim($id);
        $file = wikiFN($id);

        //skip hidden, non existing and restricted files
        if(isHiddenPage($id)) continue;
        $date = @filemtime($file);
        if(!$date) continue;
        if(auth_aclcheck($id,'','') < AUTH_READ) continue;

        print '  <url>'.NL;
        print '    <loc>'.wl($id,'',true).'</loc>'.NL;
        print '    <lastmod>'.date_iso8601($date).'</lastmod>'.NL;
        print '  </url>'.NL;
    }
    print '</urlset>'.NL;
    $data = ob_get_contents();
    ob_end_clean();

    //save the new sitemap
    io_saveFile(DOKU_INC.$sitemap,$data);

    //ping search engines...
    $http = new DokuHTTPClient();
    $http->timeout = 8;

    // engine name => ping endpoint (the encoded sitemap URL is appended)
    $engines = array(
        'google'    => 'http://www.google.com/webmasters/sitemaps/ping?sitemap=',
        'yahoo'     => 'http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=dokuwiki&url=',
        'microsoft' => 'http://www.bing.com/webmaster/ping.aspx?siteMap=',
    );
    foreach($engines as $engine => $url){
        print 'runSitemapper(): pinging '.$engine.NL;
        $url  .= urlencode(DOKU_URL.$sitemap);
        $resp = $http->get($url);
        if($http->error) print 'runSitemapper(): '.$http->error.NL;
        print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL;
    }

    print 'runSitemapper(): finished'.NL;
    return true;
}

/**
 * Formats a timestamp as ISO 8601 date
 *
 * The result has the form YYYY-MM-DDTHH:MM:SS+HH:MM, using the timezone
 * PHP's date() is configured with.
 *
 * @param int $int_date UNIX timestamp
 * @return string
 *
 * @author <ungu at terong dot com>
 * @link http://www.php.net/manual/en/function.date.php#54072
 */
function date_iso8601($int_date) {
    //$int_date: current date in UNIX timestamp
    $date_mod     = date('Y-m-d\TH:i:s', $int_date);
    // date('O') yields "+hhmm"; insert the colon between hours and minutes
    $pre_timezone = date('O', $int_date);
    $time_zone    = substr($pre_timezone, 0, 3).":".substr($pre_timezone, 3, 2);
    $date_mod    .= $time_zone;
    return $date_mod;
}

/**
 * Just send a 1x1 pixel blank gif to the browser
 *
 * When the request carries a "debug" parameter only a text/plain content
 * type header is sent, so the task output stays readable.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Harry Fuecks <fuecks@gmail.com>
 */
function sendGIF(){
    if(isset($_REQUEST['debug'])){
        header('Content-Type: text/plain');
        return;
    }
    $img = base64_decode('R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAEALAAAAAABAAEAAAIBTAA7');
    header('Content-Type: image/gif');
    header('Content-Length: '.strlen($img));
    header('Connection: Close');
    print $img;
    flush();
    // Browser should drop connection after this
    // Thinks it's got the whole image
}

//Setup VIM: ex: et ts=4 enc=utf-8 :
// No trailing PHP closing tag - no output please!
// See Note at http://www.php.net/manual/en/language.basic-syntax.instruction-separation.php