diff options
Diffstat (limited to 'mod/search/start.php')
| -rw-r--r-- | mod/search/start.php | 663 |
1 files changed, 305 insertions, 358 deletions
diff --git a/mod/search/start.php b/mod/search/start.php index 18b743cde..8a112a3a3 100644 --- a/mod/search/start.php +++ b/mod/search/start.php @@ -1,46 +1,42 @@ <?php /** - * Elgg core search. - * - * @package Elgg - * @subpackage Core - * @author Curverider Ltd <info@elgg.com>, The MITRE Corporation <http://www.mitre.org> - * @link http://elgg.org/ - */ + * Elgg search plugin + * + */ + +elgg_register_event_handler('init','system','search_init'); /** - * Initialise search helper functions. - * + * Initialize search plugin */ function search_init() { global $CONFIG; require_once 'search_hooks.php'; // page handler for search actions and results - register_page_handler('search','search_page_handler'); + elgg_register_page_handler('search', 'search_page_handler'); // register some default search hooks - register_plugin_hook('search', 'object', 'search_objects_hook'); - register_plugin_hook('search', 'user', 'search_users_hook'); - - // @todo pull this out into groups - register_plugin_hook('search', 'group', 'search_groups_hook'); + elgg_register_plugin_hook_handler('search', 'object', 'search_objects_hook'); + elgg_register_plugin_hook_handler('search', 'user', 'search_users_hook'); + elgg_register_plugin_hook_handler('search', 'group', 'search_groups_hook'); // tags and comments are a bit different. // register a search types and a hooks for them. - register_plugin_hook('search_types', 'get_types', 'search_custom_types_tags_hook'); - register_plugin_hook('search', 'tags', 'search_tags_hook'); + elgg_register_plugin_hook_handler('search_types', 'get_types', 'search_custom_types_tags_hook'); + elgg_register_plugin_hook_handler('search', 'tags', 'search_tags_hook'); - register_plugin_hook('search_types', 'get_types', 'search_custom_types_comments_hook'); - register_plugin_hook('search', 'comments', 'search_comments_hook'); + elgg_register_plugin_hook_handler('search_types', 'get_types', 'search_custom_types_comments_hook'); + elgg_register_plugin_hook_handler('search', 'comments', 'search_comments_hook'); // get server min and max allowed chars for ft searching $CONFIG->search_info = array(); // can't use get_data() here because some servers don't have these globals set, // which throws a db exception. - $r = mysql_query('SELECT @@ft_min_word_len as min, @@ft_max_word_len as max'); - if ($word_lens = mysql_fetch_assoc($r)) { + $dblink = get_db_link('read'); + $r = mysql_query('SELECT @@ft_min_word_len as min, @@ft_max_word_len as max', $dblink); + if ($r && ($word_lens = mysql_fetch_assoc($r))) { $CONFIG->search_info['min_chars'] = $word_lens['min']; $CONFIG->search_info['max_chars'] = $word_lens['max']; } else { @@ -50,16 +46,19 @@ function search_init() { } // add in CSS for search elements - elgg_extend_view('css', 'search/css'); + elgg_extend_view('css/elgg', 'search/css'); + + // extend view for elgg topbar search box + elgg_extend_view('page/elements/header', 'search/header'); } /** * Page handler for search * - * @param array $page Page elements from pain page handler + * @param array $page Page elements from core page handler + * @return bool */ function search_page_handler($page) { - global $CONFIG; // if there is no q set, we're being called from a legacy installation // it expects a search by tags. @@ -70,420 +69,315 @@ function search_page_handler($page) { //set_input('search_type', 'tags'); } - include_once('index.php'); + $base_dir = elgg_get_plugins_path() . 'search/pages/search'; + + include_once("$base_dir/index.php"); + return true; } /** - * Return a string with highlighted matched elements. - * Checks for "s - * Provides context for matched elements. - * Will not return more than $max_length of full context. - * Only highlights words + * Return a string with highlighted matched queries and relevant context + * Determines context based upon occurance and distance of words with each other. * - * @param unknown_type $haystack - * @param unknown_type $need - * @param unknown_type $context - * @param unknown_type $max_length - * @return unknown_type + * @param string $haystack + * @param string $query + * @param int $min_match_context = 30 + * @param int $max_length = 300 + * @param bool $tag_match Search is for tags. Don't ignore words. + * @return string */ -function search_get_highlighted_relevant_substrings($haystack, $needle, $min_match_context = 15, $max_length = 500) { - global $CONFIG; +function search_get_highlighted_relevant_substrings($haystack, $query, $min_match_context = 30, $max_length = 300, $tag_match = false) { + $haystack = strip_tags($haystack); - $haystack_lc = strtolower($haystack); -// -// $haystack = "Like merge sort, quicksort can also be easily parallelized due to its " -// . "divide-and-conquer nature. Individual in-place partition operations are difficult " -// . "to parallelize, but once divided, different sections of the list can be sorted in parallel. " -// . "If we have p processors, we can divide a list of n ele"; -// -// $needle = 'difficult to sort in parallel'; - - // for now don't worry about "s or boolean operators - $needle = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($needle))); - $words = explode(' ', $needle); + $haystack_length = elgg_strlen($haystack); + $haystack_lc = elgg_strtolower($haystack); - $min_chars = $CONFIG->search_info['min_chars']; - // if > ft_min_word == not running in literal mode. - if ($needle >= $min_chars) { - // clean out any words that are ignored by mysql - foreach ($words as $i => $word) { - if (strlen($word) < $min_chars) { - unset ($words[$i]); - } - } + if (!$tag_match) { + $words = search_remove_ignored_words($query, 'array'); + } else { + $words = array(); } - /* - - $body_len = 250 - - $context = 5-30, 20-45, 75-100, 150 - - can pull out context either on: - one of each matching term - X # of highest matching terms + // if haystack < $max_length return the entire haystack w/formatting immediately + if ($haystack_length <= $max_length) { + $return = search_highlight_words($words, $haystack); + return $return; + } - */ - $substr_counts = array(); - $str_pos = array(); - // matrices for being and end context lengths. - // defaults to min context. will add additional context later if needed + // get the starting positions and lengths for all matching words $starts = array(); - $stops = array(); - - // map the words to the starts and stops - $words_arg = array(); - $context_count = 0; - - - // get the full count of matches. + $lengths = array(); foreach ($words as $word) { - $word = strtolower($word); - $count = substr_count($haystack, $word); - $word_len = strlen($word); + $word = elgg_strtolower($word); + $count = elgg_substr_count($haystack_lc, $word); + $word_len = elgg_strlen($word); + $haystack_len = elgg_strlen($haystack_lc); // find the start positions for the words if ($count > 1) { - $str_pos[$word] = array(); $offset = 0; - while (FALSE !== $pos = strpos($haystack, $word, $offset)) { - $str_pos[$word][] = $pos; - $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0; - $stops[] = $pos + $word_len + $min_match_context; - $words_arg[] = $word; - $context_count += $min_match_context + $word_len; + while (FALSE !== $pos = elgg_strpos($haystack_lc, $word, $offset)) { + $start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0; + $starts[] = $start; + $stop = $pos + $word_len + $min_match_context; + $lengths[] = $stop - $start; $offset += $pos + $word_len; + + if ($offset >= $haystack_len) { + break; + } } } else { - $pos = strpos($haystack, $word); - $str_pos[$word] = array($pos); - $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0; - $stops[] = $pos + $word_len + $min_match_context; - $context_count += $min_match_context + $word_len; - $words_arg[] = $word; - } - $substr_counts[$word] = $count; - } - - // sort by order of occurence - //krsort($substr_counts); - $full_count = array_sum($substr_counts); - - // figure out what the context needs to be. - // take one of each matched phrase - // if there are any - -// -// var_dump($str_pos); -// var_dump($substr_counts); -// var_dump($context_count); - - - // sort to put them in order of occurence - asort($starts, SORT_NUMERIC); - asort($stops, SORT_NUMERIC); - - // offset them correctly - $starts[] = 0; - $new_stops = array(0); - foreach ($stops as $i => $pos) { - $new_stops[$i+1] = $pos; - } - $stops = $new_stops; - - $substrings = array(); - $len = count($starts); - - $starts = array_merge($starts); - $stops = array_merge($stops); - - $offsets = array(); - $limits = array(); - $c = 0; - foreach ($starts as $i => $start) { - $stop = $stops[$i]; - $offsets[$c] = $start; - $limits[$c] = $stop; - - // never need the last one as it's just a displacing entry - if ($c+1 == count($starts)) { - break; - } - - if ($start - $stop < 0) { - //var_dump("Looking at c=$c & $start - $stop and going to unset {$limits[$c]}"); - unset($offsets[$c]); - unset($limits[$c]); + $pos = elgg_strpos($haystack_lc, $word); + $start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0; + $starts[] = $start; + $stop = $pos + $word_len + $min_match_context; + $lengths[] = $stop - $start; } - $c++; } - // reset indexes and remove placeholder elements. - $limits = array_merge($limits); - array_shift($limits); - $offsets = array_merge($offsets); - array_pop($offsets); - - // figure out if we need to adjust the offsets from the base - // this could result in overlapping summaries. - // might be nicer to just remove it. + $offsets = search_consolidate_substrings($starts, $lengths); - $total_len = 0; - foreach ($offsets as $i => $offset) { - $total_len += $limits[$i] - $offset; - } + // figure out if we can adjust the offsets and lengths + // in order to return more context + $total_length = array_sum($offsets); $add_length = 0; - if ($total_length < $max_length) { - $add_length = floor((($max_length - $total_len) / count($offsets)) / 2); - } - - $lengths = array(); - foreach ($offsets as $i => $offset) { - $limit = $limits[$i]; - if ($offset == 0 && $add_length) { - $limit += $add_length; - } else { - $offset = $offset - $add_length; - } - $string = substr($haystack, $offset, $limit - $offset); - - if ($offset != 0) { - $string = "...$string"; - } - - if ($limit + $offset >= strlen($haystack)) { - $string .= '...'; + if ($total_length < $max_length && $offsets) { + $add_length = floor((($max_length - $total_length) / count($offsets)) / 2); + + $starts = array(); + $lengths = array(); + foreach ($offsets as $offset => $length) { + $start = ($offset - $add_length > 0) ? $offset - $add_length : 0; + $length = $length + $add_length; + $starts[] = $start; + $lengths[] = $length; } - $substrings[] = $string; - $lengths[] = strlen($string); + $offsets = search_consolidate_substrings($starts, $lengths); } - // sort by length of context. - asort($lengths); + // sort by order of string size descending (which is roughly + // the proximity of matched terms) so we can keep the + // substrings with terms closest together and discard + // the others as needed to fit within $max_length. + arsort($offsets); - $matched = ''; - foreach ($lengths as $i => $len) { - $string = $substrings[$i]; + $return_strs = array(); + $total_length = 0; + foreach ($offsets as $start => $length) { + $string = trim(elgg_substr($haystack, $start, $length)); - if (strlen($matched) + strlen($string) < $max_length) { - $matched .= $string; + // continue past if adding this substring exceeds max length + if ($total_length + $length > $max_length) { + continue; } - } - $i = 1; - foreach ($words as $word) { - $search = "/($word)/i"; - $replace = "<strong class=\"searchMatch searchMatchColor$i\">$1</strong>"; - $matched = preg_replace($search, $replace, $matched); - $i++; + $total_length += $length; + $return_strs[$start] = $string; } - return $matched; - + // put the strings in order of occurence + ksort($return_strs); - // crap below.. - - - - for ($i=0; $i<$len; $i++) { - $start = $starts[$i]; - $stop = $stops[$i]; - var_dump("Looking at $i = $start - $stop"); - - while ($start - $stop <= 0) { - $stop = $stops[$i++]; - var_dump("New start is $stop"); - } - - var_dump("$start-$stop"); - } - - // find the intersecting contexts - foreach ($starts as $i => $start_pos) { - $words .= "{$words_arg[$i]}\t\t\t"; - echo "$start_pos\t\t\t"; + // add ...s where needed + $return = implode('...', $return_strs); + if (!array_key_exists(0, $return_strs)) { + $return = "...$return"; } - echo "\n"; - - foreach ($stops as $i => $stop_pos) { - echo "$stop_pos\t\t\t"; + // add to end of string if last substring doesn't hit the end. + $starts = array_keys($return_strs); + $last_pos = $starts[count($starts)-1]; + if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) { + $return .= '...'; } -echo "\n$words\n"; - // get full number of matches against all words to see how many we actually want to look at. + $return = search_highlight_words($words, $return); + return $return; +} +/** + * Takes an array of offsets and lengths and consolidates any + * overlapping entries, returning an array of new offsets and lengths + * + * Offsets and lengths are specified in separate arrays because of possible + * index collisions with the offsets. + * + * @param array $offsets + * @param array $lengths + * @return array + */ +function search_consolidate_substrings($offsets, $lengths) { + // sort offsets by occurence + asort($offsets, SORT_NUMERIC); -// $desc = search_get_relevant_substring($entity->description, $params['query'], '<strong class="searchMatch">', '</strong>'); - - - $params['query']; - // "this is"just a test "silly person" - - // check for "s - $words_quotes = explode('"', $needle); - - $words_orig = explode(' ', $needle); - $words = array(); - - foreach ($words_orig as $i => $word) { - // figure out if we have a special operand - $operand = substr($word, 0, 1); - switch($operand) { - case '"': - // find the matching " if any. else, remove the " - if (substr_count($query, '"') < 2) { - $words[] = substr($word, 1); - } else { - $word = substr($word, 1); - $word_i = $i; - while ('"' != strpos($words_orig[$word_i], '"')) { - $word .= " {$words_orig[$word_i]}"; - unset($words_orig[$word_i]); - } - } + // reset the indexes maintaining association with the original offsets. + $offsets = array_merge($offsets); - break; + $new_lengths = array(); + foreach ($offsets as $i => $offset) { + $new_lengths[] = $lengths[$i]; + } - case '+': - // remove + - $words[] = substr($word, 1); - break; + $lengths = $new_lengths; - case '~': - case '-': - // remove this from highlighted list. + $return = array(); + $count = count($offsets); + for ($i=0; $i<$count; $i++) { + $offset = $offsets[$i]; + $length = $lengths[$i]; + $end_pos = $offset + $length; + // find the next entry that doesn't overlap + while (array_key_exists($i+1, $offsets) && $end_pos > $offsets[$i+1]) { + $i++; + if (!array_key_exists($i, $offsets)) { break; + } + $end_pos = $lengths[$i] + $offsets[$i]; } - } - // pick out " queries - if (substr_count($query, '"') >= 2) { + $length = $end_pos - $offset; + // will never have a colliding offset, so can return as a single array + $return[$offset] = $length; } - // ignore queries starting with - - - - // @todo figure out a way to "center" the matches within the max_length. - // if only one match, its context is $context + $max_length / 2 - // if 2 matches, its context is $context + $max_length / 4 - // if 3 matches, its context is $context + $max_length / 6 - // $context per match = $min_match_context + ($max_length / $num_count_match) - - // if $max_length / ($matched_count * 2) < $context - // only match against the first X matches where $context >= $context + return $return; } /** - * Returns a matching string with $context amount of context, optionally - * surrounded by $before and $after. + * Safely highlights the words in $words found in $string avoiding recursion * - * If no match is found, restricts string to $context*2 starting from strpos 0. - * - * @param str $haystack - * @param str $needle - * @param str $before - * @param str $after - * @param int $context - * @return str + * @param array $words + * @param string $string + * @return string */ -function search_get_relevant_substring($haystack, $needle, $before = '', $after = '', $context = 75) { - $haystack = strip_tags($haystack); - $needle = strip_tags($needle); - - $pos = strpos(strtolower($haystack), strtolower($needle)); +function search_highlight_words($words, $string) { + $i = 1; + $replace_html = array( + 'strong' => rand(10000, 99999), + 'class' => rand(10000, 99999), + 'search-highlight' => rand(10000, 99999), + 'search-highlight-color' => rand(10000, 99999) + ); - if ($pos === FALSE) { - $str = substr($haystack, 0, $context*2); - if (strlen($haystack) > $context*2) { - $str .= '...'; - } + foreach ($words as $word) { + // remove any boolean mode operators + $word = preg_replace("/([\-\+~])([\w]+)/i", '$2', $word); + + // escape the delimiter and any other regexp special chars + $word = preg_quote($word, '/'); + + $search = "/($word)/i"; - return $str; + // @todo + // must replace with placeholders in case one of the search terms is + // in the html string. + // later, will replace the placeholders with the actual html. + // Yeah this is hacky. I'm tired. + $strong = $replace_html['strong']; + $class = $replace_html['class']; + $highlight = $replace_html['search-highlight']; + $color = $replace_html['search-highlight-color']; + + $replace = "<$strong $class=\"$highlight $color{$i}\">$1</$strong>"; + $string = preg_replace($search, $replace, $string); + $i++; } - $start_pos = $pos - $context; - - if ($start_pos < 0) { - $start_pos = 0; + foreach ($replace_html as $replace => $search) { + $string = str_replace($search, $replace, $string); } - // get string from -context to +context - $matched = substr($haystack, $start_pos, $context*2); + return $string; +} - // add elipses to front. - if ($start_pos > 0) { - $matched = "...$matched"; - } +/** + * Returns a query with stop and too short words removed. + * (Unless the entire query is < ft_min_word_chars, in which case + * it's taken literally.) + * + * @param array $query + * @param str $format Return as an array or a string + * @return mixed + */ +function search_remove_ignored_words($query, $format = 'array') { + global $CONFIG; - // add elipses to end. - if ($pos + strlen($needle) + $context*2 < strlen($haystack)) { - $matched = "$matched..."; + // don't worry about "s or boolean operators + //$query = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($query))); + $query = stripslashes(strip_tags($query)); + + $words = explode(' ', $query); + + $min_chars = $CONFIG->search_info['min_chars']; + // if > ft_min_word we're not running in literal mode. + if (elgg_strlen($query) >= $min_chars) { + // clean out any words that are ignored by mysql + foreach ($words as $i => $word) { + if (elgg_strlen($word) < $min_chars) { + unset ($words[$i]); + } + } } - // surround if needed - // @todo would getting each position of the match then - // inserting manually based on the position be faster than preg_replace()? - if ($before || $after) { - $matched = str_ireplace($needle, $before . $needle . $after, $matched); - //$matched = mb_ereg_replace("") - // insert before + if ($format == 'string') { + return implode(' ', $words); } - return $matched; + return $words; } /** - * Passes entities, count, and original params to the view functions for + * Passes results, and original params to the view functions for * search type. * - * @param array $entities - * @param int $count + * @param array $results * @param array $params + * @param string $view_type = list, entity or layout * @return string */ -function search_get_listing_html($entities, $count, $params) { - if (!is_array($entities) || !$count) { - return FALSE; +function search_get_search_view($params, $view_type) { + switch ($view_type) { + case 'list': + case 'entity': + case 'layout': + break; + + default: + return FALSE; } $view_order = array(); - // check if there's a special search view for this type:subtype + // check if there's a special search list view for this type:subtype if (isset($params['type']) && $params['type'] && isset($params['subtype']) && $params['subtype']) { - $view_order[] = "search/{$params['type']}/{$params['subtype']}/listing"; + $view_order[] = "search/{$params['type']}/{$params['subtype']}/$view_type"; } // also check for the default type if (isset($params['type']) && $params['type']) { - $view_order[] = "search/{$params['type']}/listing"; + $view_order[] = "search/{$params['type']}/$view_type"; } // check search types if (isset($params['search_type']) && $params['search_type']) { - $view_order[] = "search/{$params['search_type']}/listing"; + $view_order[] = "search/{$params['search_type']}/$view_type"; } - // finally default to a search listing default - $view_order[] = "search/listing"; - - $vars = array( - 'entities' => $entities, - 'count' => $count, - 'params' => $params - ); + // finally default to a search list default + $view_order[] = "search/$view_type"; foreach ($view_order as $view) { if (elgg_view_exists($view)) { - return elgg_view($view, $vars); + return $view; } } @@ -498,58 +392,111 @@ function search_get_listing_html($entities, $count, $params) { * @param array $params Original search params * @return str */ -function search_get_where_sql($table, $fields, $params) { +function search_get_where_sql($table, $fields, $params, $use_fulltext = TRUE) { global $CONFIG; $query = $params['query']; // add the table prefix to the fields foreach ($fields as $i => $field) { - $fields[$i] = "$table.$field"; + if ($table) { + $fields[$i] = "$table.$field"; + } } + + $where = ''; // if query is shorter than the min for fts words // it's likely a single acronym or similar // switch to literal mode - if (strlen($query) < $CONFIG->search_info['min_chars']) { + if (elgg_strlen($query) < $CONFIG->search_info['min_chars']) { $likes = array(); $query = sanitise_string($query); foreach ($fields as $field) { $likes[] = "$field LIKE '%$query%'"; } $likes_str = implode(' OR ', $likes); - //$where = "($table.guid = e.guid AND ($likes_str))"; $where = "($likes_str)"; } else { - // if using advanced or paired "s, switch into boolean mode - if ((isset($params['advanced_search']) && $params['advanced_search']) || substr_count($query, '"') >= 2 ) { + // if we're not using full text, rewrite the query for bool mode. + // exploiting a feature(ish) of bool mode where +-word is the same as -word + if (!$use_fulltext) { + $query = '+' . str_replace(' ', ' +', $query); + } + + // if using advanced, boolean operators, or paired "s, switch into boolean mode + $booleans_used = preg_match("/([\-\+~])([\w]+)/i", $query); + $advanced_search = (isset($params['advanced_search']) && $params['advanced_search']); + $quotes_used = (elgg_substr_count($query, '"') >= 2); + + if (!$use_fulltext || $booleans_used || $advanced_search || $quotes_used) { $options = 'IN BOOLEAN MODE'; } else { - // natural language mode is default and this keyword isn't supported - // in < 5.1 + // natural language mode is default and this keyword isn't supported in < 5.1 //$options = 'IN NATURAL LANGUAGE MODE'; $options = ''; } - + // if short query, use query expansion. - if (strlen($query) < 6) { - //$options .= ' WITH QUERY EXPANSION'; - } + // @todo doesn't seem to be working well. +// if (elgg_strlen($query) < 5) { +// $options .= ' WITH QUERY EXPANSION'; +// } $query = sanitise_string($query); - // if query is shorter than the ft_min_word_len switch to literal mode. $fields_str = implode(',', $fields); - //$where = "($table.guid = e.guid AND (MATCH ($fields_str) AGAINST ('$query' $options)))"; $where = "(MATCH ($fields_str) AGAINST ('$query' $options))"; } return $where; } -function search_get_query_where_sql($table, $query) { - // if there are multiple "s or 's it's a literal string. -} +/** + * Returns ORDER BY sql for insertion into elgg_get_entities(). + * + * @param str $entities_table Prefix for entities table. + * @param str $type_table Prefix for the type table. + * @param str $sort ORDER BY part + * @param str $order ASC or DESC + * @return str + */ +function search_get_order_by_sql($entities_table, $type_table, $sort, $order) { -/** Register init system event **/ + $on = NULL; -register_elgg_event_handler('init','system','search_init');
\ No newline at end of file + switch ($sort) { + default: + case 'relevance': + // default is relevance descending. + // ascending relevancy is silly and complicated. + $on = ''; + break; + case 'created': + $on = "$entities_table.time_created"; + break; + case 'updated': + $on = "$entities_table.time_updated"; + break; + case 'action_on': + // @todo not supported yet in core + $on = ''; + break; + case 'alpha': + // @todo not support yet because both title + // and name columns are used for this depending + // on the entity, which we don't always know. >:O + break; + } + $order = strtolower($order); + if ($order != 'asc' && $order != 'desc') { + $order = 'DESC'; + } + + if ($on) { + $order_by = "$on $order"; + } else { + $order_by = ''; + } + + return $order_by; +} |
