aboutsummaryrefslogtreecommitdiff
path: root/mod/search/start.php
diff options
context:
space:
mode:
Diffstat (limited to 'mod/search/start.php')
-rw-r--r--mod/search/start.php480
1 files changed, 177 insertions, 303 deletions
diff --git a/mod/search/start.php b/mod/search/start.php
index 18b743cde..42366318a 100644
--- a/mod/search/start.php
+++ b/mod/search/start.php
@@ -40,7 +40,7 @@ function search_init() {
// can't use get_data() here because some servers don't have these globals set,
// which throws a db exception.
$r = mysql_query('SELECT @@ft_min_word_len as min, @@ft_max_word_len as max');
- if ($word_lens = mysql_fetch_assoc($r)) {
+ if ($r && ($word_lens = mysql_fetch_assoc($r))) {
$CONFIG->search_info['min_chars'] = $word_lens['min'];
$CONFIG->search_info['max_chars'] = $word_lens['max'];
} else {
@@ -74,11 +74,8 @@ function search_page_handler($page) {
}
/**
- * Return a string with highlighted matched elements.
- * Checks for "s
- * Provides context for matched elements.
- * Will not return more than $max_length of full context.
- * Only highlights words
+ * Return a string with highlighted matched queries and relevant context
+ * Determins context based upon occurance and distance of words with each other.
*
* @param unknown_type $haystack
* @param unknown_type $need
@@ -86,358 +83,235 @@ function search_page_handler($page) {
* @param unknown_type $max_length
* @return unknown_type
*/
-function search_get_highlighted_relevant_substrings($haystack, $needle, $min_match_context = 15, $max_length = 500) {
+function search_get_highlighted_relevant_substrings($haystack, $query, $min_match_context = 30, $max_length = 300) {
global $CONFIG;
$haystack = strip_tags($haystack);
- $haystack_lc = strtolower($haystack);
-//
-// $haystack = "Like merge sort, quicksort can also be easily parallelized due to its "
-// . "divide-and-conquer nature. Individual in-place partition operations are difficult "
-// . "to parallelize, but once divided, different sections of the list can be sorted in parallel. "
-// . "If we have p processors, we can divide a list of n ele";
-//
-// $needle = 'difficult to sort in parallel';
-
- // for now don't worry about "s or boolean operators
- $needle = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($needle)));
- $words = explode(' ', $needle);
+ $haystack_length = elgg_strlen($haystack);
+ $haystack_lc = elgg_strtolower($haystack);
- $min_chars = $CONFIG->search_info['min_chars'];
- // if > ft_min_word == not running in literal mode.
- if ($needle >= $min_chars) {
- // clean out any words that are ignored by mysql
- foreach ($words as $i => $word) {
- if (strlen($word) < $min_chars) {
- unset ($words[$i]);
- }
- }
- }
+ $words = search_remove_ignored_words($query, 'array');
- /*
+ // if haystack < $max_length return the entire haystack w/formatting immediately
+ if ($haystack_length <= $max_length) {
+ $return = search_highlight_words($words, $haystack);
- $body_len = 250
-
- $context = 5-30, 20-45, 75-100, 150
-
- can pull out context either on:
- one of each matching term
- X # of highest matching terms
+ return $return;
+ }
- */
- $substr_counts = array();
- $str_pos = array();
- // matrices for being and end context lengths.
- // defaults to min context. will add additional context later if needed
+ // get the starting positions and lengths for all matching words
$starts = array();
- $stops = array();
-
- // map the words to the starts and stops
- $words_arg = array();
- $context_count = 0;
-
-
- // get the full count of matches.
+ $lengths = array();
foreach ($words as $word) {
- $word = strtolower($word);
- $count = substr_count($haystack, $word);
- $word_len = strlen($word);
+ $word = elgg_strtolower($word);
+ $count = elgg_substr_count($haystack_lc, $word);
+ $word_len = elgg_strlen($word);
// find the start positions for the words
if ($count > 1) {
- $str_pos[$word] = array();
$offset = 0;
- while (FALSE !== $pos = strpos($haystack, $word, $offset)) {
- $str_pos[$word][] = $pos;
- $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
- $stops[] = $pos + $word_len + $min_match_context;
- $words_arg[] = $word;
- $context_count += $min_match_context + $word_len;
+ while (FALSE !== $pos = elgg_strpos($haystack_lc, $word, $offset)) {
+ $start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+ $starts[] = $start;
+ $stop = $pos + $word_len + $min_match_context;
+ $lengths[] = $stop - $start;
$offset += $pos + $word_len;
}
} else {
- $pos = strpos($haystack, $word);
- $str_pos[$word] = array($pos);
- $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
- $stops[] = $pos + $word_len + $min_match_context;
- $context_count += $min_match_context + $word_len;
- $words_arg[] = $word;
- }
- $substr_counts[$word] = $count;
- }
-
- // sort by order of occurence
- //krsort($substr_counts);
- $full_count = array_sum($substr_counts);
-
- // figure out what the context needs to be.
- // take one of each matched phrase
- // if there are any
-
-//
-// var_dump($str_pos);
-// var_dump($substr_counts);
-// var_dump($context_count);
-
-
- // sort to put them in order of occurence
- asort($starts, SORT_NUMERIC);
- asort($stops, SORT_NUMERIC);
-
- // offset them correctly
- $starts[] = 0;
- $new_stops = array(0);
- foreach ($stops as $i => $pos) {
- $new_stops[$i+1] = $pos;
- }
- $stops = $new_stops;
-
- $substrings = array();
- $len = count($starts);
-
- $starts = array_merge($starts);
- $stops = array_merge($stops);
-
- $offsets = array();
- $limits = array();
- $c = 0;
- foreach ($starts as $i => $start) {
- $stop = $stops[$i];
- $offsets[$c] = $start;
- $limits[$c] = $stop;
-
- // never need the last one as it's just a displacing entry
- if ($c+1 == count($starts)) {
- break;
+ $pos = elgg_strpos($haystack_lc, $word);
+ $start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+ $starts[] = $start;
+ $stop = $pos + $word_len + $min_match_context;
+ $lengths[] = $stop - $start;
}
-
- if ($start - $stop < 0) {
- //var_dump("Looking at c=$c & $start - $stop and going to unset {$limits[$c]}");
- unset($offsets[$c]);
- unset($limits[$c]);
- }
- $c++;
}
- // reset indexes and remove placeholder elements.
- $limits = array_merge($limits);
- array_shift($limits);
- $offsets = array_merge($offsets);
- array_pop($offsets);
+ $offsets = search_consolidate_substrings($starts, $lengths);
- // figure out if we need to adjust the offsets from the base
- // this could result in overlapping summaries.
- // might be nicer to just remove it.
-
- $total_len = 0;
- foreach ($offsets as $i => $offset) {
- $total_len += $limits[$i] - $offset;
- }
+ // figure out if we can adjust the offsets and lengths
+ // in order to return more context
+ $total_length = array_sum($offsets);
$add_length = 0;
if ($total_length < $max_length) {
- $add_length = floor((($max_length - $total_len) / count($offsets)) / 2);
- }
-
- $lengths = array();
- foreach ($offsets as $i => $offset) {
- $limit = $limits[$i];
- if ($offset == 0 && $add_length) {
- $limit += $add_length;
- } else {
- $offset = $offset - $add_length;
+ $add_length = floor((($max_length - $total_length) / count($offsets)) / 2);
+
+ $starts = array();
+ $lengths = array();
+ foreach ($offsets as $offset => $length) {
+ $start = ($offset - $add_length > 0) ? $offset - $add_length : 0;
+ $length = $length + $add_length;
+ $starts[] = $start;
+ $lengths[] = $length;
}
- $string = substr($haystack, $offset, $limit - $offset);
- if ($offset != 0) {
- $string = "...$string";
- }
-
- if ($limit + $offset >= strlen($haystack)) {
- $string .= '...';
- }
-
- $substrings[] = $string;
- $lengths[] = strlen($string);
+ $offsets = search_consolidate_substrings($starts, $lengths);
}
- // sort by length of context.
- asort($lengths);
+ // sort by order of string size descending (which is roughly
+ // the proximity of matched terms) so we can keep the
+ // substrings with terms closest together and discard
+ // the others as needed to fit within $max_length.
+ arsort($offsets);
- $matched = '';
- foreach ($lengths as $i => $len) {
- $string = $substrings[$i];
+ $return_strs = array();
+ $total_length = 0;
+ foreach ($offsets as $start => $length) {
+ $string = trim(elgg_substr($haystack, $start, $length));
- if (strlen($matched) + strlen($string) < $max_length) {
- $matched .= $string;
+ // continue past if adding this substring exceeds max length
+ if ($total_length + $length > $max_length) {
+ continue;
}
- }
- $i = 1;
- foreach ($words as $word) {
- $search = "/($word)/i";
- $replace = "<strong class=\"searchMatch searchMatchColor$i\">$1</strong>";
- $matched = preg_replace($search, $replace, $matched);
- $i++;
+ $total_length += $length;
+ $return_strs[$start] = $string;
}
- return $matched;
-
-
- // crap below..
-
-
-
- for ($i=0; $i<$len; $i++) {
- $start = $starts[$i];
- $stop = $stops[$i];
- var_dump("Looking at $i = $start - $stop");
-
- while ($start - $stop <= 0) {
- $stop = $stops[$i++];
- var_dump("New start is $stop");
- }
+ // put the strings in order of occurence
+ ksort($return_strs);
- var_dump("$start-$stop");
+ // add ...s where needed
+ $return = implode('...', $return_strs);
+ if (!array_key_exists(0, $return_strs)) {
+ $return = "...$return";
}
- // find the intersecting contexts
- foreach ($starts as $i => $start_pos) {
- $words .= "{$words_arg[$i]}\t\t\t";
- echo "$start_pos\t\t\t";
+ // add to end of string if last substring doesn't hit the end.
+ $starts = array_keys($return_strs);
+ $last_pos = $starts[count($starts)-1];
+ if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) {
+ $return .= '...';
}
- echo "\n";
+ $return = search_highlight_words($words, $return);
- foreach ($stops as $i => $stop_pos) {
- echo "$stop_pos\t\t\t";
- }
-echo "\n$words\n";
-
- // get full number of matches against all words to see how many we actually want to look at.
-
-
-
-
-// $desc = search_get_relevant_substring($entity->description, $params['query'], '<strong class="searchMatch">', '</strong>');
-
-
- $params['query'];
- // "this is"just a test "silly person"
+ return $return;
+}
- // check for "s
- $words_quotes = explode('"', $needle);
- $words_orig = explode(' ', $needle);
- $words = array();
+/**
+ * Takes an array of offsets and lengths and consolidates any
+ * overlapping entries, returning an array of new offsets and lengths
+ *
+ * Offsets and lengths are specified in separate arrays because of possible
+ * index collisions with the offsets.
+ *
+ * @param array $offsets
+ * @param array $lengths
+ * @return array
+ */
+function search_consolidate_substrings($offsets, $lengths) {
+ // sort offsets by occurence
+ asort($offsets, SORT_NUMERIC);
- foreach ($words_orig as $i => $word) {
- // figure out if we have a special operand
- $operand = substr($word, 0, 1);
- switch($operand) {
- case '"':
- // find the matching " if any. else, remove the "
- if (substr_count($query, '"') < 2) {
- $words[] = substr($word, 1);
- } else {
- $word = substr($word, 1);
- $word_i = $i;
- while ('"' != strpos($words_orig[$word_i], '"')) {
- $word .= " {$words_orig[$word_i]}";
- unset($words_orig[$word_i]);
- }
- }
+ // reset the indexes maintaining association with the original offsets.
+ $offsets = array_merge($offsets);
- break;
+ $new_lengths = array();
+ foreach ($offsets as $i => $offset) {
+ $new_lengths[] = $lengths[$i];
+ }
- case '+':
- // remove +
- $words[] = substr($word, 1);
- break;
+ $lengths = $new_lengths;
- case '~':
- case '-':
- // remove this from highlighted list.
+ $return = array();
+ $count = count($offsets);
+ for ($i=0; $i<$count; $i++) {
+ $offset = $offsets[$i];
+ $length = $lengths[$i];
+ $end_pos = $offset + $length;
+ // find the next entry that doesn't overlap
+ while(array_key_exists($i+1, $offsets) && $end_pos > $offsets[$i+1]) {
+ $i++;
+ if (!array_key_exists($i, $offsets)) {
break;
+ }
+ $end_pos = $lengths[$i] + $offsets[$i];
}
- }
- // pick out " queries
- if (substr_count($query, '"') >= 2) {
+ $length = $end_pos - $offset;
+ // will never have a colliding offset, so can return as a single array
+ $return[$offset] = $length;
}
- // ignore queries starting with -
-
-
- // @todo figure out a way to "center" the matches within the max_length.
- // if only one match, its context is $context + $max_length / 2
- // if 2 matches, its context is $context + $max_length / 4
- // if 3 matches, its context is $context + $max_length / 6
- // $context per match = $min_match_context + ($max_length / $num_count_match)
-
- // if $max_length / ($matched_count * 2) < $context
- // only match against the first X matches where $context >= $context
+ return $return;
}
/**
- * Returns a matching string with $context amount of context, optionally
- * surrounded by $before and $after.
- *
- * If no match is found, restricts string to $context*2 starting from strpos 0.
+ * Safely highlights the words in $words found in $string avoiding recursion
*
- * @param str $haystack
- * @param str $needle
- * @param str $before
- * @param str $after
- * @param int $context
- * @return str
+ * @param array $words
+ * @param string $string
+ * @return string
*/
-function search_get_relevant_substring($haystack, $needle, $before = '', $after = '', $context = 75) {
- $haystack = strip_tags($haystack);
- $needle = strip_tags($needle);
-
- $pos = strpos(strtolower($haystack), strtolower($needle));
+function search_highlight_words($words, $string) {
+ $i = 1;
+ $replace_html = array(
+ 'strong' => rand(10000,99999),
+ 'class' => rand(10000,99999),
+ 'searchMatch' => rand(10000,99999),
+ 'searchMatchColor' => rand(10000,99999)
+ );
- if ($pos === FALSE) {
- $str = substr($haystack, 0, $context*2);
- if (strlen($haystack) > $context*2) {
- $str .= '...';
- }
+ foreach ($words as $word) {
+ $search = "/($word)/i";
- return $str;
+ // must replace with placeholders in case one of the search terms is
+ // in the html string.
+ // later, will replace the placeholders with the actual html.
+ // Yeah this is hacky. I'm tired.
+ $strong = $replace_html['strong'];
+ $class = $replace_html['class'];
+ $searchMatch = $replace_html['searchMatch'];
+ $searchMatchColor = $replace_html['searchMatchColor'];
+
+ $replace = "<$strong $class=\"$searchMatch $searchMatchColor{$i}\">$1</$strong>";
+ $string = preg_replace($search, $replace, $string);
+ $i++;
}
- $start_pos = $pos - $context;
-
- if ($start_pos < 0) {
- $start_pos = 0;
+ foreach ($replace_html as $replace => $search) {
+ $string = str_replace($search, $replace, $string);
}
- // get string from -context to +context
- $matched = substr($haystack, $start_pos, $context*2);
+ return $string;
+}
- // add elipses to front.
- if ($start_pos > 0) {
- $matched = "...$matched";
- }
+/**
+ * Returns a query with stop and too short words removed.
+ * (Unless the entire query is < ft_min_word_chars, in which case
+ * it's taken literally.)
+ *
+ * @param array $query
+ * @param str $format Return as an array or a string
+ * @return mixed
+ */
+function search_remove_ignored_words($query, $format = 'array') {
+ global $CONFIG;
+
+ // don't worry about "s or boolean operators
+ $query = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($query)));
+ $words = explode(' ', $query);
- // add elipses to end.
- if ($pos + strlen($needle) + $context*2 < strlen($haystack)) {
- $matched = "$matched...";
+ $min_chars = $CONFIG->search_info['min_chars'];
+ // if > ft_min_word we're not running in literal mode.
+ if ($query >= $min_chars) {
+ // clean out any words that are ignored by mysql
+ foreach ($words as $i => $word) {
+ if (elgg_strlen($word) < $min_chars) {
+ unset ($words[$i]);
+ }
+ }
}
- // surround if needed
- // @todo would getting each position of the match then
- // inserting manually based on the position be faster than preg_replace()?
- if ($before || $after) {
- $matched = str_ireplace($needle, $before . $needle . $after, $matched);
- //$matched = mb_ereg_replace("")
- // insert before
+ if ($format == 'string') {
+ return implode(' ', $words);
}
- return $matched;
+ return $words;
}
@@ -498,7 +372,7 @@ function search_get_listing_html($entities, $count, $params) {
* @param array $params Original search params
* @return str
*/
-function search_get_where_sql($table, $fields, $params) {
+function search_get_where_sql($table, $fields, $params, $use_fulltext = TRUE) {
global $CONFIG;
$query = $params['query'];
@@ -507,49 +381,49 @@ function search_get_where_sql($table, $fields, $params) {
$fields[$i] = "$table.$field";
}
+ // if we're not using full text, rewrite the query for bool mode.
+ // exploiting a feature(ish) of bool mode where +-word is the same as -word
+ if (!$use_fulltext) {
+ $query = '+' . str_replace(' ', ' +', $query);
+ }
+
// if query is shorter than the min for fts words
// it's likely a single acronym or similar
// switch to literal mode
- if (strlen($query) < $CONFIG->search_info['min_chars']) {
+ if (elgg_strlen($query) < $CONFIG->search_info['min_chars']) {
$likes = array();
$query = sanitise_string($query);
foreach ($fields as $field) {
$likes[] = "$field LIKE '%$query%'";
}
$likes_str = implode(' OR ', $likes);
- //$where = "($table.guid = e.guid AND ($likes_str))";
$where = "($likes_str)";
} else {
// if using advanced or paired "s, switch into boolean mode
- if ((isset($params['advanced_search']) && $params['advanced_search']) || substr_count($query, '"') >= 2 ) {
+ if (!$use_fulltext
+ || (isset($params['advanced_search']) && $params['advanced_search'])
+ || elgg_substr_count($query, '"') >= 2 ) {
$options = 'IN BOOLEAN MODE';
} else {
- // natural language mode is default and this keyword isn't supported
- // in < 5.1
+ // natural language mode is default and this keyword isn't supported in < 5.1
//$options = 'IN NATURAL LANGUAGE MODE';
$options = '';
}
// if short query, use query expansion.
- if (strlen($query) < 6) {
+ // @todo doesn't seem to be working well.
+ if (elgg_strlen($query) < 5) {
//$options .= ' WITH QUERY EXPANSION';
}
$query = sanitise_string($query);
- // if query is shorter than the ft_min_word_len switch to literal mode.
$fields_str = implode(',', $fields);
- //$where = "($table.guid = e.guid AND (MATCH ($fields_str) AGAINST ('$query' $options)))";
$where = "(MATCH ($fields_str) AGAINST ('$query' $options))";
}
return $where;
}
-function search_get_query_where_sql($table, $query) {
- // if there are multiple "s or 's it's a literal string.
-
-}
-
/** Register init system event **/
register_elgg_event_handler('init','system','search_init'); \ No newline at end of file