1 files changed, 177 insertions, 303 deletions
diff --git a/mod/search/start.php b/mod/search/start.php
index 18b743cde..42366318a 100644
--- a/mod/search/start.php
+++ b/mod/search/start.php
@@ -40,7 +40,7 @@ function search_init() {
 	// can't use get_data() here because some servers don't have these globals set,
 	// which throws a db exception.
 	$r = mysql_query('SELECT @@ft_min_word_len as min, @@ft_max_word_len as max');
-	if ($word_lens = mysql_fetch_assoc($r)) {
+	if ($r && ($word_lens = mysql_fetch_assoc($r))) {
 		$CONFIG->search_info['min_chars'] = $word_lens['min'];
 		$CONFIG->search_info['max_chars'] = $word_lens['max'];
 	} else {
@@ -74,11 +74,8 @@ function search_page_handler($page) {
 }
 
 /**
- * Return a string with highlighted matched elements.
- * Checks for "s
- * Provides context for matched elements.
- * Will not return more than $max_length of full context.
- * Only highlights words
+ * Return a string with highlighted matched queries and relevant context
+ * Determines context based upon occurrence and distance of words from each other.
  *
  * @param unknown_type $haystack
  * @param unknown_type $need
@@ -86,358 +83,235 @@ function search_page_handler($page) {
  * @param unknown_type $max_length
  * @return unknown_type
  */
-function search_get_highlighted_relevant_substrings($haystack, $needle, $min_match_context = 15, $max_length = 500) {
+function search_get_highlighted_relevant_substrings($haystack, $query, $min_match_context = 30, $max_length = 300) {
 	global $CONFIG;
 	$haystack = strip_tags($haystack);
-	$haystack_lc = strtolower($haystack);
-//
-//	$haystack = "Like merge sort, quicksort can also be easily parallelized due to its "
-//		. "divide-and-conquer nature. Individual in-place partition operations are difficult "
-//		. "to parallelize, but once divided, different sections of the list can be sorted in parallel.  "
-//		. "If we have p processors, we can divide a list of n ele";
-//
-//	$needle = 'difficult to sort in parallel';
-
-	// for now don't worry about "s or boolean operators
-	$needle = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($needle)));
-	$words = explode(' ', $needle);
+	$haystack_length = elgg_strlen($haystack);
+	$haystack_lc = elgg_strtolower($haystack);
 
-	$min_chars = $CONFIG->search_info['min_chars'];
-	// if > ft_min_word == not running in literal mode.
-	if ($needle >= $min_chars) {
-		// clean out any words that are ignored by mysql
-		foreach ($words as $i => $word) {
-			if (strlen($word) < $min_chars) {
-				unset ($words[$i]);
-			}
-		}
-	}
+	$words = search_remove_ignored_words($query, 'array');
 
-	/*
+	// if the haystack fits within $max_length, return the entire haystack w/formatting immediately
+	if ($haystack_length <= $max_length) {
+		$return = search_highlight_words($words, $haystack);
 
-	$body_len = 250
-
-	$context = 5-30, 20-45, 75-100, 150
-
-	can pull out context either on:
-		one of each matching term
-		X # of highest matching terms
+		return $return;
+	}
 
 
-	*/
-	$substr_counts = array();
-	$str_pos = array();
-	// matrices for being and end context lengths.
-	// defaults to min context.  will add additional context later if needed
+	// get the starting positions and lengths for all matching words
 	$starts = array();
-	$stops = array();
-
-	// map the words to the starts and stops
-	$words_arg = array();
-	$context_count = 0;
-
-
-	// get the full count of matches.
+	$lengths = array();
 	foreach ($words as $word) {
-		$word = strtolower($word);
-		$count = substr_count($haystack, $word);
-		$word_len = strlen($word);
+		$word = elgg_strtolower($word);
+		$count = elgg_substr_count($haystack_lc, $word);
+		$word_len = elgg_strlen($word);
 
 		// find the start positions for the words
 		if ($count > 1) {
-			$str_pos[$word] = array();
 			$offset = 0;
-			while (FALSE !== $pos = strpos($haystack, $word, $offset)) {
-				$str_pos[$word][] = $pos;
-				$starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
-				$stops[] = $pos + $word_len + $min_match_context;
-				$words_arg[] = $word;
-				$context_count += $min_match_context + $word_len;
+			while (FALSE !== $pos = elgg_strpos($haystack_lc, $word, $offset)) {
+				$start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+				$starts[] = $start;
+				$stop = $pos + $word_len + $min_match_context;
+				$lengths[] = $stop - $start;
 				$offset += $pos + $word_len;
 			}
 		} else {
-			$pos = strpos($haystack, $word);
-			$str_pos[$word] = array($pos);
-			$starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
-			$stops[] = $pos + $word_len + $min_match_context;
-			$context_count += $min_match_context + $word_len;
-			$words_arg[] = $word;
-		}
-		$substr_counts[$word] = $count;
-	}
-
-	// sort by order of occurence
-	//krsort($substr_counts);
-	$full_count = array_sum($substr_counts);
-
-	// figure out what the context needs to be.
-	// take one of each matched phrase
-	// if there are any
-
-//
-//	var_dump($str_pos);
-//	var_dump($substr_counts);
-//	var_dump($context_count);
-
-
-	// sort to put them in order of occurence
-	asort($starts, SORT_NUMERIC);
-	asort($stops, SORT_NUMERIC);
-
-	// offset them correctly
-	$starts[] = 0;
-	$new_stops = array(0);
-	foreach ($stops as $i => $pos) {
-		$new_stops[$i+1] = $pos;
-	}
-	$stops = $new_stops;
-
-	$substrings = array();
-	$len = count($starts);
-
-	$starts = array_merge($starts);
-	$stops = array_merge($stops);
-
-	$offsets = array();
-	$limits = array();
-	$c = 0;
-	foreach ($starts as $i => $start) {
-		$stop = $stops[$i];
-		$offsets[$c] = $start;
-		$limits[$c] = $stop;
-
-		// never need the last one as it's just a displacing entry
-		if ($c+1 == count($starts)) {
-			break;
+			$pos = elgg_strpos($haystack_lc, $word);
+			$start = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+			$starts[] = $start;
+			$stop = $pos + $word_len + $min_match_context;
+			$lengths[] = $stop - $start;
 		}
-
-		if ($start - $stop < 0) {
-			//var_dump("Looking at c=$c & $start - $stop and going to unset {$limits[$c]}");
-			unset($offsets[$c]);
-			unset($limits[$c]);
-		}
-		$c++;
 	}
 
-	// reset indexes and remove placeholder elements.
-	$limits = array_merge($limits);
-	array_shift($limits);
-	$offsets = array_merge($offsets);
-	array_pop($offsets);
+	$offsets = search_consolidate_substrings($starts, $lengths);
 
-	// figure out if we need to adjust the offsets from the base
-	// this could result in overlapping summaries.
-	// might be nicer to just remove it.
-
-	$total_len = 0;
-	foreach ($offsets as $i => $offset) {
-		$total_len += $limits[$i] - $offset;
-	}
+	// figure out if we can adjust the offsets and lengths
+	// in order to return more context
+	$total_length = array_sum($offsets);
 
 	$add_length = 0;
 	if ($total_length < $max_length) {
-		$add_length = floor((($max_length - $total_len) / count($offsets)) / 2);
-	}
-
-	$lengths = array();
-	foreach ($offsets as $i => $offset) {
-		$limit = $limits[$i];
-		if ($offset == 0 && $add_length) {
-			$limit += $add_length;
-		} else {
-			$offset = $offset - $add_length;
+		$add_length = floor((($max_length - $total_length) / count($offsets)) / 2);
+
+		$starts = array();
+		$lengths = array();
+		foreach ($offsets as $offset => $length) {
+			$start = ($offset - $add_length > 0) ? $offset - $add_length : 0;
+			$length = $length + $add_length;
+			$starts[] = $start;
+			$lengths[] = $length;
 		}
-		$string = substr($haystack, $offset, $limit - $offset);
 
-		if ($offset != 0) {
-			$string = "...$string";
-		}
-
-		if ($limit + $offset >= strlen($haystack)) {
-			$string .= '...';
-		}
-
-		$substrings[] = $string;
-		$lengths[] = strlen($string);
+		$offsets = search_consolidate_substrings($starts, $lengths);
 	}
 
-	// sort by length of context.
-	asort($lengths);
+	// sort by order of string size descending (which is roughly
+	// the proximity of matched terms) so we can keep the
+	// substrings with terms closest together and discard
+	// the others as needed to fit within $max_length.
+	arsort($offsets);
 
-	$matched = '';
-	foreach ($lengths as $i => $len) {
-		$string = $substrings[$i];
+	$return_strs = array();
+	$total_length = 0;
+	foreach ($offsets as $start => $length) {
+		$string = trim(elgg_substr($haystack, $start, $length));
 
-		if (strlen($matched) + strlen($string) < $max_length) {
-			$matched .= $string;
+		// continue past if adding this substring exceeds max length
+		if ($total_length + $length > $max_length) {
+			continue;
 		}
-	}
 
-	$i = 1;
-	foreach ($words as $word) {
-		$search = "/($word)/i";
-		$replace = "<strong class=\"searchMatch searchMatchColor$i\">$1</strong>";
-		$matched = preg_replace($search, $replace, $matched);
-		$i++;
+		$total_length += $length;
+		$return_strs[$start] = $string;
 	}
 
-	return $matched;
-
-
-	// crap below..
-
-
-
-	for ($i=0; $i<$len; $i++) {
-		$start = $starts[$i];
-		$stop = $stops[$i];
-		var_dump("Looking at $i = $start - $stop");
-
-		while ($start - $stop <= 0) {
-			$stop = $stops[$i++];
-			var_dump("New start is $stop");
-		}
+	// put the strings in order of occurrence
+	ksort($return_strs);
 
-		var_dump("$start-$stop");
+	// add ...s where needed
+	$return = implode('...', $return_strs);
+	if (!array_key_exists(0, $return_strs)) {
+		$return = "...$return";
 	}
 
-	// find the intersecting contexts
-	foreach ($starts as $i => $start_pos) {
-		$words .= "{$words_arg[$i]}\t\t\t";
-		echo "$start_pos\t\t\t";
+	// add to end of string if last substring doesn't hit the end.
+	$starts = array_keys($return_strs);
+	$last_pos = $starts[count($starts)-1];
+	if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) {
+		$return .= '...';
 	}
 
-	echo "\n";
+	$return = search_highlight_words($words, $return);
 
-	foreach ($stops as $i => $stop_pos) {
-		echo "$stop_pos\t\t\t";
-	}
-echo "\n$words\n";
-
-	// get full number of matches against all words to see how many we actually want to look at.
-
-
-
-
-//	$desc = search_get_relevant_substring($entity->description, $params['query'], '<strong class="searchMatch">', '</strong>');
-
-
-	$params['query'];
-	// "this is"just a test "silly person"
+	return $return;
+}
 
-	// check for "s
-	$words_quotes = explode('"', $needle);
 
-	$words_orig = explode(' ', $needle);
-	$words = array();
+/**
+ * Takes an array of offsets and lengths and consolidates any
+ * overlapping entries, returning an array of new offsets and lengths
+ *
+ * Offsets and lengths are specified in separate arrays because of possible
+ * index collisions with the offsets.
+ *
+ * @param array $offsets
+ * @param array $lengths
+ * @return array
+ */
+function search_consolidate_substrings($offsets, $lengths) {
+	// sort offsets by occurrence
+	asort($offsets, SORT_NUMERIC);
 
-	foreach ($words_orig as $i => $word) {
-		// figure out if we have a special operand
-		$operand = substr($word, 0, 1);
-		switch($operand) {
-			case '"':
-				// find the matching " if any.  else, remove the "
-				if (substr_count($query, '"') < 2) {
-					$words[] = substr($word, 1);
-				} else {
-					$word = substr($word, 1);
-					$word_i = $i;
-					while ('"' != strpos($words_orig[$word_i], '"')) {
-						$word .= " {$words_orig[$word_i]}";
-						unset($words_orig[$word_i]);
-					}
-				}
+	// reset the indexes maintaining association with the original offsets.
+	$offsets = array_merge($offsets);
 
-				break;
+	$new_lengths = array();
+	foreach ($offsets as $i => $offset) {
+		$new_lengths[] = $lengths[$i];
+	}
 
-			case '+':
-				// remove +
-				$words[] = substr($word, 1);
-				break;
+	$lengths = $new_lengths;
 
-			case '~':
-			case '-':
-				// remove this from highlighted list.
+	$return = array();
+	$count = count($offsets);
+	for ($i=0; $i<$count; $i++) {
+		$offset = $offsets[$i];
+		$length = $lengths[$i];
+		$end_pos = $offset + $length;
 
+		// find the next entry that doesn't overlap
+		while(array_key_exists($i+1, $offsets) && $end_pos > $offsets[$i+1]) {
+			$i++;
+			if (!array_key_exists($i, $offsets)) {
 				break;
+			}
+			$end_pos = $lengths[$i] + $offsets[$i];
 		}
-	}
 
-	// pick out " queries
-	if (substr_count($query, '"') >= 2) {
+		$length = $end_pos - $offset;
 
+		// will never have a colliding offset, so can return as a single array
+		$return[$offset] = $length;
 	}
 
-	// ignore queries starting with -
-
-
-	// @todo figure out a way to "center" the matches within the max_length.
-	// if only one match, its context is $context + $max_length / 2
-	// if 2 matches, its context is $context + $max_length / 4
-	// if 3 matches, its context is $context + $max_length / 6
-	// $context per match = $min_match_context + ($max_length / $num_count_match)
-
-	// if $max_length / ($matched_count * 2) < $context
-	// only match against the first X matches where $context >= $context
+	return $return;
 }
 
 /**
- * Returns a matching string with $context amount of context, optionally
- * surrounded by $before and $after.
- *
- * If no match is found, restricts string to $context*2 starting from strpos 0.
+ * Safely highlights the words in $words found in $string avoiding recursion
  *
- * @param str $haystack
- * @param str $needle
- * @param str $before
- * @param str $after
- * @param int $context
- * @return str
+ * @param array $words
+ * @param string $string
+ * @return string
  */
-function search_get_relevant_substring($haystack, $needle, $before = '', $after = '', $context = 75) {
-	$haystack = strip_tags($haystack);
-	$needle = strip_tags($needle);
-
-	$pos = strpos(strtolower($haystack), strtolower($needle));
+function search_highlight_words($words, $string) {
+	$i = 1;
+	$replace_html = array(
+		'strong' => rand(10000,99999),
+		'class' => rand(10000,99999),
+		'searchMatch' => rand(10000,99999),
+		'searchMatchColor' => rand(10000,99999)
+	);
 
-	if ($pos === FALSE) {
-		$str = substr($haystack, 0, $context*2);
-		if (strlen($haystack) > $context*2) {
-			$str .= '...';
-		}
+	foreach ($words as $word) {
+		$search = "/($word)/i";
 
-		return $str;
+		// must replace with placeholders in case one of the search terms is
+		// in the html string.
+		// later, will replace the placeholders with the actual html.
+		// Yeah this is hacky.  I'm tired.
+		$strong = $replace_html['strong'];
+		$class = $replace_html['class'];
+		$searchMatch = $replace_html['searchMatch'];
+		$searchMatchColor = $replace_html['searchMatchColor'];
+
+		$replace = "<$strong $class=\"$searchMatch $searchMatchColor{$i}\">$1</$strong>";
+		$string = preg_replace($search, $replace, $string);
+		$i++;
 	}
 
-	$start_pos = $pos - $context;
-
-	if ($start_pos < 0) {
-		$start_pos = 0;
+	foreach ($replace_html as $replace => $search) {
+		$string = str_replace($search, $replace, $string);
 	}
 
-	// get string from -context to +context
-	$matched = substr($haystack, $start_pos, $context*2);
+	return $string;
+}
 
-	// add elipses to front.
-	if ($start_pos > 0) {
-		$matched = "...$matched";
-	}
+/**
+ * Returns a query with stop and too short words removed.
+ * (Unless the entire query is < ft_min_word_chars, in which case
+ * it's taken literally.)
+ *
+ * @param string $query
+ * @param str $format Return as an array or a string
+ * @return mixed
+ */
+function search_remove_ignored_words($query, $format = 'array') {
+	global $CONFIG;
+
+	// don't worry about "s or boolean operators
+	$query = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($query)));
+	$words = explode(' ', $query);
 
-	// add elipses to end.
-	if ($pos + strlen($needle) + $context*2 < strlen($haystack)) {
-		$matched = "$matched...";
+	$min_chars = $CONFIG->search_info['min_chars'];
+	// if > ft_min_word we're not running in literal mode.
+	if ($query >= $min_chars) {
+		// clean out any words that are ignored by mysql
+		foreach ($words as $i => $word) {
+			if (elgg_strlen($word) < $min_chars) {
+				unset ($words[$i]);
+			}
+		}
 	}
 
-	// surround if needed
-	// @todo would getting each position of the match then
-	// inserting manually based on the position be faster than preg_replace()?
-	if ($before || $after) {
-		$matched = str_ireplace($needle, $before . $needle . $after, $matched);
-		//$matched = mb_ereg_replace("")
-		// insert before
+	if ($format == 'string') {
+		return implode(' ', $words);
 	}
 
-	return $matched;
+	return $words;
 }
 
 
@@ -498,7 +372,7 @@ function search_get_listing_html($entities, $count, $params) {
  * @param array $params Original search params
  * @return str
  */
-function search_get_where_sql($table, $fields, $params) {
+function search_get_where_sql($table, $fields, $params, $use_fulltext = TRUE) {
 	global $CONFIG;
 	$query = $params['query'];
 
@@ -507,49 +381,49 @@ function search_get_where_sql($table, $fields, $params) {
 		$fields[$i] = "$table.$field";
 	}
 
+	// if we're not using full text, rewrite the query for bool mode.
+	// exploiting a feature(ish) of bool mode where +-word is the same as -word
+	if (!$use_fulltext) {
+		$query = '+' . str_replace(' ', ' +', $query);
+	}
+
 	// if query is shorter than the min for fts words
 	// it's likely a single acronym or similar
 	// switch to literal mode
-	if (strlen($query) < $CONFIG->search_info['min_chars']) {
+	if (elgg_strlen($query) < $CONFIG->search_info['min_chars']) {
 		$likes = array();
 		$query = sanitise_string($query);
 		foreach ($fields as $field) {
 			$likes[] = "$field LIKE '%$query%'";
 		}
 		$likes_str = implode(' OR ', $likes);
-		//$where = "($table.guid = e.guid AND	($likes_str))";
 		$where = "($likes_str)";
 	} else {
 		// if using advanced or paired "s, switch into boolean mode
-		if ((isset($params['advanced_search']) && $params['advanced_search']) || substr_count($query, '"') >= 2 ) {
+		if (!$use_fulltext
+		|| (isset($params['advanced_search']) && $params['advanced_search'])
+		|| elgg_substr_count($query, '"') >= 2 ) {
 			$options = 'IN BOOLEAN MODE';
 		} else {
-			// natural language mode is default and this keyword isn't supported
-			// in < 5.1
+			// natural language mode is default and this keyword isn't supported in < 5.1
 			//$options = 'IN NATURAL LANGUAGE MODE';
 			$options = '';
 		}
 
 		// if short query, use query expansion.
-		if (strlen($query) < 6) {
+		// @todo doesn't seem to be working well.
+		if (elgg_strlen($query) < 5) {
 			//$options .= ' WITH QUERY EXPANSION';
 		}
 		$query = sanitise_string($query);
 
-		// if query is shorter than the ft_min_word_len switch to literal mode.
 		$fields_str = implode(',', $fields);
-		//$where = "($table.guid = e.guid AND (MATCH ($fields_str) AGAINST ('$query' $options)))";
 		$where = "(MATCH ($fields_str) AGAINST ('$query' $options))";
 	}
 
 	return $where;
 }
 
-function search_get_query_where_sql($table, $query) {
-	// if there are multiple "s or 's it's a literal string.
-
-}
-
 /** Register init system event **/
 
 register_elgg_event_handler('init','system','search_init');
 \ No newline at end of file