aboutsummaryrefslogtreecommitdiff
path: root/mod/search/start.php
diff options
context:
space:
mode:
author: brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544> 2009-11-10 03:48:41 +0000
committer: brettp <brettp@36083f99-b078-4883-b0ff-0f9b5a30f544> 2009-11-10 03:48:41 +0000
commit: 12779ad9e454aa4872bf564849f94db0081a7cd9 (patch)
tree: 6d690da07feaeb7e7f66beb3680db63fc1e685d9 /mod/search/start.php
parent: fcaf7ad2e618deb50cbe48d2a4f8256e87878761 (diff)
download: elgg-12779ad9e454aa4872bf564849f94db0081a7cd9.tar.gz
elgg-12779ad9e454aa4872bf564849f94db0081a7cd9.tar.bz2
Trying out the new context grabbing function in search.
git-svn-id: http://code.elgg.org/elgg/trunk@3652 36083f99-b078-4883-b0ff-0f9b5a30f544
Diffstat (limited to 'mod/search/start.php')
-rw-r--r--mod/search/start.php180
1 files changed, 173 insertions, 7 deletions
diff --git a/mod/search/start.php b/mod/search/start.php
index aa76c13b2..a53cebbb6 100644
--- a/mod/search/start.php
+++ b/mod/search/start.php
@@ -87,8 +87,16 @@ function search_page_handler($page) {
* @return unknown_type
*/
function search_get_highlighted_relevant_substrings($haystack, $needle, $min_match_context = 15, $max_length = 250) {
+ global $CONFIG;
$haystack = strip_tags($haystack);
$haystack_lc = strtolower($haystack);
+//
+// $haystack = "Like merge sort, quicksort can also be easily parallelized due to its "
+// . "divide-and-conquer nature. Individual in-place partition operations are difficult "
+// . "to parallelize, but once divided, different sections of the list can be sorted in parallel. "
+// . "If we have p processors, we can divide a list of n ele";
+//
+// $needle = 'difficult to sort in parallel';
// for now don't worry about "s or boolean operators
$needle = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($needle)));
@@ -105,8 +113,30 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat
}
}
+ /*
+
+ $body_len = 250
+
+ $context = 5-30, 20-45, 75-100, 150
+
+ can pull out context either on:
+ one of each matching term
+ X # of highest matching terms
+
+
+ */
$substr_counts = array();
$str_pos = array();
+ // matrices for beginning and end context lengths.
+ // defaults to min context. will add additional context later if needed
+ $starts = array();
+ $stops = array();
+
+ // map the words to the starts and stops
+ $words_arg = array();
+ $context_count = 0;
+
+
// get the full count of matches.
foreach ($words as $word) {
$word = strtolower($word);
@@ -114,29 +144,163 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat
$word_len = strlen($word);
// find the start positions for the words
- // get the context for words based upon
if ($count > 1) {
$str_pos[$word] = array();
$offset = 0;
while (FALSE !== $pos = strpos($haystack, $word, $offset)) {
$str_pos[$word][] = $pos;
+ $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+ $stops[] = $pos + $word_len + $min_match_context;
+ $words_arg[] = $word;
+ $context_count += $min_match_context + $word_len;
$offset += $pos + $word_len;
}
} else {
- $str_pos[$word] = array(strpos($haystack, $word));
+ $pos = strpos($haystack, $word);
+ $str_pos[$word] = array($pos);
+ $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0;
+ $stops[] = $pos + $word_len + $min_match_context;
+ $context_count += $min_match_context + $word_len;
+ $words_arg[] = $word;
}
$substr_counts[$word] = $count;
}
-//A test with multiple words and now more in the subject too because words need to be everywhere
-
// sort by order of occurrence
- krsort($substr_counts);
+ //krsort($substr_counts);
$full_count = array_sum($substr_counts);
+ // figure out what the context needs to be.
+ // take one of each matched phrase
+ // if there are any
+
+//
+// var_dump($str_pos);
+// var_dump($substr_counts);
+// var_dump($context_count);
+
+
+ // sort to put them in order of occurrence
+ asort($starts, SORT_NUMERIC);
+ asort($stops, SORT_NUMERIC);
+
+ // offset them correctly
+ $starts[] = 0;
+ $new_stops = array(0);
+ foreach ($stops as $i => $pos) {
+ $new_stops[$i+1] = $pos;
+ }
+ $stops = $new_stops;
+
+ $substrings = array();
+ $len = count($starts);
+
+ $starts = array_merge($starts);
+ $stops = array_merge($stops);
+
+ $offsets = array();
+ $limits = array();
+ $c = 0;
+ foreach ($starts as $i => $start) {
+ $stop = $stops[$i];
+ $offsets[$c] = $start;
+ $limits[$c] = $stop;
+
+ // never need the last one as it's just a displacing entry
+ if ($c+1 == count($starts)) {
+ break;
+ }
+
+ if ($start - $stop < 0) {
+ //var_dump("Looking at c=$c & $start - $stop and going to unset {$limits[$c]}");
+ unset($offsets[$c]);
+ unset($limits[$c]);
+ }
+ $c++;
+ }
+
+ // reset indexes and remove placeholder elements.
+ $limits = array_merge($limits);
+ array_shift($limits);
+ $offsets = array_merge($offsets);
+ array_pop($offsets);
+
+ // figure out if we need to adjust the offsets from the base
+ // this could result in overlapping summaries.
+ // might be nicer to just remove it.
+
+ $total_len = 0;
+ foreach ($offsets as $i => $offset) {
+ $total_len += $limits[$i] - $offset;
+ }
+
+ $add_length = 0;
+ if ($total_length < $max_length) {
+ $add_length = floor((($max_length - $total_len) / count($offsets)) / 2);
+ }
+
+
+ foreach ($offsets as $i => $offset) {
+ $limit = $limits[$i];
+ if ($offset == 0 && $add_length) {
+ $limit += $add_length;
+ } else {
+ $offset = $offset - $add_length;
+ }
+ $string = substr($haystack, $offset, $limit - $offset);
+
+ if ($limit-$offset < strlen($haystack)) {
+ $string = "$string...";
+ }
+
+ $substrings[] = $string;
+ }
+ $matched = '';
+ foreach ($substrings as $string) {
+ if (strlen($matched) + strlen($string) < $max_length) {
+ $matched .= $string;
+ }
+ }
+
+ foreach ($words as $word) {
+ $search = "/($word)/i";
+ $replace = "<strong class=\"searchMatch\">$1</strong>";
+ $matched = preg_replace($search, $replace, $matched);
+ }
+
+ return $matched;
+ // crap below..
+
+
+
+ for ($i=0; $i<$len; $i++) {
+ $start = $starts[$i];
+ $stop = $stops[$i];
+ var_dump("Looking at $i = $start - $stop");
+
+ while ($start - $stop <= 0) {
+ $stop = $stops[$i++];
+ var_dump("New start is $stop");
+ }
+
+ var_dump("$start-$stop");
+ }
+
+ // find the intersecting contexts
+ foreach ($starts as $i => $start_pos) {
+ $words .= "{$words_arg[$i]}\t\t\t";
+ echo "$start_pos\t\t\t";
+ }
+
+ echo "\n";
+
+ foreach ($stops as $i => $stop_pos) {
+ echo "$stop_pos\t\t\t";
+ }
+echo "\n$words\n";
// get full number of matches against all words to see how many we actually want to look at.
@@ -170,8 +334,6 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat
$word .= " {$words_orig[$word_i]}";
unset($words_orig[$word_i]);
}
-
-
}
break;
@@ -255,8 +417,12 @@ function search_get_relevant_substring($haystack, $needle, $before = '', $after
}
// surround if needed
+ // @todo would getting each position of the match then
+ // inserting manually based on the position be faster than preg_replace()?
if ($before || $after) {
$matched = str_ireplace($needle, $before . $needle . $after, $matched);
+ //$matched = mb_ereg_replace("")
+ // insert before
}
return $matched;