From 12779ad9e454aa4872bf564849f94db0081a7cd9 Mon Sep 17 00:00:00 2001 From: brettp Date: Tue, 10 Nov 2009 03:48:41 +0000 Subject: Trying out the new context grabbing function in search. git-svn-id: http://code.elgg.org/elgg/trunk@3652 36083f99-b078-4883-b0ff-0f9b5a30f544 --- mod/search/search_hooks.php | 18 +++-- mod/search/start.php | 180 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 183 insertions(+), 15 deletions(-) (limited to 'mod/search') diff --git a/mod/search/search_hooks.php b/mod/search/search_hooks.php index 941e01e8c..f7a49400f 100644 --- a/mod/search/search_hooks.php +++ b/mod/search/search_hooks.php @@ -43,10 +43,12 @@ function search_objects_hook($hook, $type, $value, $params) { // add the volatile data for why these entities have been returned. foreach ($entities as $entity) { //$title = search_get_highlighted_relevant_substrings($entity->title, $params['query']); - $title = search_get_relevant_substring($entity->title, $params['query'], '', ''); + //$title = search_get_relevant_substring($entity->title, $params['query'], '', ''); + $title = search_get_highlighted_relevant_substrings($entity->title, $params['query']); $entity->setVolatileData('search_matched_title', $title); - $desc = search_get_relevant_substring($entity->description, $params['query'], '', ''); + //$desc = search_get_relevant_substring($entity->description, $params['query'], '', ''); + $desc = search_get_highlighted_relevant_substrings($entity->description, $params['query']); $entity->setVolatileData('search_matched_description', $desc); } @@ -91,10 +93,10 @@ function search_groups_hook($hook, $type, $value, $params) { // add the volatile data for why these entities have been returned. foreach ($entities as $entity) { - $description = search_get_relevant_substring($entity->description, $query, '', ''); + $description = search_get_highlighted_relevant_substrings($entity->description, $query); $entity->setVolatileData('search_matched_title', $description); - $name = search_get_relevant_substring($entity->name, $query, '', ''); + $name = search_get_highlighted_relevant_substrings($entity->name, $query); $entity->setVolatileData('search_matched_description', $name); } @@ -139,10 +141,10 @@ function search_users_hook($hook, $type, $value, $params) { // add the volatile data for why these entities have been returned. foreach ($entities as $entity) { - $username = search_get_relevant_substring($entity->username, $query, '', ''); + $username = search_get_highlighted_relevant_substrings($entity->username, $query); $entity->setVolatileData('search_matched_title', $username); - $name = search_get_relevant_substring($entity->name, $query, '', ''); + $name = search_get_highlighted_relevant_substrings($entity->name, $query); $entity->setVolatileData('search_matched_description', $name); } @@ -180,7 +182,7 @@ function search_tags_hook($hook, $type, $value, $params) { // add the volatile data for why these entities have been returned. foreach ($entities as $entity) { $tags = implode(',', $entity->tags); - $tags_str = search_get_relevant_substring($tags, $query, '', ''); + $tags_str = search_get_highlighted_relevant_substrings($tags, $params['query']); $entity->setVolatileData('search_matched_tags', $tags_str); } @@ -253,7 +255,7 @@ function search_comments_hook($hook, $type, $value, $params) { if (!$entity = get_entity($comment->entity_guid)) { continue; } - $comment_str = search_get_relevant_substring($comment->comment, $query, '', ''); + $comment_str = search_get_highlighted_relevant_substrings($comment->comment, $query); $entity->setVolatileData('search_matched_comment', $comment_str); $entity->setVolatileData('search_matched_comment_owner_guid', $comment->owner_guid); $entity->setVolatileData('search_matched_comment_time_created', $comment->time_created); diff --git a/mod/search/start.php b/mod/search/start.php index aa76c13b2..a53cebbb6 100644 --- a/mod/search/start.php +++ b/mod/search/start.php @@ -87,8 +87,16 @@ function search_page_handler($page) { * @return unknown_type */ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_match_context = 15, $max_length = 250) { + global $CONFIG; $haystack = strip_tags($haystack); $haystack_lc = strtolower($haystack); +// +// $haystack = "Like merge sort, quicksort can also be easily parallelized due to its " +// . "divide-and-conquer nature. Individual in-place partition operations are difficult " +// . "to parallelize, but once divided, different sections of the list can be sorted in parallel. " +// . "If we have p processors, we can divide a list of n ele"; +// +// $needle = 'difficult to sort in parallel'; // for now don't worry about "s or boolean operators $needle = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($needle))); @@ -105,8 +113,30 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat } } + /* + + $body_len = 250 + + $context = 5-30, 20-45, 75-100, 150 + + can pull out context either on: + one of each matching term + X # of highest matching terms + + + */ $substr_counts = array(); $str_pos = array(); + // matrices for being and end context lengths. + // defaults to min context. will add additional context later if needed + $starts = array(); + $stops = array(); + + // map the words to the starts and stops + $words_arg = array(); + $context_count = 0; + + // get the full count of matches. foreach ($words as $word) { $word = strtolower($word); @@ -114,29 +144,163 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat $word_len = strlen($word); // find the start positions for the words - // get the context for words based upon if ($count > 1) { $str_pos[$word] = array(); $offset = 0; while (FALSE !== $pos = strpos($haystack, $word, $offset)) { $str_pos[$word][] = $pos; + $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0; + $stops[] = $pos + $word_len + $min_match_context; + $words_arg[] = $word; + $context_count += $min_match_context + $word_len; $offset += $pos + $word_len; } } else { - $str_pos[$word] = array(strpos($haystack, $word)); + $pos = strpos($haystack, $word); + $str_pos[$word] = array($pos); + $starts[] = ($pos - $min_match_context > 0) ? $pos - $min_match_context : 0; + $stops[] = $pos + $word_len + $min_match_context; + $context_count += $min_match_context + $word_len; + $words_arg[] = $word; } $substr_counts[$word] = $count; } -//A test with multiple words and now more in the subject too because words need to be everywhere - // sort by order of occurence - krsort($substr_counts); + //krsort($substr_counts); $full_count = array_sum($substr_counts); + // figure out what the context needs to be. + // take one of each matched phrase + // if there are any + +// +// var_dump($str_pos); +// var_dump($substr_counts); +// var_dump($context_count); + + + // sort to put them in order of occurence + asort($starts, SORT_NUMERIC); + asort($stops, SORT_NUMERIC); + + // offset them correctly + $starts[] = 0; + $new_stops = array(0); + foreach ($stops as $i => $pos) { + $new_stops[$i+1] = $pos; + } + $stops = $new_stops; + + $substrings = array(); + $len = count($starts); + + $starts = array_merge($starts); + $stops = array_merge($stops); + + $offsets = array(); + $limits = array(); + $c = 0; + foreach ($starts as $i => $start) { + $stop = $stops[$i]; + $offsets[$c] = $start; + $limits[$c] = $stop; + + // never need the last one as it's just a displacing entry + if ($c+1 == count($starts)) { + break; + } + + if ($start - $stop < 0) { + //var_dump("Looking at c=$c & $start - $stop and going to unset {$limits[$c]}"); + unset($offsets[$c]); + unset($limits[$c]); + } + $c++; + } + + // reset indexes and remove placeholder elements. + $limits = array_merge($limits); + array_shift($limits); + $offsets = array_merge($offsets); + array_pop($offsets); + + // figure out if we need to adjust the offsets from the base + // this could result in overlapping summaries. + // might be nicer to just remove it. + + $total_len = 0; + foreach ($offsets as $i => $offset) { + $total_len += $limits[$i] - $offset; + } + + $add_length = 0; + if ($total_length < $max_length) { + $add_length = floor((($max_length - $total_len) / count($offsets)) / 2); + } + + + foreach ($offsets as $i => $offset) { + $limit = $limits[$i]; + if ($offset == 0 && $add_length) { + $limit += $add_length; + } else { + $offset = $offset - $add_length; + } + $string = substr($haystack, $offset, $limit - $offset); + + if ($limit-$offset < strlen($haystack)) { + $string = "$string..."; + } + + $substrings[] = $string; + } + $matched = ''; + foreach ($substrings as $string) { + if (strlen($matched) + strlen($string) < $max_length) { + $matched .= $string; + } + } + + foreach ($words as $word) { + $search = "/($word)/i"; + $replace = "$1"; + $matched = preg_replace($search, $replace, $matched); + } + + return $matched; + // crap below.. + + + + for ($i=0; $i<$len; $i++) { + $start = $starts[$i]; + $stop = $stops[$i]; + var_dump("Looking at $i = $start - $stop"); + + while ($start - $stop <= 0) { + $stop = $stops[$i++]; + var_dump("New start is $stop"); + } + + var_dump("$start-$stop"); + } + + // find the intersecting contexts + foreach ($starts as $i => $start_pos) { + $words .= "{$words_arg[$i]}\t\t\t"; + echo "$start_pos\t\t\t"; + } + + echo "\n"; + + foreach ($stops as $i => $stop_pos) { + echo "$stop_pos\t\t\t"; + } +echo "\n$words\n"; // get full number of matches against all words to see how many we actually want to look at. @@ -170,8 +334,6 @@ function search_get_highlighted_relevant_substrings($haystack, $needle, $min_mat $word .= " {$words_orig[$word_i]}"; unset($words_orig[$word_i]); } - - } break; @@ -255,8 +417,12 @@ function search_get_relevant_substring($haystack, $needle, $before = '', $after } // surround if needed + // @todo would getting each position of the match then + // inserting manually based on the position be faster than preg_replace()? if ($before || $after) { $matched = str_ireplace($needle, $before . $needle . $after, $matched); + //$matched = mb_ereg_replace("") + // insert before } return $matched; -- cgit v1.2.3