From 24e3ff747614364d0d44fc1a7644f164146c66e1 Mon Sep 17 00:00:00 2001 From: brettp Date: Sat, 7 Nov 2009 20:57:32 +0000 Subject: Moved default search hooks into search mod. Using MySQL's MATCH ... AGAINST instead of likes for most searches. Changed 'tag' to 'q' while maintaining backward compatibility. git-svn-id: http://code.elgg.org/elgg/trunk@3633 36083f99-b078-4883-b0ff-0f9b5a30f544 --- mod/search/start.php | 300 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 209 insertions(+), 91 deletions(-) (limited to 'mod/search/start.php') diff --git a/mod/search/start.php b/mod/search/start.php index 4bd342285..47405450a 100644 --- a/mod/search/start.php +++ b/mod/search/start.php @@ -1,5 +1,4 @@ search_info = array(); + $CONFIG->search_info['min_chars'] = $word_lens[0]->min; + $CONFIG->search_info['max_chars'] = $word_lens[0]->max; + // add in CSS for search elements extend_view('css', 'search/css'); } @@ -29,114 +50,151 @@ function search_init() { function search_page_handler($page) { global $CONFIG; - if(!get_input('tag')) { - set_input('tag', $page[0]); + // if there is no q set, we're being called from a legacy installation + // it expects a search by tags. + // actually it doesn't, but maybe it should. + // maintain backward compatibility + if(!get_input('q', get_input('tag', NULL))) { + set_input('q', $page[0]); + //set_input('search_type', 'tags'); } - include_once($CONFIG->path . "mod/search/index.php"); + include_once('index.php'); } /** - * Core search hook. - * Returns an object with two parts: - * ->entities: an array of instantiated entities that have been decorated with - * volatile "search" data indicating what they matched. These are - * the entities to be displayed to the user on this page. - * ->total: total number of entities overall. This function can update this - * limit to ask for more pages in the pagination. + * Return a string with highlighted matched elements. + * Checks for "s + * Provides context for matched elements. + * Will not return more than $max_length of full context. + * Only highlights words + * + * @param unknown_type $haystack + * @param unknown_type $need + * @param unknown_type $context + * @param unknown_type $max_length + * @return unknown_type */ -function search_original_hook($hook, $type, $returnvalue, $params) { - global $CONFIG; +function search_get_highlighted_relevant_substrings($haystack, $needle, $min_match_context = 15, $max_length = 250) { + $haystack = strip_tags($haystack); + $haystack_lc = strtolower($haystack); + + // for now don't worry about "s or boolean operators + $needle = str_replace(array('"', '-', '+', '~'), '', stripslashes(strip_tags($needle))); + $words = explode(' ', $needle); + + $min_chars = $CONFIG->search_info['min_chars']; + // if > ft_min_word == not running in literal mode. + if ($needle >= $min_chars) { + // clean out any words that are ignored by mysql + foreach ($words as $i => $word) { + if (strlen($word) < $min_chars) { + unset ($words[$i]); + } + } + } - var_dump($CONFIG->hooks); - - $tag = $params['tag']; - $offset = $params['offset']; // starting page - $limit = $params['limit']; // number per page - $searchtype = $params['searchtype']; // the search type we're looking for - $object_type = $params['object_type']; - $subtype = $params['subtype']; - $owner_guid = $params['owner_guid']; - $tagtype = $params['tagtype']; - - $count = get_entities_from_metadata($tagtype, elgg_strtolower($tag), $object_type, $subtype, $owner_guid, $limit, $offset, "", 0, TRUE, FALSE); - $ents = get_entities_from_metadata($tagtype, elgg_strtolower($tag), $object_type, $subtype, $owner_guid, $limit, $offset, "", 0, FALSE, FALSE); - -// $options = array( -// 'metadata_name_value_pair' => array('name' => $params['tagtype'], 'value' => $params['tag'], 'case_sensitive' => false), -// 'offset' => $params['offset'], -// 'limit' => $params['limit'], -// 'type' => $params['object_type'], -// 'subtype' => $params['subtype'], -// 'owner_guid' => $params['owner_guid'] -// ); -// -// $count = elgg_get_entities_from_metadata(array_merge($options, array('count' => TRUE))); -// $entities = elgg_get_entities_from_metadata($options); - - /* - * Foreach entity - * get the metadata keys - * If the value matches, hang onto the key - * add all the matched keys to VolatileData - * This tells us *why* each entity matched - */ - foreach ($ents as $ent) { - $metadata = get_metadata_for_entity($ent->getGUID()); - $matched = array(); - if ($metadata) { - foreach ($metadata as $tuple) { - if ($tag === $tuple->value) { - // This is one of the matching elements - $matched[] = $tuple->name; - } + $substr_counts = array(); + $str_pos = array(); + // get the full count of matches. + foreach ($words as $word) { + $word = strtolower($word); + $count = substr_count($haystack, $word); + $word_len = strlen($word); + + // find the start positions for the words + // get the context for words based upon + if ($count > 1) { + $str_pos[$word] = array(); + $offset = 0; + while (FALSE !== $pos = strpos($haystack, $word, $offset)) { + $str_pos[$word][] = $pos; + $offset += $pos + $word_len; } - $ent->setVolatileData('search', $matched); + } else { + $str_pos[$word] = array(strpos($haystack, $word)); } + $substr_counts[$word] = $count; } - // merge in our entities with any coming in from elsewhere - $returnvalue->entities = array_merge($returnvalue->entities, $ents); +//A test with multiple words and now more in the subject too because words need to be everywhere - // expand the total entity count if necessary - if ($count > $returnvalue->total) { - $returnvalue->total = $count; - } + // sort by order of occurence + krsort($substr_counts); + $full_count = array_sum($substr_counts); - return $returnvalue; -} -/** - * Provides default search for registered entity subtypes. - * Entity types should be dealt with in the entity classes. (Objects are an exception). - * - * @param unknown_type $hook - * @param unknown_type $type - * @param unknown_type $returnvalue - * @param unknown_type $params - * @return unknown_type - */ -function search_registered_entities($hook, $type, $returnvalue, $params) { - $entity_types = get_registered_entity_types(); - foreach ($entity_types as $type => $subtypes) { - if (is_array($subtypes) && count($subtypes)) { - } + + // get full number of matches against all words to see how many we actually want to look at. + + + + +// $desc = search_get_relevant_substring($entity->description, $params['query'], '', ''); + + + $params['query']; + // "this is"just a test "silly person" + + // check for "s + $words_quotes = explode('"', $needle); + + $words_orig = explode(' ', $needle); + $words = array(); + + foreach ($words_orig as $i => $word) { + // figure out if we have a special operand + $operand = substr($word, 0, 1); + switch($operand) { + case '"': + // find the matching " if any. else, remove the " + if (substr_count($query, '"') < 2) { + $words[] = substr($word, 1); + } else { + $word = substr($word, 1); + $word_i = $i; + while ('"' != strpos($words_orig[$word_i], '"')) { + $word .= " {$words_orig[$word_i]}"; + unset($words_orig[$word_i]); + } + + + } + + break; + + case '+': + // remove + + $words[] = substr($word, 1); + break; + + case '~': + case '-': + // remove this from highlighted list. + + break; + } } -} -/** - * return our base search types (right now, we have none) - */ -function search_base_search_types_hook($hook, $type, $returnvalue, $params) { - if (!is_array($returnvalue)) { - $returnvalue = array(); + // pick out " queries + if (substr_count($query, '"') >= 2) { + } - return $returnvalue; -} + // ignore queries starting with - + + // @todo figure out a way to "center" the matches within the max_length. + // if only one match, its context is $context + $max_length / 2 + // if 2 matches, its context is $context + $max_length / 4 + // if 3 matches, its context is $context + $max_length / 6 + // $context per match = $min_match_context + ($max_length / $num_count_match) + + // if $max_length / ($matched_count * 2) < $context + // only match against the first X matches where $context >= $context +} /** * Returns a matching string with $context amount of context, optionally @@ -148,7 +206,7 @@ function search_base_search_types_hook($hook, $type, $returnvalue, $params) { * @param str $needle * @param str $before * @param str $after - * @param str $context + * @param int $context * @return str */ function search_get_relevant_substring($haystack, $needle, $before = '', $after = '', $context = 75) { @@ -181,7 +239,7 @@ function search_get_relevant_substring($haystack, $needle, $before = '', $after } // add elipses to end. - if ($start_pos + $context < strlen($haystack)) { + if ($pos + strlen($needle) + $context*2 < strlen($haystack)) { $matched = "$matched..."; } @@ -194,7 +252,15 @@ function search_get_relevant_substring($haystack, $needle, $before = '', $after } - +/** + * Passes entities, count, and original params to the view functions for + * search type. + * + * @param array $entities + * @param int $count + * @param array $params + * @return string + */ function search_get_listing_html($entities, $count, $params) { if (!is_array($entities) || !$count) { return FALSE; @@ -235,6 +301,58 @@ function search_get_listing_html($entities, $count, $params) { return FALSE; } +/** + * Returns a where clause for a search query. + * + * @param str $table Prefix for table to search on + * @param array $fields Fields to match against + * @param array $params Original search params + * @return str + */ +function search_get_where_sql($table, $fields, $params) { + global $CONFIG; + $query = $params['query']; + + // add the table prefix to the fields + foreach ($fields as $i => $field) { + $fields[$i] = "$table.$field"; + } + + // if query is shorter than the min for fts words + // it's likely a single acronym or similar + // switch to literal mode + if (strlen($query) < $CONFIG->search_info['min_chars']) { + $likes = array(); + foreach ($fields as $field) { + $likes[] = "$field LIKE '%$query%'"; + } + $likes_str = implode(' OR ', $likes); + $where = "($table.guid = e.guid AND ($likes_str))"; + } else { + // if using advanced or paired "s, switch into boolean mode + if ((isset($params['advanced_search']) && $params['advanced_search']) || substr_count($query, '"') >= 2 ) { + $options = 'IN BOOLEAN MODE'; + } else { + $options = 'IN NATURAL LANGUAGE MODE'; + } + + // if short query, use query expansion. + if (strlen($query) < 6) { + $options .= ' WITH QUERY EXPANSION'; + } + // if query is shorter than the ft_min_word_len switch to literal mode. + $fields_str = implode(',', $fields); + $where = "($table.guid = e.guid AND (MATCH ($fields_str) AGAINST ('$query' $options)))"; + } + + return $where; +} + +function search_get_query_where_sql($table, $query) { + // if there are multiple "s or 's it's a literal string. + +} + /** Register init system event **/ register_elgg_event_handler('init','system','search_init'); \ No newline at end of file -- cgit v1.2.3