Index: modules/search/search.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.module,v
retrieving revision 1.256
diff -u -F^f -r1.256 search.module
--- modules/search/search.module	6 May 2008 12:18:50 -0000	1.256
+++ modules/search/search.module	10 May 2008 19:43:09 -0000
@@ -298,41 +298,96 @@ function search_update_totals() {
   }
 }
 
+/**
+ * Implementation of hook_filter().
+ *
+ * Exposes the individual search text-simplification steps as filters.
+ */
+function search_filter($op, $delta = 0, $format = -1, $text = '') {
+  switch ($op) {
+    case 'list':
+      return array(
+        0 => t('Decode entities'),
+        1 => t('Drupal string-to-lower'),
+        2 => t('Simple CJK handling'),
+        3 => t('Numerical data'),
+        4 => t('Punctuation'),
+      );
+
+    case 'description':
+      switch ($delta) {
+        case 0:
+          return t('Decode all HTML entities (including numerical ones) to regular UTF-8 bytes.');
+        case 1:
+          return t('Lowercase strings in a unicode-safe way.');
+        case 2:
+          return t('Matches all Chinese, Japanese, and Korean characters that are candidates for auto-splitting.');
+        case 3:
+          return t('Remove the punctuation from numerical data.');
+        case 4:
+          return t('Remove punctuation and make variants of hyphenated and underscored words, so that low-budget will become "lowbudget", "low" and "budget".');
+        default:
+          return '';
+      }
+
+    case 'no cache':
+      return TRUE;
+
+    case 'process':
+      switch ($delta) {
+        case 0:
+          return decode_entities($text);
+        case 1:
+          return drupal_strtolower($text);
+        case 2:
+          return preg_replace_callback('/[' . PREG_CLASS_CJK . ']+/u', 'search_expand_cjk', $text);
+        case 3:
+          // To improve searching for numerical data such as dates, IP addresses
+          // or version numbers, we consider a group of numerical characters
+          // separated only by punctuation characters to be one piece.
+          // This also means that searching for e.g. '20/03/1984' also returns
+          // results with '20-03-1984' in them.
+          // Readable regexp: ([number]+)[punctuation]+(?=[number])
+          return preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text);
+        case 4:
+          // Dots, underscores and dashes joining two words are removed, which
+          // allows meaningful search behavior with acronyms and URLs.
+          $matches = array();
+
+          // Each "word<separator>word" pair is collapsed in place, and both
+          // halves are appended as separate words, e.g. "low-budget" yields
+          // "lowbudget", "low" and "budget".
+          preg_match_all('/(\s*)(\w+)([._-]+)(\w+)(\s*)/', $text, $matches);
+          foreach ($matches[0] as $key => $match) {
+            $text = str_replace($match, $matches[1][$key] . $matches[2][$key] . $matches[4][$key] . $matches[5][$key], $text);
+            $text .= ' ' . $matches[2][$key] . ' ' . $matches[4][$key];
+          }
+
+          // With the exception of the rules above, we consider all punctuation,
+          // marks, spacers, etc, to be a word boundary.
+          return preg_replace('/[' . PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);
+        default:
+          return $text;
+      }
+
+    default:
+      return $text;
+  }
+}
+
 /**
  * Simplifies a string according to indexing rules.
  */
 function search_simplify($text) {
-  // Decode entities to UTF-8
-  $text = decode_entities($text);
-
-  // Lowercase
-  $text = drupal_strtolower($text);
-
-  // Call an external processor for word handling.
-  search_invoke_preprocess($text);
-
-  // Simple CJK handling
-  if (variable_get('overlap_cjk', TRUE)) {
-    $text = preg_replace_callback('/[' . PREG_CLASS_CJK . ']+/u', 'search_expand_cjk', $text);
-  }
-
-  // To improve searching for numerical data such as dates, IP addresses
-  // or version numbers, we consider a group of numerical characters
-  // separated only by punctuation characters to be one piece.
-  // This also means that searching for e.g. '20/03/1984' also returns
-  // results with '20-03-1984' in them.
-  // Readable regexp: ([number]+)[punctuation]+(?=[number])
-  $text = preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text);
-
-  // The dot, underscore and dash are simply removed. This allows meaningful
-  // search behavior with acronyms and URLs.
-  $text = preg_replace('/[._-]+/', '', $text);
-
-  // With the exception of the rules above, we consider all punctuation,
-  // marks, spacers, etc, to be a word boundary.
-  $text = preg_replace('/[' . PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);
-
-  return $text;
+  // NOTE(review): input format 3 is assumed to be the format configured with
+  // the search filters above; nothing in this patch creates it -- confirm.
+  $text = check_markup($text, 3, FALSE);
+
+  // Call an external processor for word handling, so that implementations of
+  // hook_search_preprocess() keep being invoked after the filters have run.
+  search_invoke_preprocess($text);
+
+  return $text;
 }
 
 /**
@@ -396,9 +451,14 @@ function _search_index_truncate(&$text) 
 /**
  * Invokes hook_search_preprocess() in modules.
+ *
+ * @param $text
+ *   The text to preprocess; replaced in place by each implementation's result.
+ * @param $language
+ *   (optional) Passed as an extra argument to every hook implementation.
  */
-function search_invoke_preprocess(&$text) {
+function search_invoke_preprocess(&$text, $language = NULL) {
   foreach (module_implements('search_preprocess') as $module) {
-    $text = module_invoke($module, 'search_preprocess', $text);
+    $text = module_invoke($module, 'search_preprocess', $text, $language);
   }
 }
 