Index: modules/search/search.admin.inc
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.admin.inc,v
retrieving revision 1.5
diff -u -F^f -r1.5 search.admin.inc
--- modules/search/search.admin.inc	14 Apr 2008 17:48:41 -0000	1.5
+++ modules/search/search.admin.inc	10 May 2008 21:37:19 -0000
@@ -57,10 +57,16 @@ function search_admin_settings() {
   $form['indexing_throttle'] = array('#type' => 'fieldset', '#title' => t('Indexing throttle'));
   $form['indexing_throttle']['search_cron_limit'] = array('#type' => 'select', '#title' => t('Number of items to index per cron run'), '#default_value' => variable_get('search_cron_limit', 100), '#options' => $items, '#description' => t('The maximum number of items indexed in each pass of a cron maintenance task. If necessary, reduce the number of items to prevent timeouts and memory errors while indexing.', array('@cron' => url('admin/reports/status'))));
 
   // Indexing settings:
+  // Start from an empty list so the select below always receives an array.
+  $options = array();
+  foreach (filter_formats() as $fid => $filter) {
+    $options[$fid] = $filter->name;
+  }
   $form['indexing_settings'] = array('#type' => 'fieldset', '#title' => t('Indexing settings'));
   $form['indexing_settings']['info'] = array('#value' => t('Changing the settings below will cause the site index to be rebuilt. The search index is not cleared but systematically updated to reflect the new settings. Searching will continue to work but new content won\'t be indexed until all existing content has been re-indexed. The default settings should be appropriate for the majority of sites.'));
   $form['indexing_settings']['minimum_word_size'] = array('#type' => 'textfield', '#title' => t('Minimum word length to index'), '#default_value' => variable_get('minimum_word_size', 3), '#size' => 5, '#maxlength' => 3, '#description' => t('The number of characters a word has to be to be indexed. A lower setting means better search result ranking, but also a larger database. Each search query must contain at least one keyword that is this size (or longer).'));
   $form['indexing_settings']['overlap_cjk'] = array('#type' => 'checkbox', '#title' => t('Simple CJK handling'), '#default_value' => variable_get('overlap_cjk', TRUE), '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Turn this off if you want to use an external preprocessor for this instead. Does not affect other languages.'));
+  $form['indexing_settings']['search_text_processor'] = array('#type' => 'select', '#title' => t('Input Filter'), '#options' => $options, '#default_value' => variable_get('search_text_processor', 1), '#description' => t('Choose which input filter should be used to process text before it is inserted into the search index. Use the !link to create/modify filters as needed.', array('!link' => l(t('input filter admin'), 'admin/settings/filters'))));
 
   $form['#validate'] = array('search_admin_settings_validate');
Index: modules/search/search.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.module,v
retrieving revision 1.256
diff -u -F^f -r1.256 search.module
--- modules/search/search.module	6 May 2008 12:18:50 -0000	1.256
+++ modules/search/search.module	10 May 2008 21:37:19 -0000
@@ -298,41 +298,85 @@ function search_update_totals() {
   }
 }
 
+/**
+ * Implementation of hook_filter().
+ *
+ * Exposes each search text-simplification step as a separate filter.
+ */
+function search_filter($op, $delta = 0, $format = -1, $text = '') {
+  switch ($op) {
+    case 'list':
+      return array(0 => t('Decode entities'),
+        1 => t('Drupal string-to-lower'),
+        2 => t('Simple CJK handling'),
+        3 => t('Numerical data'),
+        4 => t('Punctuation'),
+      );
+
+    case 'description':
+      switch ($delta) {
+        case 0:
+          return t('Decode all HTML entities (including numerical ones) to regular UTF-8 bytes.');
+        case 1:
+          return t('Lowercase strings in a unicode-safe way.');
+        case 2:
+          return t('Matches all Chinese, Japanese, and Korean characters that are candidates for auto-splitting.');
+        case 3:
+          return t('Remove the punctuation from numerical data.');
+        case 4:
+          return t('Remove punctuation and make variants of hyphenated and underscored words, so that low-budget will become "lowbudget", "low" and "budget".');
+        default:
+          return '';
+      }
+
+    case 'no cache':
+      return TRUE;
+
+    case 'process':
+      switch ($delta) {
+        case 0:
+          return decode_entities($text);
+        case 1:
+          return drupal_strtolower($text);
+        case 2:
+          return preg_replace_callback('/[' . PREG_CLASS_CJK . ']+/u', 'search_expand_cjk', $text);
+        case 3:
+          // To improve searching for numerical data such as dates, IP addresses
+          // or version numbers, we consider a group of numerical characters
+          // separated only by punctuation characters to be one piece.
+          // This also means that searching for e.g. '20/03/1984' also returns
+          // results with '20-03-1984' in them.
+          // Readable regexp: ([number]+)[punctuation]+(?=[number])
+          return preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text);
+        case 4:
+          // The dot, underscore and dash are simply removed. This allows meaningful
+          // search behavior with acronyms and URLs.
+          $matches = array();
+
+          // Find word-separator-word pairs; /u matches the text as UTF-8 like the other patterns here.
+          preg_match_all('/(\s*)(\w+)([._-]+)(\w+)(\s*)/u', $text, $matches);
+          foreach ($matches[0] as $key => $match) {
+            $text = str_replace($match, $matches[1][$key] . $matches[2][$key] . $matches[4][$key] . $matches[5][$key], $text);
+            $text .= ' ' . $matches[2][$key] . ' ' . $matches[4][$key];
+          }
+
+          // With the exception of the rules above, we consider all punctuation,
+          // marks, spacers, etc, to be a word boundary.
+          return preg_replace('/[' . PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);
+        default:
+          return $text;
+      }
+
+    default:
+      return $text;
+  }
+}
+
 /**
  * Simplifies a string according to indexing rules.
  */
 function search_simplify($text) {
-  // Decode entities to UTF-8
-  $text = decode_entities($text);
-
-  // Lowercase
-  $text = drupal_strtolower($text);
-
-  // Call an external processor for word handling.
-  search_invoke_preprocess($text);
-
-  // Simple CJK handling
-  if (variable_get('overlap_cjk', TRUE)) {
-    $text = preg_replace_callback('/[' . PREG_CLASS_CJK . ']+/u', 'search_expand_cjk', $text);
-  }
-
-  // To improve searching for numerical data such as dates, IP addresses
-  // or version numbers, we consider a group of numerical characters
-  // separated only by punctuation characters to be one piece.
-  // This also means that searching for e.g. '20/03/1984' also returns
-  // results with '20-03-1984' in them.
-  // Readable regexp: ([number]+)[punctuation]+(?=[number])
-  $text = preg_replace('/([' . PREG_CLASS_NUMBERS . ']+)[' . PREG_CLASS_PUNCTUATION . ']+(?=[' . PREG_CLASS_NUMBERS . '])/u', '\1', $text);
-
-  // The dot, underscore and dash are simply removed. This allows meaningful
-  // search behavior with acronyms and URLs.
-  $text = preg_replace('/[._-]+/', '', $text);
-
-  // With the exception of the rules above, we consider all punctuation,
-  // marks, spacers, etc, to be a word boundary.
-  $text = preg_replace('/[' . PREG_CLASS_SEARCH_EXCLUDE . ']+/u', ' ', $text);
-
-  return $text;
+  return check_markup($text, variable_get('search_text_processor', 1), FALSE);
 }
 
 /**
@@ -393,14 +437,6 @@ function _search_index_truncate(&$text)
   $text = truncate_utf8($text, 50);
 }
 
-/**
- * Invokes hook_search_preprocess() in modules.
- */
-function search_invoke_preprocess(&$text) {
-  foreach (module_implements('search_preprocess') as $module) {
-    $text = module_invoke($module, 'search_preprocess', $text);
-  }
-}
 
 /**
  * Update the full-text search index for a particular item.
@@ -725,7 +761,7 @@ function search_parse_query($text) {
       $phrase = TRUE;
       $simple = FALSE;
     }
-    // Simplify keyword according to indexing rules and external preprocessors
+    // Simplify keyword according to indexing rules.
     $words = search_simplify($match[2]);
     // Re-explode in case simplification added more words, except when matching a phrase
     $words = $phrase ? array($words) : preg_split('/ /', $words, -1, PREG_SPLIT_NO_EMPTY);