Index: modules/search/search.admin.inc
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.admin.inc,v
retrieving revision 1.9
diff -u -p -r1.9 search.admin.inc
--- modules/search/search.admin.inc	20 Jul 2009 17:19:51 -0000	1.9
+++ modules/search/search.admin.inc	24 Jul 2009 07:06:36 -0000
@@ -87,6 +87,20 @@ function search_admin_settings() {
     '#default_value' => TRUE,
     '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Turn this off if you want to use an external preprocessor for this instead. Does not affect other languages.')
   );
+
+  // Build the N-gram size choices; key 0 means N-gram splitting is disabled.
+  $n_gram_options[0] = t('Full Word Matches');
+  foreach (range(1, 6) as $size) {
+    $n_gram_options[$size] = t('@num Character size', array('@num' => $size));
+  }
+
+  $form['indexing_settings']['search_gram_size'] = array(
+    '#type' => 'select',
+    '#title' => t('Character Matches'),
+    '#description' => t('This allows for partial word matching in searches. It breaks a word down in a several N sized character strings. Set it to Full Word Matches if you do not want this feature. The smaller the number of characters the more accurate the results but at a larger database cost.'),
+    '#options' => $n_gram_options,
+    '#default_value' => variable_get('search_gram_size', 0),
+  );
 
   $form['#submit'][] = 'search_admin_settings_submit';
 
Index: modules/search/search.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.module,v
retrieving revision 1.303
diff -u -p -r1.303 search.module
--- modules/search/search.module	23 Jul 2009 20:58:26 -0000	1.303
+++ modules/search/search.module	24 Jul 2009 07:06:36 -0000
@@ -321,6 +321,11 @@ function search_simplify($text) {
 
   // Call an external processor for word handling.
   search_invoke_preprocess($text);
+
+  // Split words into N-grams when a gram size is configured (0 disables it).
+  if ($size = variable_get('search_gram_size', 0)) {
+    $text = search_gram_words($text, $size);
+  }
 
   // Simple CJK handling
   if (variable_get('overlap_cjk', TRUE)) {
@@ -347,6 +352,46 @@ function search_simplify($text) {
 }
 
 /**
+ * Splits each word of a text into overlapping N-grams.
+ *
+ * N-grams are built per whitespace-separated word, so they never span a word
+ * boundary. Words no longer than the N-gram size are kept whole so that short
+ * words remain in the output.
+ *
+ * @param string $text
+ *   Plain text to split, as handed over by search_simplify().
+ * @param int $gram_size
+ *   Number of characters in each N-gram.
+ *
+ * @return string
+ *   The N-grams of every word, separated by single spaces. The text is
+ *   returned unchanged when it holds no words or the size is below 1.
+ */
+function search_gram_words($text, $gram_size) {
+  $gram_size = (int) $gram_size;
+  $words = preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY);
+  // preg_split() yields FALSE on failure; treat that the same as "no words".
+  if ($gram_size < 1 || empty($words)) {
+    return $text;
+  }
+
+  $grams = array();
+  foreach ($words as $word) {
+    // Measure the word once instead of on every loop iteration.
+    $length = drupal_strlen($word);
+    if ($length <= $gram_size) {
+      // Too short to split: keep the word itself.
+      $grams[] = $word;
+      continue;
+    }
+    for ($start = 0; $start + $gram_size <= $length; $start++) {
+      $grams[] = drupal_substr($word, $start, $gram_size);
+    }
+  }
+  return implode(' ', $grams);
+}
+
+/**
  * Basic CJK tokenizer. Simply splits a string into consecutive, overlapping
  * sequences of characters ('minimum_word_size' long).
  */
@@ -1252,7 +1297,7 @@ function search_excerpt($keys, $text) {
       }
       // Locate a keyword (position $p), then locate a space in front (position
       // $q) and behind it (position $s)
-      if (preg_match('/' . $boundary . $key . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
+      if (preg_match('/' . $boundary . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $key . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $boundary . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $included[$key])) {
         $p = $match[0][1];
         if (($q = strpos($text, ' ', max(0, $p - 60))) !== FALSE) {
           $end = substr($text, $p, 80);
@@ -1310,7 +1355,7 @@ function search_excerpt($keys, $text) {
   $text = (isset($newranges[0]) ? '' : '... ') . implode(' ... ', $out) . ' ...';
 
   // Highlight keywords. Must be done at once to prevent conflicts ('strong' and '<strong>').
-  $text = preg_replace('/' . $boundary . '(' . implode('|', $keys) . ')' . $boundary . '/iu', '<strong>\0</strong>', $text);
+  $text = preg_replace('/' . $boundary . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . '(' . implode('|', $keys) . ')' . '[^' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']*' . $boundary . '/iu', '<strong>\0</strong>', $text);
   return $text;
 }
 