Index: modules/search/search.admin.inc =================================================================== RCS file: /cvs/drupal/drupal/modules/search/search.admin.inc,v retrieving revision 1.8 diff -u -p -r1.8 search.admin.inc --- modules/search/search.admin.inc 11 Jan 2009 21:19:18 -0000 1.8 +++ modules/search/search.admin.inc 21 Mar 2009 22:01:01 -0000 @@ -86,6 +86,20 @@ function search_admin_settings() { '#default_value' => TRUE, '#description' => t('Whether to apply a simple Chinese/Japanese/Korean tokenizer based on overlapping sequences. Turn this off if you want to use an external preprocessor for this instead. Does not affect other languages.') ); + + // Create the options array + $n_gram_options[0] = t('Full Word Matches'); + foreach (range(1, 6) as $size) { + $n_gram_options[$size] = t('@num Character size', array('@num' => $size)); + } + + $form['indexing_settings']['search_gram_size'] = array( + '#type' => 'select', + '#title' => t('Character Matches'), + '#description' => t('This allows for partial word matching in searches. It breaks a word down in a several N sized character strings. Set it to Full Word Matches if you do not want this feature. The smaller the number of characters the more accurate the results but at a larger database cost.'), + '#options' => $n_gram_options, + '#default_value' => variable_get('search_gram_size', 0), + ); $form['#validate'] = array('search_admin_settings_validate'); @@ -101,9 +115,14 @@ function search_admin_settings_validate( if ($form_state['values']['op'] == t('Re-index site')) { drupal_goto('admin/settings/search/wipe'); } + // only allow gram sizes that are smaller or equal to the min word size + if ($form_state['values']['minimum_word_size'] < $form_state['values']['search_gram_size']) { + form_set_error('search_gram_size', t('N-Gram size must be less then or equal to the minimum word size')); + } // If these settings change, the index needs to be rebuilt. 
-  if ((variable_get('minimum_word_size', 3) != $form_state['values']['minimum_word_size']) ||
-      (variable_get('overlap_cjk', TRUE) != $form_state['values']['overlap_cjk'])) {
+  else if ((variable_get('minimum_word_size', 3) != $form_state['values']['minimum_word_size']) ||
+      (variable_get('overlap_cjk', TRUE) != $form_state['values']['overlap_cjk']) ||
+      (variable_get('search_gram_size', 0) != $form_state['values']['search_gram_size'])) {
     drupal_set_message(t('The index will be rebuilt.'));
     search_wipe();
   }
Index: modules/search/search.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.module,v
retrieving revision 1.286
diff -u -p -r1.286 search.module
--- modules/search/search.module 15 Mar 2009 18:51:28 -0000 1.286
+++ modules/search/search.module 21 Mar 2009 22:01:02 -0000
@@ -315,6 +315,11 @@ function search_simplify($text) {
 
   // Lowercase
   $text = drupal_strtolower($text);
+
+  // Check if N-Gram is enabled
+  if ($size = variable_get('search_gram_size', 0)) {
+    $text = search_gram_words($text, $size);
+  }
 
   // Call an external processor for word handling.
   search_invoke_preprocess($text);
@@ -344,6 +349,29 @@ function search_simplify($text) {
 }
 
 /**
+ * Splits each word of a text into overlapping, UTF-8 safe N-Grams.
+ *
+ * @param string $text
+ *   Plain text in valid UTF-8; it is split into words on whitespace.
+ * @param int $gram_size
+ *   Characters per gram. Words shorter than this are kept whole.
+ * @return string
+ *   All grams in their original order, separated by single spaces.
+ */
+function search_gram_words($text, $gram_size) {
+  $grams = array();
+  foreach (preg_split('/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY) as $word) {
+    $chars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);
+    // max() keeps a word shorter than the gram size as its own single gram.
+    $count = max(1, count($chars) - $gram_size + 1);
+    for ($start = 0; $start < $count; $start++) {
+      $grams[] = implode('', array_slice($chars, $start, $gram_size));
+    }
+  }
+  return implode(' ', $grams);
+}
+
+/**
  * Basic CJK tokenizer.
Simply splits a string into consecutive, overlapping * sequences of characters ('minimum_word_size' long). */ @@ -1206,8 +1234,7 @@ function search_data($keys = NULL, $type */ function search_excerpt($keys, $text) { // We highlight around non-indexable or CJK characters. - $boundary = '(?:(?<=[' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . '])|(?=[' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']))'; - + $boundary = '.(?:(?<=[' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . '])|(?=[' . PREG_CLASS_SEARCH_EXCLUDE . PREG_CLASS_CJK . ']))'; // Extract positive keywords and phrases preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches); $keys = array_merge($matches[2], $matches[3]);