### Eclipse Workspace Patch 1.0
#P search_keywords
Index: search_keywords.module
===================================================================
RCS file: /cvs/drupal/contributions/modules/search_keywords/search_keywords.module,v
retrieving revision 1.19
diff -u -r1.19 search_keywords.module
--- search_keywords.module 27 Mar 2007 06:00:44 -0000 1.19
+++ search_keywords.module 12 Apr 2007 21:43:00 -0000
@@ -135,6 +135,14 @@
     '#options' => $period,
     '#description' => t('Older access log entries (including referrer search_keywords) will be automatically discarded. Requires crontab.'));
 
+  $form['access']['search_keywords_search_keys'] = array(
+    '#type' => 'textfield',
+    '#title' => t('Search keys'),
+    '#default_value' => variable_get('search_keywords_search_keys', _search_keywords_default_search_keys()),
+    '#maxlength' => 255,
+    '#description' => t('Keys form part of the query string and mark the start of the search terms. Enter a list of terms, separated by spaces, that will be used to identify search terms in a referral.'),
+  );
+
   return system_settings_form($form);
 }
 
@@ -162,35 +170,49 @@
 }
 
 function _search_keywords_extract($url) {
-  $engines = array();
-  $engines[] = '/http:\/\/(.*?)\/search\?.*?q=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/base\/search\?.*?q=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/blogsearch\?.*?q=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/search\?.*?p=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/results.aspx\?.*?q=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/tags?\/(.*)/i';
-  $engines[] = '/http:\/\/(.*?)\/search\/(.*)/i';
-  $engines[] = '/http:\/\/(.*?)\/search\.php\?.*?q=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/\?.*?tag=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/cgi-bin\/search\?.*?q=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/aolcom\/search\?.*?query=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/jsp\/([^\.]+)\.jsp\?.*?searchfor=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/p\/search\?.*?qt=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/web\?.*?q=([^&]+)/i';
-  $engines[] = '/http:\/\/(.*?)\/searchScreen\?.*?q=([^&]+)/i'; //Seznam.cz
-  $engines[] = '/http:\/\/(.*?)\/searchScreen\?.*?w=([^&]+)/i'; //Seznam.cz
-  $engines[] = '/http:\/\/(.*?)\/index.php\?.*?q=([^&]+)/i'; //ZoooHoo.cz
-  $engines[] = '/http:\/\/(.*?)\/\?.*?q=([^&]+)/i'; //ZoooHoo.cz
-  foreach ($engines as $pattern) {
-    if (preg_match($pattern, $url, $matches)) {
-      $q = urldecode($matches[2]);
-      if (!_search_keywords_is_utf8($q)) {
-        $q = _search_keywords_to_utf8($q);
-      }
-      return array($matches[1],$q);
+
+  // Don't log if this is the host site (ignore non-clean url)
+  // Strip http(s)://www. if it exists, to handle sites that accept URLs both
+  // with and without the www prefix
+
+  // Get the base url
+  $base_url = $GLOBALS['base_url'];
+
+  // Construct the regex pattern to extract the common part
+  $pattern = '@^http(s)?\://(www\.)?(.*)$@i';
+  preg_match($pattern, $base_url, $results);
+  $base_url = $results[3];
+
+  // Get search engine query keys and turn the whitespace-separated list into a regex alternation
+  $engineterms = trim(variable_get('search_keywords_search_keys', _search_keywords_default_search_keys()));
+  $engineterms = preg_replace('/\s+/', '|', preg_quote($engineterms, '@'));
+
+  // Construct the regex pattern for checking the host and search terms
+  $pattern = '@^http(s)?\://(www\.)?('.preg_quote($base_url, '@').')?.*[&?]('.$engineterms.')=([^&]+)@i';
+
+  // See if the regex found anything
+  if ( preg_match($pattern, $url, $matches) ) {
+
+    // If this was local referral then return nothing
+    if ( $base_url == $matches[3] ) {
+      return array(NULL, NULL);
     }
+
+    // Get the search terms
+    $terms = urldecode($matches[5]);
+
+    if (!_search_keywords_is_utf8($terms)) {
+      $terms = _search_keywords_to_utf8($terms);
+    }
+
+    // Get the host name
+    $host = parse_url($url);
+
+    return array($host['host'], $terms);
   }
-  return array(NULL,NULL);
+
+  // If we get here then this was never a search
+  return array(NULL, NULL);
 }
 
 function _search_keywords_is_utf8($string) {
@@ -204,7 +226,7 @@
    | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
    | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
   )*$%xs', $string);
-  
+
 }
 
 function _search_keywords_to_utf8($string) {
@@ -218,7 +240,7 @@
 
 function _search_keywords_tis620_to_utf8($string) {
   if (!ereg("[\241-\377]", $string)) return $string;
-  
+
   $tis620 = array(
     "\xa1" => "\xe0\xb8\x81",
     "\xa2" => "\xe0\xb8\x82",
@@ -308,7 +330,44 @@
     "\xfa" => "\xe0\xb9\x9a",
     "\xfb" => "\xe0\xb9\x9b"
   );
-  
+
   $string=strtr($string,$tis620);
   return $string;
 }
+
+function _search_keywords_default_search_keys() {
+
+  /* This list based on http://suda.co.uk/projects/SEHL/ */
+
+  $accepted_query_keys = array(
+    'q',
+    'p',
+    'ask',
+    'searchfor',
+    'key',
+    'query',
+    'search',
+    'keyword',
+    'keywords',
+    'qry',
+    'searchitem',
+    'kwd',
+    'recherche',
+    'search_text',
+    'search_term',
+    'term',
+    'terms',
+    'qq',
+    'qry_str',
+    'qu',
+    's',
+    'k',
+    't',
+    'va'
+  );
+
+  // Sort so it looks nice when displayed as the default list on settings page
+  asort($accepted_query_keys);
+  return implode(' ', $accepted_query_keys);
+
+}