Search indexing: replace the node_cron_last/node_cron_last_nid cursor with a
per-item "reindex" flag on {search_dataset}, and move link captions out of
{search_index} into a new {search_node_links} table.

  * modules/node/node.module: node_search('reset'/'status') and
    node_update_index() work from {search_dataset}.reindex;
    node_update_shutdown() is removed.
  * modules/search/search.module: search_wipe() and search_index() maintain
    {search_node_links}; adds search_touch_node(), search_nodeapi() and
    search_comment().
  * modules/search/search.schema: adds the reindex column and the
    search_node_links table.
  * modules/system/system.install: adds system_update_6033().

Index: modules/node/node.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/node/node.module,v
retrieving revision 1.882
diff -u -u -p -r1.882 node.module
--- modules/node/node.module 14 Sep 2007 09:37:29 -0000 1.882
+++ modules/node/node.module 23 Sep 2007 17:34:57 -0000
@@ -1058,15 +1058,13 @@ function node_search($op = 'search', $ke
       return t('Content');
 
     case 'reset':
-      variable_del('node_cron_last');
-      variable_del('node_cron_last_nid');
+      db_query("UPDATE {search_dataset} SET reindex = %d WHERE type = 'node'", time());
       return;
 
     case 'status':
-      $last = variable_get('node_cron_last', 0);
-      $last_nid = variable_get('node_cron_last_nid', 0);
       $total = db_result(db_query('SELECT COUNT(*) FROM {node} WHERE status = 1'));
-      $remaining = db_result(db_query('SELECT COUNT(*) FROM {node} n LEFT JOIN {node_comment_statistics} c ON n.nid = c.nid WHERE n.status = 1 AND ((GREATEST(n.created, n.changed, c.last_comment_timestamp) = %d AND n.nid > %d ) OR (n.created > %d OR n.changed > %d OR c.last_comment_timestamp > %d))', $last, $last_nid, $last, $last, $last));
+      $remaining = db_result(db_query("SELECT COUNT(*) FROM {node} n LEFT OUTER JOIN {search_dataset} d ON d.type = 'node' AND d.sid = n.nid WHERE n.status = 1 AND (d.sid IS NULL OR d.reindex <> 0)"));
+
       return array('remaining' => $remaining, 'total' => $total);
 
     case 'admin':
@@ -1612,40 +1610,23 @@ function node_page_view($node, $cid = NU
 }
 
 /**
- * shutdown function to make sure we always mark the last node processed.
- */
-function node_update_shutdown() {
-  global $last_change, $last_nid;
-
-  if ($last_change && $last_nid) {
-    variable_set('node_cron_last', $last_change);
-    variable_set('node_cron_last_nid', $last_nid);
-  }
-}
-
-/**
  * Implementation of hook_update_index().
  */
 function node_update_index() {
-  global $last_change, $last_nid;
-
-  register_shutdown_function('node_update_shutdown');
-
-  $last = variable_get('node_cron_last', 0);
-  $last_nid = variable_get('node_cron_last_nid', 0);
   $limit = (int)variable_get('search_cron_limit', 100);
 
   // Store the maximum possible comments per thread (used for ranking by reply count)
   variable_set('node_cron_comments_scale', 1.0 / max(1, db_result(db_query('SELECT MAX(comment_count) FROM {node_comment_statistics}'))));
   variable_set('node_cron_views_scale', 1.0 / max(1, db_result(db_query('SELECT MAX(totalcount) FROM {node_counter}'))));
 
-  $result = db_query_range('SELECT GREATEST(IF(c.last_comment_timestamp IS NULL, 0, c.last_comment_timestamp), n.changed) as last_change, n.nid FROM {node} n LEFT JOIN {node_comment_statistics} c ON n.nid = c.nid WHERE n.status = 1 AND ((GREATEST(n.changed, c.last_comment_timestamp) = %d AND n.nid > %d) OR (n.changed > %d OR c.last_comment_timestamp > %d)) ORDER BY GREATEST(n.changed, c.last_comment_timestamp) ASC, n.nid ASC', $last, $last_nid, $last, $last, $last, 0, $limit);
+  $result = db_query_range("SELECT n.nid FROM {node} n LEFT OUTER JOIN {search_dataset} d ON d.type = 'node' AND d.sid = n.nid WHERE d.sid IS NULL OR d.reindex <> 0 ORDER BY d.reindex ASC, n.nid ASC", 0, $limit);
 
   while ($node = db_fetch_object($result)) {
-    $last_change = $node->last_change;
-    $last_nid = $node->nid;
     $node = node_load($node->nid);
 
+    // save the changed time of the most recent indexed node, for the search results half-life calculation
+    variable_set('node_cron_last', $node->changed);
+
     // Build the node body.
     $node->build_mode = NODE_BUILD_SEARCH_INDEX;
     $node = node_build_content($node, FALSE, FALSE);
Index: modules/search/search.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.module,v
retrieving revision 1.235
diff -u -u -p -r1.235 search.module
--- modules/search/search.module 5 Sep 2007 08:39:57 -0000 1.235
+++ modules/search/search.module 23 Sep 2007 17:34:57 -0000
@@ -229,9 +229,11 @@ function search_wipe($sid = NULL, $type
   }
   else {
     db_query("DELETE FROM {search_dataset} WHERE sid = %d AND type = '%s'", $sid, $type);
-    db_query("DELETE FROM {search_index} WHERE fromsid = %d AND fromtype = '%s'", $sid, $type);
-    // When re-indexing, keep link references
-    db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'". ($reindex ? " AND fromsid = 0" : ''), $sid, $type);
+    db_query("DELETE FROM {search_index} WHERE sid = %d AND type = '%s'", $sid, $type);
+    // Don't remove links if re-indexing.
+    if (!$reindex) {
+      db_query("DELETE FROM {search_node_links} WHERE sid = %d AND type = '%s'", $sid, $type);
+    }
   }
 }
 
@@ -527,21 +529,24 @@ function search_index($sid, $type, $text
                 $word = (int)ltrim($word, '-0');
               }
 
+              // Links score mainly for the target.
               if ($link) {
                 if (!isset($results[$linknid])) {
                   $results[$linknid] = array();
                 }
-                $results[$linknid][$word] += $score * $focus;
+                $results[$linknid][] = $word;
+                // Reduce score of the link caption in the source.
+                $focus *= 0.2;
               }
-              else {
-                if (!isset($results[0][$word])) {
-                  $results[0][$word] = 0;
-                }
-                $results[0][$word] += $score * $focus;
-                // Focus is a decaying value in terms of the amount of unique words up to this point.
-                // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
-                $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
+              // Fall-through
+              if (!isset($results[0][$word])) {
+                $results[0][$word] = 0;
               }
+              $results[0][$word] += $score * $focus;
+
+              // Focus is a decaying value in terms of the amount of unique words up to this point.
+              // From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
+              $focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
             }
             $tagwords++;
             // Too many words inside a single tag probably mean a tag was accidentally left open.
@@ -558,7 +563,7 @@ function search_index($sid, $type, $text
   search_wipe($sid, $type, TRUE);
 
   // Insert cleaned up data into dataset
-  db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);
+  db_query("INSERT INTO {search_dataset} (sid, type, data, reindex) VALUES (%d, '%s', '%s', %d)", $sid, $type, $accum, 0);
 
   // Insert results into search index
   foreach ($results[0] as $word => $score) {
@@ -567,12 +572,82 @@ function search_index($sid, $type, $text
   }
   unset($results[0]);
 
-  // Now insert links to nodes
+  // Get all previous links from this item.
+  $result = db_query("SELECT nid, caption FROM {search_node_links} WHERE sid = %d AND type = '%s'", $sid, $type);
+  $links = array();
+  while ($link = db_fetch_object($result)) {
+    $links[$link->nid] = $link->caption;
+  }
+
+  // Now store links to nodes.
   foreach ($results as $nid => $words) {
-    foreach ($words as $word => $score) {
-      db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %f)", $word, $nid, 'node', $sid, $type, $score);
-      search_dirty($word);
+    $caption = implode(' ', $words);
+    if (isset($links[$nid])) {
+      if ($links[$nid] != $caption) {
+        // Update the existing link and mark the node for reindexing.
+        db_query("UPDATE {search_node_links} SET caption = '%s' WHERE sid = %d AND type = '%s' AND nid = %d", $caption, $sid, $type, $nid);
+        search_touch_node($nid);
+      }
+      // Unset the link to mark it as processed.
+      unset($links[$nid]);
     }
+    else {
+      // Insert the new link and mark the node for reindexing.
+      db_query("INSERT INTO {search_node_links} (caption, sid, type, nid) VALUES ('%s', %d, '%s', %d)", $caption, $sid, $type, $nid);
+      search_touch_node($nid);
+    }
+  }
+  // Any left-over links in $links no longer exist. Delete them and mark the nodes for reindexing.
+  // $links is keyed by node ID (the value is the caption), so iterate over the keys.
+  foreach ($links as $nid => $caption) {
+    db_query("DELETE FROM {search_node_links} WHERE sid = %d AND type = '%s' AND nid = %d", $sid, $type, $nid);
+    search_touch_node($nid);
+  }
+}
+
+/**
+ * Mark a node for reindexing.
+ *
+ * Stamps the node's {search_dataset} row with the current time in the
+ * reindex column; rows with a non-zero reindex value are picked up again by
+ * node_update_index().
+ *
+ * @param $nid
+ *   ID of the node to mark.
+ */
+function search_touch_node($nid) {
+  db_query("UPDATE {search_dataset} SET reindex = %d WHERE sid = %d AND type = 'node'", time(), $nid);
+}
+
+/**
+ * Implementation of hook_nodeapi().
+ */
+function search_nodeapi(&$node, $op, $teaser = NULL, $page = NULL) {
+  switch ($op) {
+    // Transplant links to a node into the target node.
+    case 'update index':
+      $result = db_query("SELECT caption FROM {search_node_links} WHERE nid = %d", $node->nid);
+      $output = array();
+      while ($link = db_fetch_object($result)) {
+        $output[] = $link->caption;
+      }
+      return '('. implode(', ', $output) .')';
+    // Reindex the node when it is updated. The node is automatically indexed
+    // when it is added, simply by being added to the node table.
+    case 'update':
+      search_touch_node($node->nid);
+      break;
+  }
+}
+
+/**
+ * Implementation of hook_comment().
+ */
+function search_comment($a1, $op) {
+  if ($op == 'insert' || $op == 'delete') {
+    // Reindex the node when comments are added or deleted. Depending on the
+    // operation, $a1 is either an array of form values or a comment object.
+    search_touch_node(is_array($a1) ? $a1['nid'] : $a1->nid);
   }
 }
 
Index: modules/search/search.schema
===================================================================
RCS file: /cvs/drupal/drupal/modules/search/search.schema,v
retrieving revision 1.3
diff -u -u -p -r1.3 search.schema
--- modules/search/search.schema 15 Jul 2007 10:09:21 -0000 1.3
+++ modules/search/search.schema 23 Sep 2007 17:34:57 -0000
@@ -6,7 +6,8 @@ function search_schema() {
     'fields' => array(
       'sid' => array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE, 'default' => 0),
       'type' => array('type' => 'varchar', 'length' => 16, 'not null' => FALSE),
-      'data' => array('type' => 'text', 'not null' => TRUE, 'size' => 'big')
+      'data' => array('type' => 'text', 'not null' => TRUE, 'size' => 'big'),
+      'reindex' => array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE, 'default' => 0)
     ),
     'indexes' => array('sid_type' => array('sid', 'type')),
   );
@@ -35,6 +36,19 @@ function search_schema() {
     'primary key' => array('word'),
   );
 
+  $schema['search_node_links'] = array(
+    'fields' => array(
+      'sid' => array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE, 'default' => 0),
+      'type' => array('type' => 'varchar', 'length' => 16, 'not null' => TRUE, 'default' => ''),
+      'nid' => array('type' => 'int', 'unsigned' => TRUE, 'not null' => TRUE, 'default' => 0),
+      'caption' => array('type' => 'text', 'size' => 'big', 'not null' => FALSE)
+    ),
+    'primary key' => array('sid', 'type', 'nid'),
+    'indexes' => array(
+      'nid' => array('nid')
+    ),
+  );
+
   return $schema;
 }
 
Index: modules/system/system.install
===================================================================
RCS file: /cvs/drupal/drupal/modules/system/system.install,v
retrieving revision 1.153
diff -u -u -p -r1.153 system.install
--- modules/system/system.install 14 Sep 2007 17:46:32 -0000 1.153
+++ modules/system/system.install 23 Sep 2007 17:35:06 -0000
@@ -3766,6 +3766,30 @@ function system_update_6032() {
 }
 
 /**
+ * Drop and recreate the search index.
+ */
+function system_update_6033() {
+  $ret = array();
+
+  // Drop all of the search tables, and then recreate them along with the new
+  // search_node_links table. Since the site needs to be reindexed anyway, it
+  // is safe to drop and recreate them. Skip this when the search tables were
+  // never installed, so nothing is dropped or created needlessly.
+  if (db_table_exists('search_dataset')) {
+    db_drop_table($ret, 'search_dataset');
+    db_drop_table($ret, 'search_index');
+    db_drop_table($ret, 'search_total');
+    drupal_install_schema('search');
+  }
+
+  // With the change to search_dataset.reindex the old cron cursor is obsolete:
+  // node_cron_last_nid is no longer used, and node_cron_last starts over.
+  variable_del('node_cron_last');
+  variable_del('node_cron_last_nid');
+  return $ret;
+}
+
+/**
  * @} End of "defgroup updates-5.x-to-6.x"
  * The next series of updates should start at 7000.
  */