Index: modules/aggregator/aggregator.admin.inc =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.admin.inc,v retrieving revision 1.14 diff -u -p -r1.14 aggregator.admin.inc --- modules/aggregator/aggregator.admin.inc 16 Aug 2008 14:48:17 -0000 1.14 +++ modules/aggregator/aggregator.admin.inc 20 Aug 2008 11:40:56 -0000 @@ -393,33 +393,43 @@ function aggregator_admin_refresh_feed($ * @see system_settings_form() */ function aggregator_admin_settings() { - $items = array(0 => t('none')) + drupal_map_assoc(array(3, 5, 10, 15, 20, 25), '_aggregator_items'); - $period = drupal_map_assoc(array(3600, 10800, 21600, 32400, 43200, 86400, 172800, 259200, 604800, 1209600, 2419200, 4838400, 9676800), 'format_interval'); - - $form['aggregator_allowed_html_tags'] = array( - '#type' => 'textfield', '#title' => t('Allowed HTML tags'), '#size' => 80, '#maxlength' => 255, - '#default_value' => variable_get('aggregator_allowed_html_tags', '
      • '), - '#description' => t('A space-separated list of HTML tags allowed in the content of feed items. (Tags in this list are not removed by Drupal.)'), - ); - - $form['aggregator_summary_items'] = array( - '#type' => 'select', '#title' => t('Items shown in sources and categories pages') , - '#default_value' => variable_get('aggregator_summary_items', 3), '#options' => $items, - '#description' => t('Number of feed items displayed in feed and category summary pages.'), - ); - - $form['aggregator_clear'] = array( - '#type' => 'select', '#title' => t('Discard items older than'), - '#default_value' => variable_get('aggregator_clear', 9676800), '#options' => $period, - '#description' => t('The length of time to retain feed items before discarding. (Requires a correctly configured cron maintenance task.)', array('@cron' => url('admin/reports/status'))), - ); - - $form['aggregator_category_selector'] = array( - '#type' => 'radios', '#title' => t('Category selection type'), '#default_value' => variable_get('aggregator_category_selector', 'checkboxes'), - '#options' => array('checkboxes' => t('checkboxes'), 'select' => t('multiple selector')), - '#description' => t('The type of category selection widget displayed on categorization pages. (For a small number of categories, checkboxes are easier to use, while a multiple selector work well with large numbers of categories.)'), + $period = array('-1' => t('none')); + $period += drupal_map_assoc(array(900, 1800, 3600, 7200, 10800, 21600, 32400, 43200, 64800, 86400, 172800, 259200, 604800, 1209600, 2419200), 'format_interval'); + $parsers = module_implements('aggregator_parse'); + foreach ($parsers as $k => $v) { + $info = module_invoke($v, 'aggregator_parse', 'info'); + unset($parsers[$k]); + $parsers[$v] = $info['title'] . ' ' . 
$info['description'] .''; + } + $processors = module_implements('aggregator_process'); + foreach ($processors as $k => $v) { + $info = module_invoke($v, 'aggregator_process', 'info'); + unset($processors[$k]); + $processors[$v] = $info['title'] . ' ' . $info['description'] .''; + } + $form['aggregator_parser'] = array( + '#type' => 'radios', + '#title' => t('Parser'), + '#description' => t('Parsers retrieve and parse feed data. Choose one suitable for the type of feeds you would like to aggregate.'), + '#options' => $parsers, + '#default_value' => variable_get('aggregator_parser', array_pop(array_flip($parsers))), + ); + $form['aggregator_processors'] = array( + '#type' => 'checkboxes', + '#title' => t('Processors'), + '#description' => t('Processors act on parsed feed data, for example they store feed items. Pick the processors suitable for your task.'), + '#options' => $processors, + '#default_value' => variable_get('aggregator_processor', array_slice(array_flip($processors), 0, 1)), + ); + $form['aggregator_refresh'] = array( + '#type' => 'select', + '#title' => t('Update interval'), + '#default_value' => variable_get('aggregator_refresh', 3600), + '#options' => $period, + '#description' => t('Approximate time between checking feeds. Requires a correctly configured cron maintenance task.', array('@cron' => url('admin/reports/status'))), ); - + $form['modules'] = array(); + return system_settings_form($form); } @@ -511,3 +521,51 @@ function aggregator_form_category_submit drupal_set_message(t('The category %category has been added.', array('%category' => $form_state['values']['title']))); } } + +/** + * Implementation of hook_form_alter(). 
+ */ +function aggregator_form_alter(&$form, $form_state, $form_id) { + if ($form_id == 'aggregator_admin_settings') { + if (aggregator_is_enabled('aggregator')) { + $types = node_get_types(); + $types_select = array(); + foreach ($types as $type) { + // Do not allow a content-type for both the items and the feeds + if (!variable_get('aggregator_feed_' . $type->type, FALSE)) { + $types_select[$type->type] = $type->name; + } + } + $info = module_invoke('aggregator', 'aggregator_process', 'info'); + $period = drupal_map_assoc(array(3600, 10800, 21600, 32400, 43200, 86400, 172800, 259200, 604800, 1209600, 2419200, 4838400, 9676800), 'format_interval'); + $items = array(0 => t('none')) + drupal_map_assoc(array(3, 5, 10, 15, 20, 25), '_aggregator_items'); + + $form['modules']['aggregator'] = array( + '#type' => 'fieldset', + '#title' => t('Advanced Aggregator Light settings'), + '#description' => $info['description'], + '#collapsible' => TRUE, + '#collapsed' => !aggregator_is_enabled('aggregator', $type), + ); + + + $form['modules']['aggregator']['aggregator_summary_items'] = array( + '#type' => 'select', '#title' => t('Items shown in sources and categories pages') , + '#default_value' => variable_get('aggregator_summary_items', 3), '#options' => $items, + '#description' => t('Number of feed items displayed in feed and category summary pages.'), + ); + + $form['modules']['aggregator']['aggregator_clear'] = array( + '#type' => 'select', '#title' => t('Discard items older than'), + '#default_value' => variable_get('aggregator_clear', 9676800), '#options' => $period, + '#description' => t('The length of time to retain feed items before discarding. 
(Requires a correctly configured cron maintenance task.)', array('@cron' => url('admin/reports/status'))), + ); + + $form['modules']['aggregator']['aggregator_category_selector'] = array( + '#type' => 'radios', '#title' => t('Category selection type'), '#default_value' => variable_get('aggregator_category_selector', 'checkboxes'), + '#options' => array('checkboxes' => t('checkboxes'), 'select' => t('multiple selector')), + '#description' => t('The type of category selection widget displayed on categorization pages. (For a small number of categories, checkboxes are easier to use, while a multiple selector work well with large numbers of categories.)'), + ); + } + } +} \ No newline at end of file Index: modules/aggregator/aggregator.info =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.info,v retrieving revision 1.7 diff -u -p -r1.7 aggregator.info --- modules/aggregator/aggregator.info 15 May 2008 21:27:32 -0000 1.7 +++ modules/aggregator/aggregator.info 20 Aug 2008 11:40:56 -0000 @@ -8,3 +8,5 @@ core = 7.x files[] = aggregator.module files[] = aggregator.admin.inc files[] = aggregator.pages.inc +files[] = aggregator.parser.inc + Index: modules/aggregator/aggregator.module =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.module,v retrieving revision 1.389 diff -u -p -r1.389 aggregator.module --- modules/aggregator/aggregator.module 16 Aug 2008 14:48:17 -0000 1.389 +++ modules/aggregator/aggregator.module 20 Aug 2008 11:40:58 -0000 @@ -281,9 +281,23 @@ function aggregator_perm() { * Checks news feeds for updates once their refresh interval has elapsed. 
*/ function aggregator_cron() { - $result = db_query('SELECT * FROM {aggregator_feed} WHERE checked + refresh < %d', time()); - while ($feed = db_fetch_array($result)) { - aggregator_refresh($feed); + $ready = FALSE; + if (drupal_function_exists('_aggregator_light_delete_expired')) { + _aggregator_light_delete_expired(); + } + // Query the feeds which should be refreshed and do the refresh. + $start = time(); + while(_aggregator_cron_time() || !$ready) { + $sql = "SELECT * FROM {aggregator_feed} WHERE checked <= %d AND (%d - checked) < %d ORDER BY checked"; + $result = db_query_range($sql, $start, variable_get('aggregator_refresh', 3600), 0, 2); + $feed_count = 0; + while ($feed = db_fetch_array($result)) { + aggregator_refresh($feed); + ++$feed_count; + } + if ($feed_count == 0) { + $ready = TRUE; + } } } @@ -359,6 +373,136 @@ function aggregator_block($op = 'list', } /** + * Implementation of hook_aggregator_process(). + * + * @param $op + * 'save' The feed items should be updated or saved. + * 'info' Metadata about the processor + * @param $channel + * The data + */ +function aggregator_aggregator_process($op, $feed = NULL) { + switch ($op) { + case 'save': + $new = FALSE; + foreach ($feed['items'] as $k => $item) { + $new = ($new || is_numeric($item['unique']['aggregator']) ? 
FALSE : TRUE); + aggregator_save_item(array('iid' => $item['unique']['aggregator'], 'fid' => $feed['fid'], 'timestamp' => $item['timestamp'], 'title' => $item['title'], 'link' => $item['link'], 'author' => $item['author'], 'description' => $item['description'], 'guid' => $item['guid'])); + } + if ($new) { + watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title'])); + drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title']))); + } + return $feed; + break; + case 'unique': + foreach ($feed['items'] as $k => $item) { + if (!empty($item['guid'])) { + $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND guid = '%s'", $feed['fid'], $item['guid'])); + } + else if ($item['link'] && $item['link'] != $feed['link'] && $item['link'] != $feed['url']) { + $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $item['link'])); + } + else { + $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $item['title'])); + } + $feed['items'][$k]['unique'] = array(); + $feed['items'][$k]['unique']['aggregator'] = (!isset($entry->iid) ? TRUE : $entry->iid); + } + return $feed; + break; + case 'info': + return array( + 'title' => t('Aggregator Light'), + 'description' => t('Creates lightweight records of feed items.'), + ); + } +} + +/** + * Implementation of hook_parse(). + * + * @param $op + * 'parse' Parse the feed-nodes + * 'info' Metadata about the processor + * @param $data + * Raw downloaded data + */ +function aggregator_aggregator_parse($op, $feed = NULL) { + switch ($op) { + case 'parse': + // Generate conditional GET headers. + $headers = array(); + if ($feed['etag']) { + $headers['If-None-Match'] = $feed['etag']; + } + if ($feed['modified']) { + $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed['modified']) . 
' GMT'; + } + + // Request feed. + $result = drupal_http_request($feed['url'], $headers); + + // Process HTTP response code. + switch ($result->code) { + case 304: + return TRUE; + break; + case 301: + $feed['url'] = $result->redirect_url; + + if (isset($result->redirect_url)) { + watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url'])); + } + // Do not break here. + case 200: + case 302: + case 307: + // We store the md5 hash of feed data in the database. When refreshing a + // feed we compare stored hash and new hash calculated from downloaded + // data. If both are equal we say that feed is not updated. + $feed['md5'] = md5($result->data); + if ($feed['hash'] == $feed['md5']) { + return TRUE; + } + @$data = simplexml_load_string($result->data); + if (drupal_function_exists('aggregator_parser_format_detect')) { + $format = aggregator_parser_format_detect($data); + if ($format == FALSE) { + $result = FALSE; + } + $feed_handler = 'aggregator_parser_' . $format; + if (drupal_function_exists($feed_handler)) { + $parser_out = $feed_handler($data); + } + } + return array_merge( + array( + 'md5' => $feed['md5'], + 'modified' => empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']), + 'etag' => empty($result->headers['ETag']) ? '' : $result->headers['ETag'], + 'author' => '', + ), + $parser_out + ); + break; + default: + watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING); + drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . 
$result->error))); + module_invoke('system', 'check_http_request'); + return FALSE; + } + break; + case 'info': + return array( + 'title' => t('Built-in Parser'), + 'description' => t('Default parser for RSS, Atom and RDF feeds.'), + ); + + } +} + +/** * Add/edit/delete aggregator categories. * * @param $edit @@ -406,16 +550,11 @@ function aggregator_save_feed($edit) { db_query("UPDATE {aggregator_feed} SET title = '%s', url = '%s', refresh = %d, block = %d WHERE fid = %d", $edit['title'], $edit['url'], $edit['refresh'], $edit['block'], $edit['fid']); } elseif (!empty($edit['fid'])) { - $items = array(); - $result = db_query('SELECT iid FROM {aggregator_item} WHERE fid = %d', $edit['fid']); - while ($item = db_fetch_object($result)) { - $items[] = "iid = $item->iid"; - } - if (!empty($items)) { - db_query('DELETE FROM {aggregator_category_item} WHERE ' . implode(' OR ', $items)); + $processors = variable_get('aggregator_processor', array()); + foreach ($processors as $processor) { + module_invoke($processor, 'aggregator_process', 'delete', $edit); } db_query('DELETE FROM {aggregator_feed} WHERE fid = %d', $edit['fid']); - db_query('DELETE FROM {aggregator_item} WHERE fid = %d', $edit['fid']); // Make sure there is no active block for this feed. db_query("DELETE FROM {blocks} WHERE module = '%s' AND delta = '%s'", 'aggregator', 'feed-' . $edit['fid']); } @@ -456,203 +595,64 @@ function aggregator_remove($feed) { } /** - * Callback function used by the XML parser. 
- */ -function aggregator_element_start($parser, $name, $attributes) { - global $item, $element, $tag, $items, $channel; - - switch ($name) { - case 'IMAGE': - case 'TEXTINPUT': - case 'CONTENT': - case 'SUMMARY': - case 'TAGLINE': - case 'SUBTITLE': - case 'LOGO': - case 'INFO': - $element = $name; - break; - case 'ID': - if ($element != 'ITEM') { - $element = $name; - } - case 'LINK': - if (!empty($attributes['REL']) && $attributes['REL'] == 'alternate') { - if ($element == 'ITEM') { - $items[$item]['LINK'] = $attributes['HREF']; - } - else { - $channel['LINK'] = $attributes['HREF']; - } - } - break; - case 'ITEM': - $element = $name; - $item += 1; - break; - case 'ENTRY': - $element = 'ITEM'; - $item += 1; - break; - } - - $tag = $name; -} - -/** - * Call-back function used by the XML parser. - */ -function aggregator_element_end($parser, $name) { - global $element; - - switch ($name) { - case 'IMAGE': - case 'TEXTINPUT': - case 'ITEM': - case 'ENTRY': - case 'CONTENT': - case 'INFO': - $element = ''; - break; - case 'ID': - if ($element == 'ID') { - $element = ''; - } - } -} - -/** - * Callback function used by the XML parser. 
- */ -function aggregator_element_data($parser, $data) { - global $channel, $element, $items, $item, $image, $tag; - $items += array($item => array()); - switch ($element) { - case 'ITEM': - $items[$item] += array($tag => ''); - $items[$item][$tag] .= $data; - break; - case 'IMAGE': - case 'LOGO': - $image += array($tag => ''); - $image[$tag] .= $data; - break; - case 'LINK': - if ($data) { - $items[$item] += array($tag => ''); - $items[$item][$tag] .= $data; - } - break; - case 'CONTENT': - $items[$item] += array('CONTENT' => ''); - $items[$item]['CONTENT'] .= $data; - break; - case 'SUMMARY': - $items[$item] += array('SUMMARY' => ''); - $items[$item]['SUMMARY'] .= $data; - break; - case 'TAGLINE': - case 'SUBTITLE': - $channel += array('DESCRIPTION' => ''); - $channel['DESCRIPTION'] .= $data; - break; - case 'INFO': - case 'ID': - case 'TEXTINPUT': - // The sub-element is not supported. However, we must recognize - // it or its contents will end up in the item array. - break; - default: - $channel += array($tag => ''); - $channel[$tag] .= $data; - } -} - -/** * Checks a news feed for new items. * * @param $feed * An associative array describing the feed to be refreshed. */ function aggregator_refresh($feed) { - global $channel, $image; - // Generate conditional GET headers. - $headers = array(); - if ($feed['etag']) { - $headers['If-None-Match'] = $feed['etag']; + $parser = variable_get('aggregator_parser', 'aggregator'); + $channel = module_invoke($parser, 'aggregator_parse', 'parse', $feed); + if ($channel === TRUE) { + db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']); + drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title']))); + return; } - if ($feed['modified']) { - $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed['modified']) . 
' GMT'; + if ($channel === FALSE) { + return; } + $channel = array_merge($channel, $feed); + if (is_array($channel)) { + $processors = variable_get('aggregator_processor', array()); + foreach ($processors as $processor) { + $channel = module_invoke($processor, 'aggregator_process', 'unique', $channel); + } + foreach ($processors as $processor) { + $channel = module_invoke($processor, 'aggregator_process', 'save', $channel); + } - // Request feed. - $result = drupal_http_request($feed['url'], $headers); - - // Process HTTP response code. - switch ($result->code) { - case 304: - db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']); - drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title']))); - break; - case 301: - $feed['url'] = $result->redirect_url; - // Do not break here. - case 200: - case 302: - case 307: - // We store the md5 hash of feed data in the database. When refreshing a - // feed we compare stored hash and new hash calculated from downloaded - // data. If both are equal we say that feed is not updated. - $md5 = md5($result->data); - if ($feed['hash'] == $md5) { - db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']); - drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title']))); - break; - } - - // Filter the input data. - if (aggregator_parse_feed($result->data, $feed)) { - $modified = empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']); - - // Prepare the channel data. - foreach ($channel as $key => $value) { - $channel[$key] = trim($value); - } - - // Prepare the image data (if any). 
- foreach ($image as $key => $value) { - $image[$key] = trim($value); - } + $image = $channel['image']; - if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) { - // TODO: we should really use theme_image() here, but that only works with - // local images. It won't work with images fetched with a URL unless PHP version > 5. - $image = '' . check_plain($image['TITLE']) . ''; - } - else { - $image = NULL; - } + // Prepare the channel data. + foreach ($channel as $key => $value) { + if (!is_array($value)) { + $channel[$key] = trim($value); + } + } - $etag = empty($result->headers['ETag']) ? '' : $result->headers['ETag']; - // Update the feed data. - db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', hash = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['LINK'], $channel['DESCRIPTION'], $image, $md5, $etag, $modified, $feed['fid']); + // Prepare the image data (if any). + if (is_array($image)) { + foreach ($image as $key => $value) { + $image[$key] = trim($value); + } + } - // Clear the cache. - cache_clear_all(); + if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) { + // TODO: we should really use theme_image() here, but that only works with + // local images. It won't work with images fetched with a URL unless PHP version > 5. + $image = '' . check_plain($image['TITLE']) . ''; + } + else { + $image = NULL; + } - if (isset($result->redirect_url)) { - watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url'])); - } + // Update the feed data. 
+ db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', hash = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['link'], $channel['description'], $image, $channel['md5'], $channel['etag'], $channel['modified'], $feed['fid']); - watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title'])); - drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title']))); - } - break; - default: - watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING); - drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error))); - module_invoke('system', 'check_http_request'); + // Clear the cache. + cache_clear_all(); + } } @@ -697,148 +697,16 @@ function aggregator_parse_w3cdtf($date_s } /** - * Parse a feed and store its items. - * - * @param $data - * The feed data. - * @param $feed - * An associative array describing the feed to be parsed. - * @return - * FALSE on error, TRUE otherwise. - */ -function aggregator_parse_feed(&$data, $feed) { - global $items, $image, $channel; - - // Unset the global variables before we use them. - unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']); - $items = array(); - $image = array(); - $channel = array(); - - // Parse the data. 
- $xml_parser = drupal_xml_parser_create($data); - xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end'); - xml_set_character_data_handler($xml_parser, 'aggregator_element_data'); - - if (!xml_parse($xml_parser, $data, 1)) { - watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser)), WATCHDOG_WARNING); - drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser))), 'error'); - return FALSE; - } - xml_parser_free($xml_parser); - - // We reverse the array such that we store the first item last, and the last - // item first. In the database, the newest item should be at the top. - $items = array_reverse($items); - - // Initialize variables. - $title = $link = $author = $description = $guid = NULL; - foreach ($items as $item) { - unset($title, $link, $author, $description, $guid); - - // Prepare the item: - foreach ($item as $key => $value) { - $item[$key] = trim($value); - } - - // Resolve the item's title. If no title is found, we use up to 40 - // characters of the description ending at a word boundary, but not - // splitting potential entities. - if (!empty($item['TITLE'])) { - $title = $item['TITLE']; - } - elseif (!empty($item['DESCRIPTION'])) { - $title = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['DESCRIPTION'], 40)); - } - else { - $title = ''; - } - - // Resolve the items link. - if (!empty($item['LINK'])) { - $link = $item['LINK']; - } - else { - $link = $feed['link']; - } - $guid = isset($item['GUID']) ? $item['GUID'] : ''; - - // Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag. 
- if (!empty($item['CONTENT:ENCODED'])) { - $item['DESCRIPTION'] = $item['CONTENT:ENCODED']; - } - elseif (!empty($item['SUMMARY'])) { - $item['DESCRIPTION'] = $item['SUMMARY']; - } - elseif (!empty($item['CONTENT'])) { - $item['DESCRIPTION'] = $item['CONTENT']; - } - - // Try to resolve and parse the item's publication date. If no date is - // found, use the current date instead. - $date = 'now'; - foreach (array('PUBDATE', 'DC:DATE', 'DCTERMS:ISSUED', 'DCTERMS:CREATED', 'DCTERMS:MODIFIED', 'ISSUED', 'CREATED', 'MODIFIED', 'PUBLISHED', 'UPDATED') as $key) { - if (!empty($item[$key])) { - $date = $item[$key]; - break; - } - } - - $timestamp = strtotime($date); // As of PHP 5.1.0, strtotime returns FALSE on failure instead of -1. - - if ($timestamp <= 0) { - $timestamp = aggregator_parse_w3cdtf($date); // Aggregator_parse_w3cdtf() returns FALSE on failure. - if (!$timestamp) { - // Better than nothing. - $timestamp = time(); - } - } - - // Save this item. Try to avoid duplicate entries as much as possible. If - // we find a duplicate entry, we resolve it and pass along its ID is such - // that we can update it if needed. - if (!empty($guid)) { - $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND guid = '%s'", $feed['fid'], $guid)); - } - else if ($link && $link != $feed['link'] && $link != $feed['url']) { - $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $link)); - } - else { - $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $title)); - } - $item += array('AUTHOR' => '', 'DESCRIPTION' => ''); - aggregator_save_item(array('iid' => (isset($entry->iid) ? 
$entry->iid: ''), 'fid' => $feed['fid'], 'timestamp' => $timestamp, 'title' => $title, 'link' => $link, 'author' => $item['AUTHOR'], 'description' => $item['DESCRIPTION'], 'guid' => $guid)); - } - - // Remove all items that are older than flush item timer. - $age = time() - variable_get('aggregator_clear', 9676800); - $result = db_query('SELECT iid FROM {aggregator_item} WHERE fid = %d AND timestamp < %d', $feed['fid'], $age); - - $items = array(); - $num_rows = FALSE; - while ($item = db_fetch_object($result)) { - $items[] = $item->iid; - $num_rows = TRUE; - } - if ($num_rows) { - db_query('DELETE FROM {aggregator_category_item} WHERE iid IN (' . implode(', ', $items) . ')'); - db_query('DELETE FROM {aggregator_item} WHERE fid = %d AND timestamp < %d', $feed['fid'], $age); - } - - return TRUE; -} - -/** * Add/edit/delete an aggregator item. * * @param $edit * An associative array describing the item to be added/edited/deleted. */ function aggregator_save_item($edit) { - if ($edit['iid'] && $edit['title']) { + if (is_numeric($edit['iid']) && $edit['title']) { db_query("UPDATE {aggregator_item} SET title = '%s', link = '%s', author = '%s', description = '%s', guid = '%s', timestamp = %d WHERE iid = %d", $edit['title'], $edit['link'], $edit['author'], $edit['description'], $edit['guid'], $edit['timestamp'], $edit['iid']); } - elseif ($edit['iid']) { + elseif (is_numeric($edit['iid'])) { db_query('DELETE FROM {aggregator_item} WHERE iid = %d', $edit['iid']); db_query('DELETE FROM {aggregator_category_item} WHERE iid = %d', $edit['iid']); } @@ -871,6 +739,25 @@ function aggregator_feed_load($fid) { } /** + * Tells if the given module is enabled (as parser or as processor). + * + * @param $module + * The name of the module. + * @return + * TRUE if enabled, FALSE if disabled. 
+ */ +function aggregator_is_enabled($module) { + if ($module == variable_get('aggregator_parser', '')) { + return TRUE; + } + $processors = array_values(variable_get('aggregator_processors', array())); + if (in_array($module, $processors, TRUE)) { + return TRUE; + } + return FALSE; +} + +/** * Load an aggregator category. * * @param $cid @@ -937,3 +824,17 @@ function aggregator_filter_xss($value) { function _aggregator_items($count) { return format_plural($count, '1 item', '@count items'); } + +/** + * Checks for time limits in cron processing. + */ +function _aggregator_cron_time() { + static $time_limit; + $execute_percentage = 0.5; + if (!$time_limit) { + $time_limit = time() + ($execute_percentage / 100) * ini_get('max_execution_time'); + // However, check for left time, maybe some other cron processing already occured. + $time_limit = min($time_limit, variable_get('cron_semaphore', 0) + ini_get('max_execution_time')); + } + return max($time_limit - time(), 0); +} Index: modules/aggregator/aggregator.parser.inc =================================================================== RCS file: modules/aggregator/aggregator.parser.inc diff -N modules/aggregator/aggregator.parser.inc --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ modules/aggregator/aggregator.parser.inc 20 Aug 2008 11:40:58 -0000 @@ -0,0 +1,271 @@ +attributes(); + $type = strtolower($data->getName()); + if (isset($data->entry) || $type == "feed") { + return "atom"; + } + if ($type == "rdf" && isset($data->channel)) { + return "rdf"; + } + if ($type == "rss" && in_array($attr["version"], array('0.91', "0.92", "2.0"))) { + return "rss"; + } + } + return FALSE; +} + +/** + * Parses RSS 2.0, 0.91, 0.92 feeds. 
+ */
+function aggregator_parser_rss(SimpleXMLElement $data) {
+  $feed = array();
+  // Dublin Core children of the channel, used as fallbacks for the plain RSS
+  // elements below.
+  $channel_dc = $data->channel->children('http://purl.org/dc/elements/1.1/');
+  $feed['title'] = _aggregator_parser_choose("{$data->channel->title}", "{$channel_dc->title}");
+  $feed['description'] = _aggregator_parser_choose("{$data->channel->description}", "{$channel_dc->subject}");
+  $feed['link'] = isset($data->channel->link) ? "{$data->channel->link}" : '';
+  $feed['image'] = isset($data->channel->image->url) ? "{$data->channel->image->url}" : '';
+  $feed['items'] = array();
+
+  // The namespace list is read from the whole document, so it is identical
+  // for every item: look it up once instead of once per item.
+  $namespaces = $data->getNamespaces(TRUE);
+
+  // xpath() may return FALSE instead of an array when the query fails; treat
+  // that like a feed without items instead of iterating over a non-array.
+  $news_items = $data->xpath('//item');
+  if (empty($news_items)) {
+    return $feed;
+  }
+
+  foreach ($news_items as $news) {
+    // Get important namespaces.
+    $content = $news->children('http://purl.org/rss/1.0/modules/content/');
+    $item_dc = $news->children('http://purl.org/dc/elements/1.1/');
+
+    $item = array();
+    $item['guid'] = isset($news->guid) ? "{$news->guid}" : NULL;
+    $item['title'] = _aggregator_parser_choose("{$news->title}", "{$item_dc->title}");
+    $item['description'] = _aggregator_parser_choose("{$news->description}", "{$news->encoded}", "{$content->encoded}", "{$item_dc->description}");
+    $item['link'] = _aggregator_parser_choose("{$news->link}");
+    $item['timestamp'] = _aggregator_parse_date("{$news->pubDate}");
+
+    // Iterating a SimpleXML child list only ever yields SimpleXMLElement
+    // objects, so every <category> is handled the same way: cast to string,
+    // strip markup, trim.
+    $item['categories'] = array();
+    if (isset($news->category)) {
+      foreach ($news->category as $cat) {
+        $item['categories'][] = trim(strip_tags("$cat"));
+      }
+    }
+    $item['categories'] = array_unique($item['categories']);
+
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $namespaces);
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Parses Atom 1.0 feeds.
+ *
+ * @param $data
+ *   The SimpleXMLElement of the feed's root element.
+ * @return
+ *   An array with the keys 'title', 'description', 'link' and 'items'. Each
+ *   entry of 'items' is an array with the keys 'guid', 'link', 'title',
+ *   'description', 'timestamp', 'categories', 'namespaces' and 'enclosures'.
+ */
+function aggregator_parser_atom(SimpleXMLElement $data) {
+  $feed = array();
+  $feed['title'] = isset($data->title) ? "{$data->title}" : '';
+  $feed['description'] = isset($data->subtitle) ? "{$data->subtitle}" : '';
+  // Prefer the "alternate" link, but keep accepting any link that carries an
+  // href, as the first <link> of the feed was always used before.
+  $feed['link'] = _aggregator_parser_atom_link($data, TRUE);
+  // $feed is an array, so the item list has to be initialized with array
+  // syntax; otherwise a feed without entries has no 'items' key at all.
+  $feed['items'] = array();
+
+  // Identical for every entry, so it is looked up only once.
+  $namespaces = $data->getNamespaces(TRUE);
+
+  foreach ($data->entry as $news) {
+    $item = array();
+    $item['guid'] = !empty($news->id) ? "{$news->id}" : NULL;
+
+    // Atom carries the URL in the href attribute of <link>; the element text
+    // is only kept as a fallback, followed by the id when it is a URL.
+    $link_guid = valid_url($item['guid']) ? $item['guid'] : '';
+    $item['link'] = _aggregator_parser_choose(_aggregator_parser_atom_link($news), "{$news->link}", $link_guid);
+    $item['title'] = "{$news->title}";
+
+    // <content> wins over <summary>.
+    $body = '';
+    if (!empty($news->content)) {
+      $body = _aggregator_parser_atom_text($news->content);
+    }
+    elseif (!empty($news->summary)) {
+      $body = _aggregator_parser_atom_text($news->summary);
+    }
+    $item['description'] = $body;
+
+    // <published> is optional in Atom while <updated> is mandatory, so fall
+    // back to <updated> before the date parser has to default to "now".
+    $item['timestamp'] = _aggregator_parse_date(_aggregator_parser_choose("{$news->published}", "{$news->updated}"));
+
+    $item['categories'] = array();
+    if (isset($news->category)) {
+      foreach ($news->category as $category) {
+        $item['categories'][] = trim(strip_tags("{$category['term']}"));
+      }
+    }
+    $item['categories'] = array_unique($item['categories']);
+
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $namespaces);
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Finds the URL of an Atom feed or entry among its <link> children.
+ *
+ * @param $element
+ *   The feed or entry element whose <link> children are inspected.
+ * @param $allow_any
+ *   When TRUE and no suitable link is found, the href of the first <link>
+ *   that has one is returned regardless of its rel attribute.
+ * @return
+ *   The href of the first <link> whose rel attribute is missing or
+ *   "alternate", the fallback described above, or an empty string.
+ */
+function _aggregator_parser_atom_link(SimpleXMLElement $element, $allow_any = FALSE) {
+  $fallback = '';
+  foreach ($element->link as $link) {
+    $href = "{$link['href']}";
+    if ($href === '') {
+      continue;
+    }
+    $rel = "{$link['rel']}";
+    if ($rel === '' || $rel === 'alternate') {
+      return $href;
+    }
+    if ($fallback === '') {
+      $fallback = $href;
+    }
+  }
+  return $allow_any ? $fallback : '';
+}
+
+/**
+ * Builds the body text of an Atom <content> or <summary> element.
+ *
+ * @param $element
+ *   The <content> or <summary> element.
+ * @return
+ *   The serialized XML of the element's children followed by the element's
+ *   own text.
+ */
+function _aggregator_parser_atom_text(SimpleXMLElement $element) {
+  $text = '';
+  foreach ($element->children() as $child) {
+    $text .= $child->asXML();
+  }
+  return $text . "{$element}";
+}
+
+/**
+ * Parses RDF feeds.
+ *
+ * @param $data
+ *   The SimpleXMLElement of the feed's root element.
+ * @return
+ *   An array with the keys 'title', 'description', 'link' and 'items'. Each
+ *   entry of 'items' is an array with the keys 'title', 'description',
+ *   'timestamp', 'link', 'guid', 'categories', 'namespaces' and 'enclosures'.
+ */
+function aggregator_parser_rdf(SimpleXMLElement $data) {
+  $feed = array();
+  $feed['title'] = isset($data->channel->title) ? "{$data->channel->title}" : '';
+  $feed['description'] = isset($data->channel->description) ? "{$data->channel->description}" : '';
+  $feed['link'] = isset($data->channel->link) ? "{$data->channel->link}" : '';
+  $namespaces = $data->getNamespaces(TRUE);
+  // Set category splitter (space is for del.icio.us feed).
+  $category_splitter = ' ';
+  $feed['items'] = array();
+
+  foreach ($data->item as $news) {
+    // Reset every per-item value, so that nothing (the link in particular)
+    // leaks over from the previous item.
+    $id = NULL;
+    $title = $body = $link = $time_in = '';
+    $categories = array();
+
+    foreach ($namespaces as $ns_link) {
+      // Get about attribute as guid.
+      foreach ($news->attributes($ns_link) as $name => $value) {
+        if ($name == 'about') {
+          $id = "{$value}";
+        }
+      }
+
+      // Get children for current namespace. No fallback for PHP older than
+      // 5.1.2 is needed here: getNamespaces(), called above, does not exist
+      // in those versions either.
+      $ns = (array) $news->children($ns_link);
+
+      // Title
+      if (!empty($ns['title'])) {
+        $title = "{$ns['title']}";
+      }
+
+      // Description or dc:description
+      if (!empty($ns['description']) && $body == '') {
+        $body = "{$ns['description']}";
+      }
+
+      // Link
+      if (!empty($ns['link'])) {
+        $link = "{$ns['link']}";
+      }
+
+      // content:encoded
+      if (!empty($ns['encoded'])) {
+        $body = "{$ns['encoded']}";
+      }
+
+      // pubDate or dc:date. Only a non-empty value is remembered, so a date
+      // found in one namespace is not wiped out again by the namespaces that
+      // are visited after it.
+      if (!empty($ns['pubDate'])) {
+        $time_in = "{$ns['pubDate']}";
+      }
+      elseif (!empty($ns['date'])) {
+        $time_in = "{$ns['date']}";
+      }
+
+      // dc:subject
+      if (!empty($ns['subject'])) {
+        // There can be multiple category tags.
+        if (is_array($ns['subject'])) {
+          foreach ($ns['subject'] as $cat) {
+            if (is_object($cat)) {
+              $categories[] = trim(strip_tags($cat->asXML()));
+            }
+            else {
+              $categories[] = $cat;
+            }
+          }
+        }
+        // Or a single tag holding several categories.
+        else {
+          $categories = explode($category_splitter, "{$ns['subject']}");
+        }
+      }
+    }
+
+    $item = array();
+    $item['title'] = $title;
+    $item['description'] = $body;
+    // Parsed once per item, after all namespaces have been inspected.
+    $item['timestamp'] = _aggregator_parse_date($time_in);
+    $item['link'] = $link;
+    $item['guid'] = $id;
+    $item['categories'] = $categories;
+    $item['namespaces'] = aggregator_parser_extract_namespaces($news, $namespaces);
+    $item['enclosures'] = aggregator_parser_extract_enclosures($news);
+    $feed['items'][] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Extracts all the namespace-contained information to ->namespaces structure.
+ *
+ * @param $item
+ *   The feed item to inspect.
+ * @param $namespaces
+ *   An array of namespace URIs keyed by prefix, as returned by
+ *   SimpleXMLElement::getNamespaces().
+ * @return
+ *   An array keyed by namespace prefix; each value is the array cast of the
+ *   item's children in that namespace. Namespaces with an empty prefix or
+ *   without children in the item are left out.
+ */
+function aggregator_parser_extract_namespaces(SimpleXMLElement $item, $namespaces) {
+  $result = array();
+  foreach ($namespaces as $prefix => $url) {
+    // An entry without a usable prefix cannot be keyed in the result.
+    if (empty($prefix)) {
+      continue;
+    }
+    $ns = (array) $item->children($url);
+    if (!empty($ns)) {
+      $result[$prefix] = $ns;
+    }
+  }
+  return $result;
+}
+
+/**
+ * Extracts all enclosures inside an item.
+ *
+ * @param $item
+ *   The feed item to inspect.
+ * @return
+ *   An array with one entry per <enclosure> element and per
+ *   <link rel="enclosure"> element; each entry is an array of the element's
+ *   attributes (name => string value).
+ */
+function aggregator_parser_extract_enclosures(SimpleXMLElement $item) {
+  $result = array();
+
+  // The item is re-parsed as a document of its own before it is queried,
+  // presumably so that the document-wide "//" queries below cannot match
+  // elements of other items. Parser complaints about the fragment are
+  // collected by libxml instead of being raised as PHP warnings.
+  $previous = libxml_use_internal_errors(TRUE);
+  $fragment = simplexml_load_string($item->asXML());
+  if (!$previous) {
+    libxml_clear_errors();
+  }
+  libxml_use_internal_errors($previous);
+  if ($fragment === FALSE) {
+    return $result;
+  }
+
+  // The match lists are appended to each other. The array union operator (+)
+  // must not be used here: both lists are numbered from 0, so it would drop
+  // every match of the second query whose index also exists in the first.
+  $possible_enclosures = array();
+  foreach (array("//enclosure", "//link[@rel='enclosure']") as $query) {
+    $matches = $fragment->xpath($query);
+    // xpath() may return FALSE instead of an array when the query fails.
+    if (is_array($matches)) {
+      $possible_enclosures = array_merge($possible_enclosures, $matches);
+    }
+  }
+
+  foreach ($possible_enclosures as $enc) {
+    $add_enc = array();
+    foreach ($enc->attributes() as $k => $v) {
+      $add_enc[$k] = "{$v}";
+    }
+    $result[] = $add_enc;
+  }
+  return $result;
+}
+
+/**
+ * Chooses the first argument which is not empty and returns it.
+ *
+ * An argument counts as empty when nothing is left after trimming
+ * whitespace, so one-character values such as "5" are accepted while
+ * whitespace-only strings are skipped.
+ *
+ * @param ...
+ *   Any number of strings, in order of preference.
+ * @return
+ *   The first non-empty argument, unchanged, or an empty string if there is
+ *   none.
+ */
+function _aggregator_parser_choose() {
+  foreach (func_get_args() as $arg) {
+    if (trim((string) $arg) !== '') {
+      return $arg;
+    }
+  }
+  return '';
+}
+
+/**
+ * Parses a date that comes from a feed.
+ *
+ * @param $date_str
+ *   The date string in various formats.
+ * @return
+ *   The timestamp of the string or the current time if it can't be parsed.
+ */
+function _aggregator_parse_date($date_str) {
+  $parsed_date = strtotime($date_str);
+  // strtotime() signals failure with FALSE; before PHP 5.1.0 it returned -1.
+  if ($parsed_date === FALSE || $parsed_date == -1) {
+    $parsed_date = aggregator_parse_w3cdtf($date_str);
+  }
+  return $parsed_date === FALSE ? time() : $parsed_date;
+}