Index: modules/aggregator/aggregator.module =================================================================== RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.module,v retrieving revision 1.382 diff -u -r1.382 aggregator.module --- modules/aggregator/aggregator.module 5 Jul 2008 05:57:00 -0000 1.382 +++ modules/aggregator/aggregator.module 1 Aug 2008 17:28:39 -0000 @@ -443,192 +443,54 @@ } /** - * Callback function used by the XML parser. - */ -function aggregator_element_start($parser, $name, $attributes) { - global $item, $element, $tag, $items, $channel; - - switch ($name) { - case 'IMAGE': - case 'TEXTINPUT': - case 'CONTENT': - case 'SUMMARY': - case 'TAGLINE': - case 'SUBTITLE': - case 'LOGO': - case 'INFO': - $element = $name; - break; - case 'ID': - if ($element != 'ITEM') { - $element = $name; - } - case 'LINK': - if (!empty($attributes['REL']) && $attributes['REL'] == 'alternate') { - if ($element == 'ITEM') { - $items[$item]['LINK'] = $attributes['HREF']; - } - else { - $channel['LINK'] = $attributes['HREF']; - } - } - break; - case 'ITEM': - $element = $name; - $item += 1; - break; - case 'ENTRY': - $element = 'ITEM'; - $item += 1; - break; - } - - $tag = $name; -} - -/** - * Call-back function used by the XML parser. - */ -function aggregator_element_end($parser, $name) { - global $element; - - switch ($name) { - case 'IMAGE': - case 'TEXTINPUT': - case 'ITEM': - case 'ENTRY': - case 'CONTENT': - case 'INFO': - $element = ''; - break; - case 'ID': - if ($element == 'ID') { - $element = ''; - } - } -} - -/** - * Callback function used by the XML parser. 
- */ -function aggregator_element_data($parser, $data) { - global $channel, $element, $items, $item, $image, $tag; - $items += array($item => array()); - switch ($element) { - case 'ITEM': - $items[$item] += array($tag => ''); - $items[$item][$tag] .= $data; - break; - case 'IMAGE': - case 'LOGO': - $image += array($tag => ''); - $image[$tag] .= $data; - break; - case 'LINK': - if ($data) { - $items[$item] += array($tag => ''); - $items[$item][$tag] .= $data; - } - break; - case 'CONTENT': - $items[$item] += array('CONTENT' => ''); - $items[$item]['CONTENT'] .= $data; - break; - case 'SUMMARY': - $items[$item] += array('SUMMARY' => ''); - $items[$item]['SUMMARY'] .= $data; - break; - case 'TAGLINE': - case 'SUBTITLE': - $channel += array('DESCRIPTION' => ''); - $channel['DESCRIPTION'] .= $data; - break; - case 'INFO': - case 'ID': - case 'TEXTINPUT': - // The sub-element is not supported. However, we must recognize - // it or its contents will end up in the item array. - break; - default: - $channel += array($tag => ''); - $channel[$tag] .= $data; - } -} - -/** * Checks a news feed for new items. * * @param $feed * An associative array describing the feed to be refreshed. */ function aggregator_refresh($feed) { - global $channel, $image; - // Generate conditional GET headers. - $headers = array(); - if ($feed['etag']) { - $headers['If-None-Match'] = $feed['etag']; - } - if ($feed['modified']) { - $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed['modified']) . ' GMT'; - } - - // Request feed. - $result = drupal_http_request($feed['url'], $headers); - - // Process HTTP response code. 
- switch ($result->code) { - case 304: - db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']); - drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title']))); - break; - case 301: - $feed['url'] = $result->redirect_url; - watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url'])); - // Do not break here. - case 200: - case 302: - case 307: - // Filter the input data. - if (aggregator_parse_feed($result->data, $feed)) { - $modified = empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']); - - // Prepare the channel data. - foreach ($channel as $key => $value) { - $channel[$key] = trim($value); - } - - // Prepare the image data (if any). - foreach ($image as $key => $value) { - $image[$key] = trim($value); - } - - if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) { - // TODO: we should really use theme_image() here, but that only works with - // local images. It won't work with images fetched with a URL unless PHP version > 5. - $image = '' . check_plain($image['TITLE']) . ''; - } - else { - $image = NULL; - } - - $etag = empty($result->headers['ETag']) ? '' : $result->headers['ETag']; - // Update the feed data. - db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['LINK'], $channel['DESCRIPTION'], $image, $etag, $modified, $feed['fid']); + $etag = isset($feed['etag']) ? $feed['etag'] : ''; + $modified = isset($feed['modified']) ? $feed['modified'] : ''; - // Clear the cache. - cache_clear_all(); + // Retrieve feed. 
+ $result = drupal_retrieve_feed($feed['url'], $etag, $modified); - watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title'])); - drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title']))); - break; - } - $result->error = t('feed not parseable'); - // Do not break here.. - default: - watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING); - drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error))); - module_invoke('system', 'check_http_request'); + if (isset($result->redirect_url)) { + $feed['url'] = $result->redirect_url; + } + + // Filter the items data. + if (!isset($result->error) && isset($result->items) && aggregator_save_items($result->items, $feed)) { + if (!empty($result->image['LINK']) && !empty($result->image['URL']) && !empty($result->image['TITLE'])) { + // TODO: we should really use theme_image() here, but that only works with + // local images. It won't work with images fetched with a URL unless PHP version > 5. + $image = '' . check_plain($result->image['TITLE']) . ''; + } + else { + $image = NULL; + } + + // Update the feed data. + db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $result->channel['LINK'], $result->channel['DESCRIPTION'], $image, $result->etag, $result->modified, $feed['fid']); + + // Clear the cache. 
+ cache_clear_all(); + + if (isset($result->redirect_url)) { + watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url'])); + } + + watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title'])); + drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title']))); + } + else { + db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']); + drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title']))); + if (isset($result->error)) { + watchdog('aggregator', 'The feed from %site seems to be broken, due to error "%error".', array('%site' => $feed['title'], '%error' => $result->error), WATCHDOG_WARNING); + drupal_set_message(t('The feed from %site seems to be broken, due to error "%error".', array('%site' => $feed['title'], '%error' => $result->error))); + } } } @@ -673,7 +535,7 @@ } /** - * Parse a feed and store its items. + * Store a feed's items. * * @param $data * The feed data. @@ -682,40 +544,15 @@ * @return * FALSE on error, TRUE otherwise. */ -function aggregator_parse_feed(&$data, $feed) { - global $items, $image, $channel; - - // Unset the global variables before we use them. - unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']); - $items = array(); - $image = array(); - $channel = array(); - - // Parse the data. 
- $xml_parser = drupal_xml_parser_create($data); - xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end'); - xml_set_character_data_handler($xml_parser, 'aggregator_element_data'); - - if (!xml_parse($xml_parser, $data, 1)) { - watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser)), WATCHDOG_WARNING); - drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser))), 'error'); - return FALSE; - } - xml_parser_free($xml_parser); - +function aggregator_save_items(&$items, $feed) { // We reverse the array such that we store the first item last, and the last // item first. In the database, the newest item should be at the top. $items = array_reverse($items); // Initialize variables. - $title = $link = $author = $description = $guid = NULL; + $title = $link = $guid = NULL; foreach ($items as $item) { - unset($title, $link, $author, $description, $guid); - - // Prepare the item: - foreach ($item as $key => $value) { - $item[$key] = trim($value); - } + unset($title, $link, $guid); // Resolve the item's title. 
If no title is found, we use up to 40 // characters of the description ending at a word boundary, but not @@ -821,7 +658,7 @@ elseif ($edit['title'] && $edit['link']) { db_query("INSERT INTO {aggregator_item} (fid, title, link, author, description, timestamp, guid) VALUES (%d, '%s', '%s', '%s', '%s', %d, '%s')", $edit['fid'], $edit['title'], $edit['link'], $edit['author'], $edit['description'], $edit['timestamp'], $edit['guid']); $edit['iid'] = db_last_insert_id('aggregator_item', 'iid'); - // file the items in the categories indicated by the feed + // File the items in the categories indicated by the feed. $categories = db_query('SELECT cid FROM {aggregator_category_feed} WHERE fid = %d', $edit['fid']); while ($category = db_fetch_object($categories)) { db_query('INSERT INTO {aggregator_category_item} (cid, iid) VALUES (%d, %d)', $category->cid, $edit['iid']); Index: includes/common.inc =================================================================== RCS file: /cvs/drupal/drupal/includes/common.inc,v retrieving revision 1.779 diff -u -r1.779 common.inc --- includes/common.inc 19 Jul 2008 10:38:13 -0000 1.779 +++ includes/common.inc 1 Aug 2008 17:28:34 -0000 @@ -3577,3 +3577,62 @@ } variable_set('css_js_query_string', $new_character . substr($string_history, 0, 19)); } + +/** + * Fetch a feed from a URL and parse it. + * + * @param $url + * A string containing a fully qualified URI. + * @param $etag + * Optional ETag of the previous fetch; when non-empty it is sent as an If-None-Match header. + * @param $modified + * Optional timestamp of the previous fetch; when non-empty it is sent as an If-Modified-Since header. + * @return + * The object returned by feed_parse() with etag, modified and (after a 301) redirect_url added. On a parse or HTTP failure the returned object carries an error property instead. + */ +function drupal_retrieve_feed($url, $etag = NULL, $modified = NULL) { + $feed = new stdClass(); + + // Generate conditional GET headers; empty values ('' or 0) mean "not known" and are skipped. + $headers = array(); + if (!empty($etag)) { + $headers['If-None-Match'] = $etag; + } + if (!empty($modified)) { + $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $modified) . ' GMT'; + } + + // Request feed.
+ $result = drupal_http_request($url, $headers); + + // Process HTTP response code. + switch ($result->code) { + case 301: + $url = $redirect_url = $result->redirect_url; + // Do not break here. + case 200: + case 302: + case 307: + // Allow alternate feed parsing libraries. + require_once variable_get('feed_inc', './includes/feed.inc'); + + $feed = feed_parse($result->data, $url); + // A parse failure is reported through $feed->error; return it instead of leaving the switch, which would return NULL. + if (isset($feed->error)) { + return $feed; + } + + $feed->etag = empty($result->headers['ETag']) ? '' : $result->headers['ETag']; + $feed->modified = empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']); + if (isset($redirect_url)) { + $feed->redirect_url = $redirect_url; + } + return $feed; + default: + if (!isset($feed->error) && isset($result->error)) { + module_invoke('system', 'check_http_request'); + $feed->error = $result->code . ' ' . $result->error; + } + return $feed; + } +} Index: includes/feed.inc =================================================================== RCS file: includes/feed.inc diff -N includes/feed.inc --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ includes/feed.inc 1 Jan 1970 00:00:00 -0000 @@ -0,0 +1,175 @@ + array()); + switch ($element) { + case 'ITEM': + $items[$item] += array($tag => ''); + $items[$item][$tag] .= $data; + break; + case 'IMAGE': + case 'LOGO': + $image += array($tag => ''); + $image[$tag] .= $data; + break; + case 'LINK': + if ($data) { + $items[$item] += array($tag => ''); + $items[$item][$tag] .= $data; + } + break; + case 'CONTENT': + $items[$item] += array('CONTENT' => ''); + $items[$item]['CONTENT'] .= $data; + break; + case 'SUMMARY': + $items[$item] += array('SUMMARY' => ''); + $items[$item]['SUMMARY'] .= $data; + break; + case 'TAGLINE': + case 'SUBTITLE': + $channel += array('DESCRIPTION' => ''); + $channel['DESCRIPTION'] .= $data; + break; + case 'INFO': + case 'ID': + case 'TEXTINPUT': + // The sub-element is not supported.
However, we must recognize + // it or its contents will end up in the item array. + break; + default: + $channel += array($tag => ''); + $channel[$tag] .= $data; + } +} + +/** + * Parse raw feed XML into channel, image and item data. + * + * @param $data + * The raw feed data, passed by reference. + * @param $url + * The URL to the feed; not read by this implementation. + * @return + * An object with channel, image and items properties. If the XML cannot be parsed, an object with only an error property (message and line number) is returned; FALSE is never returned. + */ +function feed_parse(&$data, $url) { + global $items, $image, $channel; + $feed = new stdClass(); + + // Reset the global parser state before we use it. + unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']); + $items = array(); + $image = array(); + $channel = array(); + + // Parse the data. + $xml_parser = drupal_xml_parser_create($data); + xml_set_element_handler($xml_parser, '_feed_element_start', '_feed_element_end'); + xml_set_character_data_handler($xml_parser, '_feed_element_data'); + + if (!xml_parse($xml_parser, $data, 1)) { + $feed->error = xml_error_string(xml_get_error_code($xml_parser)) . ' on line ' . xml_get_current_line_number($xml_parser); + return $feed; + } + xml_parser_free($xml_parser); + + // Prepare the channel data. + foreach ($channel as $key => $value) { + $channel[$key] = trim($value); + } + $feed->channel = $channel; + + // Prepare the image data (if any). + foreach ($image as $key => $value) { + $image[$key] = trim($value); + } + $feed->image = $image; + + $feed->items = array(); + foreach ($items as $item) { + // Prepare the item.
+ foreach ($item as $key => $value) { + $item[$key] = trim($value); + } + $feed->items[] = $item; + } + + return $feed; +} Index: includes/tests/feed.test =================================================================== RCS file: includes/tests/feed.test diff -N includes/tests/feed.test --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ includes/tests/feed.test 1 Jan 1970 00:00:00 -0000 @@ -0,0 +1,229 @@ + t('Feed parsing'), + 'description' => t('Test feed parsing with RSS, RDF and Atom formatted sample feeds.'), + 'group' => t('System'), + ); + } + + function getRSS091Sample() { + $data = << + + + WriteTheWeb + http://writetheweb.com + News for web users that write back + en-us + Copyright 2000, WriteTheWeb team. + editor@writetheweb.com + webmaster@writetheweb.com + + WriteTheWeb + http://writetheweb.com/images/mynetscape88.gif + http://writetheweb.com + 88 + 31 + News for web users that write back + + + Giving the world a pluggable Gnutella + http://writetheweb.com/read.php?item=24 + WorldOS is a framework on which to build programs that work like Freenet or Gnutella -allowing distributed applications using peer-to-peer routing. + + + +EOT; + return $data; + } + + function getRSS092Sample() { + $data = << + + + + Dave Winer: Grateful Dead + http://www.scripting.com/blog/categories/gratefulDead.html + A high-fidelity Grateful Dead song every day. This is where we're experimenting with enclosures on RSS news items that download when you're not using your computer. If it works (it will) it will be the end of the Click-And-Wait multimedia experience on the Internet. + Fri, 13 Apr 2001 19:23:02 GMT + http://backend.userland.com/rss092 + dave@userland.com (Dave Winer) + dave@userland.com (Dave Winer) + + + It's been a few days since I added a song to the Grateful Dead channel.
Now that there are all these new Radio users, many of whom are tuned into this channel (it's #16 on the hotlist of upstreaming Radio users, there's no way of knowing how many non-upstreaming users are subscribing, have to do something about this..). Anyway, tonight's song is a live version of Weather Report Suite from Dick's Picks Volume 7. It's wistful music. Of course a beautiful song, oft-quoted here on Scripting News. <i>A little change, the wind and rain.</i> + + + + + +EOT; + return $data; + } + + function getRSS2Sample() { + $data = << + + + Liftoff News + http://liftoff.msfc.nasa.gov/ + Liftoff to Space Exploration. + en-us + Tue, 10 Jun 2003 04:00:00 GMT + Tue, 10 Jun 2003 09:41:01 GMT + http://blogs.law.harvard.edu/tech/rss + Weblog Editor 2.0 + editor@example.com + webmaster@example.com + + Star City + http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp + How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. + Tue, 03 Jun 2003 09:39:21 GMT + http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 + + + +EOT; + return $data; + } + + function getRDFSample() { + $data = << + + + World Wide Web Consortium + + +EOT; + return $data; + } + + function getAtomSample() { + $data = << + + Example Feed + + 2003-12-13T18:30:02Z + + John Doe + + urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6 + + Atom-Powered Robots Run Amok + + urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a + 2003-12-13T18:30:02Z + Some text. 
+ + +EOT; + return $data; + } + + function _testRSS091Sample() { + $feed = feed_parse($this->getRSS091Sample(), ''); + + $this->assertEqual($feed->channel['TITLE'], 'WriteTheWeb', t('Channel title retrieved from RSS 0.91 formatted sample.')); + $this->assertEqual($feed->channel['LINK'], 'http://writetheweb.com', t('Channel link retrieved from RSS 0.91 formatted sample.')); + $this->assertEqual($feed->channel['DESCRIPTION'], 'News for web users that write back', t('Channel description retrieved from RSS 0.91 formatted sample.')); + + $this->assertEqual($feed->image['TITLE'], 'WriteTheWeb', t('Image title retrieved from RSS 0.91 formatted sample.')); + $this->assertEqual($feed->image['LINK'], 'http://writetheweb.com', t('Image link retrieved from RSS 0.91 formatted sample.')); + $this->assertEqual($feed->image['URL'], 'http://writetheweb.com/images/mynetscape88.gif', t('Image URL retrieved from RSS 0.91 formatted sample.')); + + $test_item = array(); + foreach ($feed->items as $item) { + $test_item = $item; + } + $this->assertEqual($test_item['TITLE'], 'Giving the world a pluggable Gnutella', t('Item title retrieved from RSS 0.91 formatted sample.')); + $this->assertEqual($test_item['LINK'], 'http://writetheweb.com/read.php?item=24', t('Item link retrieved from RSS 0.91 formatted sample.')); + $this->assertEqual($test_item['DESCRIPTION'], 'WorldOS is a framework on which to build programs that work like Freenet or Gnutella -allowing distributed applications using peer-to-peer routing.', t('Item description retrieved from RSS 0.91 formatted sample.')); + } + + function _testRSS092Sample() { + $feed = feed_parse($this->getRSS092Sample(), ''); + + $this->assertEqual($feed->channel['TITLE'], 'Dave Winer: Grateful Dead', t('Channel title retrieved from RSS 0.92 formatted sample.')); + $this->assertEqual($feed->channel['LINK'], 'http://www.scripting.com/blog/categories/gratefulDead.html', t('Channel link retrieved from RSS 0.92 formatted sample.')); + 
$this->assertEqual($feed->channel['DESCRIPTION'], "A high-fidelity Grateful Dead song every day. This is where we're experimenting with enclosures on RSS news items that download when you're not using your computer. If it works (it will) it will be the end of the Click-And-Wait multimedia experience on the Internet.", t('Channel description retrieved from RSS 0.92 formatted sample.')); + + $test_item = array(); + foreach ($feed->items as $item) { + $test_item = $item; + } + + $this->assertEqual($test_item['DESCRIPTION'], "It's been a few days since I added a song to the Grateful Dead channel. Now that there are all these new Radio users, many of whom are tuned into this channel (it's #16 on the hotlist of upstreaming Radio users, there's no way of knowing how many non-upstreaming users are subscribing, have to do something about this..). Anyway, tonight's song is a live version of Weather Report Suite from Dick's Picks Volume 7. It's wistful music. Of course a beautiful song, oft-quoted here on Scripting News. 
A little change, the wind and rain.", t('Item description retrieved from RSS 0.92 formatted sample.')); + } + + function _testRSS2Sample() { + $feed = feed_parse($this->getRSS2Sample(), ''); + + $this->assertEqual($feed->channel['TITLE'], 'Liftoff News', t('Channel title retrieved from RSS 2.0 formatted sample.')); + $this->assertEqual($feed->channel['LINK'], 'http://liftoff.msfc.nasa.gov/', t('Channel link retrieved from RSS 2.0 formatted sample.')); + $this->assertEqual($feed->channel['DESCRIPTION'], "Liftoff to Space Exploration.", t('Channel description retrieved from RSS 2.0 formatted sample.')); + + $test_item = array(); + foreach ($feed->items as $item) { + $test_item = $item; + } + + $this->assertEqual($test_item['TITLE'], 'Star City', t('Item title retrieved from RSS 2.0 formatted sample.')); + $this->assertEqual($test_item['LINK'], 'http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp', t('Item link retrieved from RSS 2.0 formatted sample.')); + $this->assertEqual($test_item['DESCRIPTION'], "How do Americans get ready to work with Russians aboard the International Space Station? 
They take a crash course in culture, language and protocol at Russia's Star City.", t('Item description retrieved from RSS 2.0 formatted sample.')); + } + + function _testRDFSample() { + $feed = feed_parse($this->getRDFSample(), ''); + + $this->assertEqual($feed->channel['DC:TITLE'], 'World Wide Web Consortium', t('Channel title retrieved from RDF formatted sample.')); + } + + function _testAtomSample() { + $feed = feed_parse($this->getAtomSample(), ''); + + $this->assertEqual($feed->channel['TITLE'], 'Example Feed', t('Channel title retrieved from Atom formatted sample.')); + $this->assertEqual($feed->channel['LINK'], 'http://example.org/', t('Channel link retrieved from Atom formatted sample.')); + + $test_item = array(); + foreach ($feed->items as $item) { + $test_item = $item; + } + + $this->assertEqual($test_item['TITLE'], 'Atom-Powered Robots Run Amok', t('Item title retrieved from Atom formatted sample.')); + $this->assertEqual($test_item['LINK'], 'http://example.org/2003/12/13/atom03', t('Item link retrieved from Atom formatted sample.')); + $this->assertEqual($test_item['SUMMARY'], "Some text.", t('Item description retrieved from Atom formatted sample.')); + } + + function testFeedParsing() { + if (!drupal_function_exists('feed_parse')) { + return; + } + + $this->_testRSS091Sample(); + $this->_testRSS092Sample(); + $this->_testRSS2Sample(); + $this->_testRDFSample(); + $this->_testAtomSample(); + } +}