Index: includes/common.inc
===================================================================
RCS file: /cvs/drupal/drupal/includes/common.inc,v
retrieving revision 1.788
diff -u -p -r1.788 common.inc
--- includes/common.inc 21 Aug 2008 19:36:36 -0000 1.788
+++ includes/common.inc 3 Sep 2008 15:36:53 -0000
@@ -3562,3 +3562,77 @@ function _drupal_flush_css_js() {
}
variable_set('css_js_query_string', $new_character . substr($string_history, 0, 19));
}
+
+/**
+ * Fetch a feed from a URL and parse it.
+ *
+ * @param $url
+ *   A string containing a fully qualified URI.
+ * @param $modified
+ *   Optional timestamp of the last check, sent as If-Modified-Since.
+ * @param $etag
+ *   Optional ETag of the last retrieval, sent as If-None-Match.
+ * @param $md5
+ *   Optional md5 hash of the last retrieved feed data.
+ * @return
+ *   A feed object: ->items (array), ->code (HTTP status), ->error on failure.
+ */
+function drupal_retrieve_feed($url, $modified = NULL, $etag = NULL, $md5 = NULL) {
+  $feed = new stdClass();
+  $feed->items = array();
+
+  // Generate conditional GET headers.
+  $headers = array();
+  if ($etag) {
+    $headers['If-None-Match'] = $etag;
+  }
+  if ($modified) {
+    $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $modified) . ' GMT';
+  }
+
+  // Request feed.
+  $result = drupal_http_request($url, $headers);
+
+  // Process HTTP response code.
+  switch ($result->code) {
+    case 304:
+      break;
+    case 301:
+      $redirect_url = $result->redirect_url;
+      // Do not break here: a permanently moved feed is still processed.
+    case 200:
+    case 302:
+    case 307:
+      if (!isset($result->data)) {
+        break;
+      }
+
+      // Allow alternate feed parsing libraries.
+      require_once variable_get('feed_inc', './includes/feed.inc');
+      feed_set_data($result->data);
+      feed_set_headers($result->headers);
+
+      // The caller stores the md5 hash of the feed data. If the hash of the
+      // downloaded data equals the stored one, the feed was not updated.
+      if ($md5 != NULL && feed_get_hash() == $md5) {
+        break;
+      }
+
+      $feed = feed_parse();
+      // A failed parse may return an object without items.
+      if (!isset($feed->items)) {
+        $feed->items = array();
+      }
+      if (!isset($feed->error) && isset($redirect_url)) {
+        $feed->redirect_url = $redirect_url;
+      }
+      break;
+    default:
+      // Report the HTTP failure to the caller instead of dropping it.
+      $feed->error = isset($result->error) ? $result->error : '';
+      module_invoke('system', 'check_http_request');
+  }
+
+  $feed->code = $result->code;
+  return $feed;
+}
Index: includes/feed.inc
===================================================================
RCS file: includes/feed.inc
diff -N includes/feed.inc
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ includes/feed.inc 3 Sep 2008 15:36:53 -0000
@@ -0,0 +1,446 @@
+attributes();
+ $type = strtolower($data->getName());
+ if (isset($data->entry) || $type == "feed") {
+ return "atom";
+ }
+ if ($type == "rdf") {
+ return "rdf";
+ }
+ if ($type == "rss" && in_array($attr["version"], array('0.91', "0.92", "2.0"))) {
+ return "rss";
+ }
+ }
+ return FALSE;
+}
+
+/**
+ * Parses RSS 2.0, 0.91, 0.92 feeds.
+ *
+ * @param $data
+ *   The SimpleXML object of the feed.
+ * @return
+ *   A feed object with the ->channel, ->image and ->items arrays. Every item
+ *   is an associative array with the keys GUID, TITLE, DESCRIPTION, LINK,
+ *   TIMESTAMP, CATEGORIES, NAMESPACES and ENCLOSURES.
+ */
+function feed_parse_rss(SimpleXMLElement $data) {
+  $feed = new stdClass();
+  $feed->channel = array();
+  $dc = $data->channel->children('http://purl.org/dc/elements/1.1/');
+  $feed->channel['TITLE'] = htmlspecialchars_decode(_feed_parse_choose("{$data->channel->title}", "{$dc->title}"));
+  $feed->channel['DESCRIPTION'] = htmlspecialchars_decode(_feed_parse_choose("{$data->channel->description}", "{$dc->subject}"));
+  $feed->channel['LINK'] = isset($data->channel->link) ? "{$data->channel->link}" : "";
+  $feed->image = array();
+  // Each image field is read only if its own element exists.
+  $feed->image['URL'] = isset($data->channel->image->url) ? "{$data->channel->image->url}" : '';
+  $feed->image['LINK'] = isset($data->channel->image->link) ? "{$data->channel->image->link}" : '';
+  $feed->image['TITLE'] = isset($data->channel->image->title) ? "{$data->channel->image->title}" : '';
+  $feed->items = array();
+
+  // xpath() returns FALSE on error; treat that as a feed without items.
+  $news_items = $data->xpath('//item');
+  if (!is_array($news_items)) {
+    $news_items = array();
+  }
+  // The namespaces of the document are the same for every item.
+  $namespaces = $data->getNamespaces(TRUE);
+
+  foreach ($news_items as $news) {
+    // Get important namespaces.
+    $content = $news->children('http://purl.org/rss/1.0/modules/content/');
+    $dc = $news->children('http://purl.org/dc/elements/1.1/');
+    $item = array();
+    $item['GUID'] = isset($news->guid) ? "{$news->guid}" : NULL;
+    $item['TITLE'] = htmlspecialchars_decode(_feed_parse_choose("{$news->title}", "{$dc->title}"));
+    $item['DESCRIPTION'] = _feed_parse_choose("{$news->description}", "{$news->encoded}", "{$content->encoded}", "{$dc->description}");
+    $item['LINK'] = _feed_parse_choose("{$news->link}");
+    $item['TIMESTAMP'] = _feed_parse_date("{$news->pubDate}");
+    $item['CATEGORIES'] = array();
+    if (isset($news->category)) {
+      // Iterating a SimpleXML child list always yields element objects, so
+      // every category is converted through its string value.
+      foreach ($news->category as $cat) {
+        $item['CATEGORIES'][] = trim(strip_tags("$cat"));
+      }
+    }
+    $item['CATEGORIES'] = array_unique($item['CATEGORIES']);
+    $item['NAMESPACES'] = feed_parse_extract_namespaces($news, $namespaces);
+    $item['ENCLOSURES'] = feed_parse_extract_enclosures($news);
+    $feed->items[] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Parses Atom 1.0 feeds.
+ *
+ * @param $data
+ *   The SimpleXML object of the feed.
+ * @return
+ *   A feed object with the ->channel and ->items arrays. Every item is an
+ *   associative array with the keys GUID, LINK, TITLE, DESCRIPTION,
+ *   TIMESTAMP, CATEGORIES, NAMESPACES and ENCLOSURES.
+ */
+function feed_parse_atom(SimpleXMLElement $data) {
+  $feed = new stdClass();
+  $feed->channel = array();
+  $feed->channel['TITLE'] = isset($data->title) ? "{$data->title}" : "";
+  $feed->channel['DESCRIPTION'] = isset($data->subtitle) ? "{$data->subtitle}" : "";
+  $feed->channel['LINK'] = _feed_parse_atom_link($data);
+  // The namespaces of the document are the same for every entry.
+  $namespaces = $data->getNamespaces(TRUE);
+  $feed->items = array();
+  foreach ($data->entry as $news) {
+    $item = array();
+    $item['GUID'] = !empty($news->id) ? "{$news->id}" : NULL;
+
+    // Fall back to the entry id as link, but only if it looks like a URL.
+    $link_guid = (!empty($item['GUID']) && valid_url($item['GUID'])) ? $item['GUID'] : '';
+    $item['LINK'] = _feed_parse_choose(_feed_parse_atom_link($news), $link_guid);
+    $item['TITLE'] = "{$news->title}";
+    $body = '';
+    if (!empty($news->content)) {
+      foreach ($news->content->children() as $child) {
+        $body .= $child->asXML();
+      }
+      $body .= "{$news->content}";
+    }
+    else if (!empty($news->summary)) {
+      foreach ($news->summary->children() as $child) {
+        $body .= $child->asXML();
+      }
+      $body .= "{$news->summary}";
+    }
+    $item['DESCRIPTION'] = $body;
+    // "published" is optional in Atom 1.0 while "updated" is mandatory; the
+    // remaining names are the Atom 0.3 equivalents. Without a usable date
+    // _feed_parse_date() falls back to the current time.
+    $item['TIMESTAMP'] = _feed_parse_date(_feed_parse_choose("{$news->published}", "{$news->issued}", "{$news->created}", "{$news->updated}", "{$news->modified}"));
+    $item['CATEGORIES'] = array();
+    if (isset($news->category)) {
+      foreach ($news->category as $category) {
+        $item['CATEGORIES'][] = trim(strip_tags("{$category['term']}"));
+      }
+    }
+    $item['CATEGORIES'] = array_unique($item['CATEGORIES']);
+    $item['NAMESPACES'] = feed_parse_extract_namespaces($news, $namespaces);
+    $item['ENCLOSURES'] = feed_parse_extract_enclosures($news);
+    $feed->items[] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Picks the href of the link that points to the web page of a feed or entry.
+ *
+ * A link without a rel attribute or with rel="alternate" is preferred. If
+ * there is none, the href of the first link is used.
+ *
+ * @param $element
+ *   The SimpleXML object of an Atom feed or entry.
+ * @return
+ *   The chosen href, or an empty string if the element has no links.
+ */
+function _feed_parse_atom_link(SimpleXMLElement $element) {
+  $first_href = NULL;
+  foreach ($element->link as $link) {
+    $href = "{$link['href']}";
+    $rel = "{$link['rel']}";
+    if ($rel == '' || $rel == 'alternate') {
+      return $href;
+    }
+    if (!isset($first_href)) {
+      $first_href = $href;
+    }
+  }
+  return isset($first_href) ? $first_href : '';
+}
+
+/**
+ * Parses RDF feeds.
+ *
+ * @param $data
+ *   The SimpleXML object of the feed.
+ * @return
+ *   A feed object with the ->channel and ->items arrays. Every item is an
+ *   associative array with the keys TITLE, DESCRIPTION, TIMESTAMP, LINK,
+ *   GUID, CATEGORIES, NAMESPACES and ENCLOSURES.
+ */
+function feed_parse_rdf(SimpleXMLElement $data) {
+  $feed = new stdClass();
+  $feed->channel = array();
+  $feed->channel['TITLE'] = isset($data->channel->title) ? "{$data->channel->title}" : "";
+  $feed->channel['DESCRIPTION'] = isset($data->channel->description) ? "{$data->channel->description}" : "";
+  $feed->channel['LINK'] = isset($data->channel->link) ? "{$data->channel->link}" : "";
+  $namespaces = $data->getNamespaces(TRUE);
+  // Set category splitter (space is for del.icio.us feed).
+  $category_splitter = ' ';
+  $feed->items = array();
+  foreach ($data->item as $news) {
+    // Reset every per-item value, so that nothing is carried over from the
+    // previous item when the current one lacks an element.
+    $id = NULL;
+    $title = $body = $link = $time_in = '';
+    $categories = array();
+    foreach ($namespaces as $ns_link) {
+      // Get about attribute as guid.
+      foreach ($news->attributes($ns_link) as $name => $value) {
+        if ($name == 'about') {
+          $id = "{$value}";
+        }
+      }
+
+      // Get children for current namespace.
+      if (version_compare(phpversion(), '5.1.2', '<')) {
+        $ns = (array) $news;
+      }
+      else {
+        $ns = (array) $news->children($ns_link);
+      }
+
+      // Title
+      if (!empty($ns['title'])) {
+        $title = "{$ns['title']}";
+      }
+
+      // Description or dc:description
+      if (!empty($ns['description']) && $body == '') {
+        $body = "{$ns['description']}";
+      }
+
+      // Link
+      if (!empty($ns['link'])) {
+        $link = "{$ns['link']}";
+      }
+
+      // content:encoded
+      if (!empty($ns['encoded'])) {
+        $body = "{$ns['encoded']}";
+      }
+
+      // pubDate or dc:date. A namespace without a date must not discard the
+      // date found in another namespace.
+      if (!empty($ns['pubDate'])) {
+        $time_in = "{$ns['pubDate']}";
+      }
+      elseif (!empty($ns['date'])) {
+        $time_in = "{$ns['date']}";
+      }
+
+      // dc:subject
+      if (!empty($ns['subject'])) {
+        // there can be multiple category tags
+        if (is_array($ns['subject'])) {
+          foreach ($ns['subject'] as $cat) {
+            if (is_object($cat)) {
+              $categories[] = trim(strip_tags($cat->asXML()));
+            }
+            else {
+              $categories[] = $cat;
+            }
+          }
+        }
+        else { //or single tag
+          $categories = explode($category_splitter, "{$ns['subject']}");
+        }
+      }
+    }
+    $item = array();
+    $item['TITLE'] = $title;
+    $item['DESCRIPTION'] = $body;
+    // The date is parsed once per item; an empty date string yields the
+    // current time.
+    $item['TIMESTAMP'] = _feed_parse_date($time_in);
+    $item['LINK'] = $link;
+    $item['GUID'] = $id;
+    $item['CATEGORIES'] = $categories;
+    $item['NAMESPACES'] = feed_parse_extract_namespaces($news, $namespaces);
+    $item['ENCLOSURES'] = feed_parse_extract_enclosures($news);
+    $feed->items[] = $item;
+  }
+  return $feed;
+}
+
+/**
+ * Collects the namespaced children of an item, grouped by namespace prefix.
+ *
+ * @param $item
+ *   A SimpleXML object.
+ * @param $namespaces
+ *   Array of namespace URLs, keyed by the namespace prefix.
+ * @return
+ *   An array keyed by namespace prefix; each value is the array cast of the
+ *   item's children in that namespace. Namespaces with an empty prefix or
+ *   without children are left out.
+ */
+function feed_parse_extract_namespaces(SimpleXMLElement $item, $namespaces) {
+  $extracted = array();
+  foreach ($namespaces as $prefix => $url) {
+    $children = (array) $item->children($url);
+    if (empty($children) || empty($prefix)) {
+      continue;
+    }
+    $extracted[$prefix] = $children;
+  }
+  return $extracted;
+}
+
+/**
+ * Extracts all enclosures inside an item.
+ *
+ * Both <enclosure> elements and <link rel="enclosure"> elements are
+ * collected.
+ *
+ * @param $item
+ *   A SimpleXML object.
+ * @return
+ *   A list of enclosures; each enclosure is an array of the attributes of
+ *   the element, array('attribute name' => 'value').
+ */
+function feed_parse_extract_enclosures(SimpleXMLElement $item) {
+  $result = array();
+  // The queries below start with "//", which searches the whole document, so
+  // the item is re-parsed as a document of its own first.
+  $item = @simplexml_load_string($item->asXML());
+  if ($item === FALSE) {
+    return $result;
+  }
+
+  // Both result lists are numerically indexed. They have to be appended with
+  // array_merge(); the array union operator would drop every match of the
+  // second query whose index also exists in the first list.
+  $possible_enclosures = array();
+  foreach (array("//enclosure", "//link[@rel='enclosure']") as $query) {
+    $matches = $item->xpath($query);
+    // xpath() returns FALSE on error.
+    if (is_array($matches)) {
+      $possible_enclosures = array_merge($possible_enclosures, $matches);
+    }
+  }
+
+  foreach ($possible_enclosures as $enc) {
+    $add_enc = array();
+    foreach ($enc->attributes() as $k => $v) {
+      $add_enc[$k] = "{$v}";
+    }
+    $result[] = $add_enc;
+  }
+  return $result;
+}
+
+/**
+ * Chooses the first argument which is not empty and returns it.
+ *
+ * Strings are trimmed before they are checked, so a string that consists of
+ * whitespace only counts as empty, while a single character is a valid value.
+ *
+ * @param ...
+ *   Any number of candidate values, in order of preference.
+ * @return
+ *   The first non-empty candidate (trimmed if it is a string), or an empty
+ *   string if every candidate is empty.
+ */
+function _feed_parse_choose() {
+  $args = func_get_args();
+  foreach ($args as $arg) {
+    if (is_string($arg)) {
+      $arg = trim($arg);
+      if ($arg !== '') {
+        return $arg;
+      }
+    }
+    elseif (!empty($arg)) {
+      return $arg;
+    }
+  }
+  return '';
+}
+
+/**
+ * Converts a date string taken from a feed into a timestamp.
+ *
+ * @param $date_str
+ *   The date string in various formats.
+ * @return
+ *   The timestamp of the string, or the current time if neither strtotime()
+ *   nor _feed_parse_w3cdtf() can parse it.
+ */
+function _feed_parse_date($date_str) {
+  $timestamp = strtotime($date_str);
+
+  // strtotime() signals a failure with FALSE or with -1; give the W3C
+  // date/time parser a chance in both cases.
+  $unparsed = ($timestamp === FALSE || $timestamp == -1);
+  if ($unparsed) {
+    $timestamp = _feed_parse_w3cdtf($date_str);
+  }
+
+  if ($timestamp === FALSE) {
+    return time();
+  }
+  return $timestamp;
+}
+
+/**
+ * Parse the W3C date/time format, a subset of ISO 8601.
+ *
+ * PHP date parsing functions do not handle this format.
+ * See http://www.w3.org/TR/NOTE-datetime for more information.
+ * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
+ *
+ * Seconds, a fraction of a second and the time zone designator are optional.
+ * A date without a time zone designator is taken as GMT.
+ *
+ * @param $date_str
+ *   A string with a potentially W3C DTF date.
+ * @return
+ *   A timestamp if parsed successfully or FALSE if not.
+ */
+function _feed_parse_w3cdtf($date_str) {
+  // Captured groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 seconds,
+  // 7 offset sign, 8 offset hours, 9 offset minutes, 10 "Z".
+  $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(?::(\d{2})(?:\.\d+)?)?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
+  if (!preg_match($pattern, $date_str, $match)) {
+    return FALSE;
+  }
+
+  // Optional groups at the end of the pattern are missing from $match when
+  // they did not take part in the match, hence the empty() checks.
+  $seconds = empty($match[6]) ? 0 : (int) $match[6];
+
+  // Calculate the epoch for current date assuming GMT.
+  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);
+
+  // Only a numeric offset needs a correction; Z is zulu time, aka GMT.
+  if (!empty($match[7])) {
+    $offset_secs = (((int) $match[8] * 60) + (int) $match[9]) * 60;
+    // Is timezone ahead of GMT? If yes, subtract offset.
+    if ($match[7] == '+') {
+      $offset_secs *= -1;
+    }
+    $epoch += $offset_secs;
+  }
+  return $epoch;
+}
+
+
+/**
+ * Set XML feed data to be parsed.
+ *
+ * The data is kept in a static variable until it is replaced by another call.
+ * Calling the function without an argument only reads the stored data.
+ *
+ * @param $data
+ *   XML feed data. An empty string is stored as well, so that an empty
+ *   download replaces the data of a previously retrieved feed.
+ * @return
+ *   Stored XML feed data.
+ */
+function feed_set_data($data = NULL) {
+  static $static = NULL;
+  // Compare strictly: a loose comparison treats an empty string like NULL and
+  // would leave the data of the previous feed in place.
+  if ($data !== NULL) {
+    $static = $data;
+  }
+  return $static;
+}
+
+/**
+ * Get the XML feed data that was stored last.
+ *
+ * @return
+ *   Stored XML feed data.
+ * @see feed_set_data()
+ */
+function feed_get_data() {
+  // Calling the setter without an argument leaves the stored data untouched.
+  $stored = feed_set_data();
+  return $stored;
+}
+
+/**
+ * Get the md5 hash of the stored feed data.
+ *
+ * @return
+ *   Calculated md5 hash of the feed data.
+ * @see feed_get_data()
+ */
+function feed_get_hash() {
+  $data = feed_get_data();
+  return md5($data);
+}
+
+/**
+ * Store HTTP response headers returned by drupal_http_request().
+ *
+ * The headers are kept in a static variable until they are replaced by
+ * another call. Calling the function without an argument only reads the
+ * stored headers.
+ *
+ * @param $headers
+ *   HTTP response headers returned by drupal_http_request(). An empty array
+ *   is stored as well, so that a response without headers replaces the
+ *   headers of a previously retrieved feed.
+ * @return
+ *   Stored HTTP response headers.
+ */
+function feed_set_headers($headers = array()) {
+  static $static = array();
+  // The default value cannot be told apart from an explicitly passed empty
+  // array, so the number of passed arguments decides between read and write.
+  if (func_num_args() > 0) {
+    $static = is_array($headers) ? $headers : array();
+  }
+  return $static;
+}
+
+/**
+ * Get the HTTP response headers that were stored last.
+ *
+ * @return
+ *   Stored HTTP response headers.
+ * @see feed_set_headers()
+ */
+function feed_get_headers() {
+  // Calling the setter without an argument leaves the stored headers
+  // untouched.
+  $stored = feed_set_headers();
+  return $stored;
+}
+
+/**
+ * Parse stored XML feed data into an object.
+ *
+ * This function requires that you call feed_set_data() before calling it.
+ *
+ * @return
+ *   A feed object. On success it is the result of the format specific parser
+ *   (feed_parse_atom(), feed_parse_rss() or feed_parse_rdf()). On failure
+ *   ->error holds a message and ->items is an empty array.
+ * @see feed_set_data()
+ */
+function feed_parse() {
+  // Initialize feed object. Callers iterate over ->items, so the property has
+  // to exist even when the data cannot be parsed.
+  $feed = new stdClass();
+  $feed->items = array();
+
+  // Parse the data. Warnings about malformed XML are suppressed, the failure
+  // is reported through ->error instead.
+  $data = @simplexml_load_string(feed_get_data());
+  if ($data === FALSE) {
+    $feed->error = t('The downloaded data is not parsable XML.');
+    return $feed;
+  }
+
+  $format = feed_parse_format_detect($data);
+  if ($format == FALSE) {
+    $feed->error = t('The downloaded data is not in a recognizable feed format (ATOM, RSS or RDF).');
+    return $feed;
+  }
+
+  // Dispatch to the parser of the detected format, e.g. feed_parse_rss().
+  $feed_handler = 'feed_parse_' . $format;
+  return $feed_handler($data);
+}
Index: modules/aggregator/aggregator.module
===================================================================
RCS file: /cvs/drupal/drupal/modules/aggregator/aggregator.module,v
retrieving revision 1.389
diff -u -p -r1.389 aggregator.module
--- modules/aggregator/aggregator.module 16 Aug 2008 14:48:17 -0000 1.389
+++ modules/aggregator/aggregator.module 3 Sep 2008 15:36:54 -0000
@@ -456,358 +456,89 @@ function aggregator_remove($feed) {
}
/**
- * Callback function used by the XML parser.
- */
-function aggregator_element_start($parser, $name, $attributes) {
- global $item, $element, $tag, $items, $channel;
-
- switch ($name) {
- case 'IMAGE':
- case 'TEXTINPUT':
- case 'CONTENT':
- case 'SUMMARY':
- case 'TAGLINE':
- case 'SUBTITLE':
- case 'LOGO':
- case 'INFO':
- $element = $name;
- break;
- case 'ID':
- if ($element != 'ITEM') {
- $element = $name;
- }
- case 'LINK':
- if (!empty($attributes['REL']) && $attributes['REL'] == 'alternate') {
- if ($element == 'ITEM') {
- $items[$item]['LINK'] = $attributes['HREF'];
- }
- else {
- $channel['LINK'] = $attributes['HREF'];
- }
- }
- break;
- case 'ITEM':
- $element = $name;
- $item += 1;
- break;
- case 'ENTRY':
- $element = 'ITEM';
- $item += 1;
- break;
- }
-
- $tag = $name;
-}
-
-/**
- * Call-back function used by the XML parser.
- */
-function aggregator_element_end($parser, $name) {
- global $element;
-
- switch ($name) {
- case 'IMAGE':
- case 'TEXTINPUT':
- case 'ITEM':
- case 'ENTRY':
- case 'CONTENT':
- case 'INFO':
- $element = '';
- break;
- case 'ID':
- if ($element == 'ID') {
- $element = '';
- }
- }
-}
-
-/**
- * Callback function used by the XML parser.
- */
-function aggregator_element_data($parser, $data) {
- global $channel, $element, $items, $item, $image, $tag;
- $items += array($item => array());
- switch ($element) {
- case 'ITEM':
- $items[$item] += array($tag => '');
- $items[$item][$tag] .= $data;
- break;
- case 'IMAGE':
- case 'LOGO':
- $image += array($tag => '');
- $image[$tag] .= $data;
- break;
- case 'LINK':
- if ($data) {
- $items[$item] += array($tag => '');
- $items[$item][$tag] .= $data;
- }
- break;
- case 'CONTENT':
- $items[$item] += array('CONTENT' => '');
- $items[$item]['CONTENT'] .= $data;
- break;
- case 'SUMMARY':
- $items[$item] += array('SUMMARY' => '');
- $items[$item]['SUMMARY'] .= $data;
- break;
- case 'TAGLINE':
- case 'SUBTITLE':
- $channel += array('DESCRIPTION' => '');
- $channel['DESCRIPTION'] .= $data;
- break;
- case 'INFO':
- case 'ID':
- case 'TEXTINPUT':
- // The sub-element is not supported. However, we must recognize
- // it or its contents will end up in the item array.
- break;
- default:
- $channel += array($tag => '');
- $channel[$tag] .= $data;
- }
-}
-
-/**
* Checks a news feed for new items.
*
* @param $feed
* An associative array describing the feed to be refreshed.
*/
function aggregator_refresh($feed) {
- global $channel, $image;
+ // Retrieve feed.
+ $result = drupal_retrieve_feed($feed['url'], $feed['modified'], $feed['etag'], $feed['hash']);
- // Generate conditional GET headers.
- $headers = array();
- if ($feed['etag']) {
- $headers['If-None-Match'] = $feed['etag'];
- }
- if ($feed['modified']) {
- $headers['If-Modified-Since'] = gmdate('D, d M Y H:i:s', $feed['modified']) . ' GMT';
- }
-
- // Request feed.
- $result = drupal_http_request($feed['url'], $headers);
-
- // Process HTTP response code.
- switch ($result->code) {
- case 304:
- db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']);
- drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title'])));
- break;
- case 301:
- $feed['url'] = $result->redirect_url;
- // Do not break here.
- case 200:
- case 302:
- case 307:
- // We store the md5 hash of feed data in the database. When refreshing a
- // feed we compare stored hash and new hash calculated from downloaded
- // data. If both are equal we say that feed is not updated.
- $md5 = md5($result->data);
- if ($feed['hash'] == $md5) {
- db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']);
- drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title'])));
- break;
- }
-
- // Filter the input data.
- if (aggregator_parse_feed($result->data, $feed)) {
- $modified = empty($result->headers['Last-Modified']) ? 0 : strtotime($result->headers['Last-Modified']);
-
- // Prepare the channel data.
- foreach ($channel as $key => $value) {
- $channel[$key] = trim($value);
- }
-
- // Prepare the image data (if any).
- foreach ($image as $key => $value) {
- $image[$key] = trim($value);
- }
-
- if (!empty($image['LINK']) && !empty($image['URL']) && !empty($image['TITLE'])) {
- // TODO: we should really use theme_image() here, but that only works with
- // local images. It won't work with images fetched with a URL unless PHP version > 5.
- $image = '
';
- }
- else {
- $image = NULL;
- }
-
- $etag = empty($result->headers['ETag']) ? '' : $result->headers['ETag'];
- // Update the feed data.
- db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', hash = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $channel['LINK'], $channel['DESCRIPTION'], $image, $md5, $etag, $modified, $feed['fid']);
-
- // Clear the cache.
- cache_clear_all();
-
- if (isset($result->redirect_url)) {
- watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url']));
- }
-
- watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title']));
- drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title'])));
- }
- break;
- default:
- watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING);
- drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error)));
- module_invoke('system', 'check_http_request');
+ if (isset($result->redirect_url)) {
+ $feed['url'] = $result->redirect_url;
}
-}
-/**
- * Parse the W3C date/time format, a subset of ISO 8601.
- *
- * PHP date parsing functions do not handle this format.
- * See http://www.w3.org/TR/NOTE-datetime for more information.
- * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
- *
- * @param $date_str
- * A string with a potentially W3C DTF date.
- * @return
- * A timestamp if parsed successfully or FALSE if not.
- */
-function aggregator_parse_w3cdtf($date_str) {
- if (preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', $date_str, $match)) {
- list($year, $month, $day, $hours, $minutes, $seconds) = array($match[1], $match[2], $match[3], $match[4], $match[5], $match[6]);
- // Calculate the epoch for current date assuming GMT.
- $epoch = gmmktime($hours, $minutes, $seconds, $month, $day, $year);
- if ($match[10] != 'Z') { // Z is zulu time, aka GMT
- list($tz_mod, $tz_hour, $tz_min) = array($match[8], $match[9], $match[10]);
- // Zero out the variables.
- if (!$tz_hour) {
- $tz_hour = 0;
- }
- if (!$tz_min) {
- $tz_min = 0;
- }
- $offset_secs = (($tz_hour * 60) + $tz_min) * 60;
- // Is timezone ahead of GMT? If yes, subtract offset.
- if ($tz_mod == '+') {
- $offset_secs *= -1;
- }
- $epoch += $offset_secs;
+ if (count($result->items) > 0) {
+ aggregator_save_items($result->items, $feed);
+ if (!empty($result->image['LINK']) && !empty($result->image['URL']) && !empty($result->image['TITLE'])) {
+ // TODO: we should really use theme_image() here, but that only works with
+ // local images. It won't work with images fetched with a URL unless PHP version > 5.
+ $image = '
';
+ }
+ else {
+ $image = NULL;
}
- return $epoch;
+
+ $headers = feed_get_headers();
+ $etag = empty($headers['ETag']) ? '' : $headers['ETag'];
+ $modified = empty($headers['Last-Modified']) ? 0 : strtotime($headers['Last-Modified']);
+
+ // Update the feed data.
+ db_query("UPDATE {aggregator_feed} SET url = '%s', checked = %d, link = '%s', description = '%s', image = '%s', hash = '%s', etag = '%s', modified = %d WHERE fid = %d", $feed['url'], time(), $result->channel['LINK'], $result->channel['DESCRIPTION'], $image, feed_get_hash(), $etag, $modified, $feed['fid']);
+
+ // Clear the cache.
+ cache_clear_all();
+
+ if (isset($result->redirect_url)) {
+ watchdog('aggregator', 'Updated URL for feed %title to %url.', array('%title' => $feed['title'], '%url' => $feed['url']));
+ }
+
+ watchdog('aggregator', 'There is new syndicated content from %site.', array('%site' => $feed['title']));
+ drupal_set_message(t('There is new syndicated content from %site.', array('%site' => $feed['title'])));
}
else {
- return FALSE;
+ db_query('UPDATE {aggregator_feed} SET checked = %d WHERE fid = %d', time(), $feed['fid']);
+ drupal_set_message(t('There is no new syndicated content from %site.', array('%site' => $feed['title'])));
+ if (isset($result->error)) {
+ watchdog('aggregator', 'The feed from %site seems to be broken, due to "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error), WATCHDOG_WARNING);
+ drupal_set_message(t('The feed from %site seems to be broken, because of error "%error".', array('%site' => $feed['title'], '%error' => $result->code . ' ' . $result->error)));
+ }
}
}
/**
- * Parse a feed and store its items.
+ * Store a feed's items.
*
- * @param $data
- * The feed data.
+ * @param $items
+ * An array containing feed items.
* @param $feed
- * An associative array describing the feed to be parsed.
+ * An associative array describing the feed.
* @return
* FALSE on error, TRUE otherwise.
*/
-function aggregator_parse_feed(&$data, $feed) {
- global $items, $image, $channel;
-
- // Unset the global variables before we use them.
- unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
- $items = array();
- $image = array();
- $channel = array();
-
- // Parse the data.
- $xml_parser = drupal_xml_parser_create($data);
- xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
- xml_set_character_data_handler($xml_parser, 'aggregator_element_data');
-
- if (!xml_parse($xml_parser, $data, 1)) {
- watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser)), WATCHDOG_WARNING);
- drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', array('%site' => $feed['title'], '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser))), 'error');
- return FALSE;
- }
- xml_parser_free($xml_parser);
-
+function aggregator_save_items($items, $feed) {
// We reverse the array such that we store the first item last, and the last
// item first. In the database, the newest item should be at the top.
$items = array_reverse($items);
// Initialize variables.
- $title = $link = $author = $description = $guid = NULL;
foreach ($items as $item) {
- unset($title, $link, $author, $description, $guid);
-
- // Prepare the item:
- foreach ($item as $key => $value) {
- $item[$key] = trim($value);
- }
-
- // Resolve the item's title. If no title is found, we use up to 40
- // characters of the description ending at a word boundary, but not
- // splitting potential entities.
- if (!empty($item['TITLE'])) {
- $title = $item['TITLE'];
- }
- elseif (!empty($item['DESCRIPTION'])) {
- $title = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['DESCRIPTION'], 40));
- }
- else {
- $title = '';
- }
-
- // Resolve the items link.
- if (!empty($item['LINK'])) {
- $link = $item['LINK'];
- }
- else {
- $link = $feed['link'];
- }
- $guid = isset($item['GUID']) ? $item['GUID'] : '';
-
- // Atom feeds have a CONTENT and/or SUMMARY tag instead of a DESCRIPTION tag.
- if (!empty($item['CONTENT:ENCODED'])) {
- $item['DESCRIPTION'] = $item['CONTENT:ENCODED'];
- }
- elseif (!empty($item['SUMMARY'])) {
- $item['DESCRIPTION'] = $item['SUMMARY'];
- }
- elseif (!empty($item['CONTENT'])) {
- $item['DESCRIPTION'] = $item['CONTENT'];
- }
-
- // Try to resolve and parse the item's publication date. If no date is
- // found, use the current date instead.
- $date = 'now';
- foreach (array('PUBDATE', 'DC:DATE', 'DCTERMS:ISSUED', 'DCTERMS:CREATED', 'DCTERMS:MODIFIED', 'ISSUED', 'CREATED', 'MODIFIED', 'PUBLISHED', 'UPDATED') as $key) {
- if (!empty($item[$key])) {
- $date = $item[$key];
- break;
- }
- }
-
- $timestamp = strtotime($date); // As of PHP 5.1.0, strtotime returns FALSE on failure instead of -1.
-
- if ($timestamp <= 0) {
- $timestamp = aggregator_parse_w3cdtf($date); // Aggregator_parse_w3cdtf() returns FALSE on failure.
- if (!$timestamp) {
- // Better than nothing.
- $timestamp = time();
- }
- }
// Save this item. Try to avoid duplicate entries as much as possible. If
// we find a duplicate entry, we resolve it and pass along its ID is such
// that we can update it if needed.
- if (!empty($guid)) {
- $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND guid = '%s'", $feed['fid'], $guid));
+ if (!empty($item['GUID'])) {
+ $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND guid = '%s'", $feed['fid'], $item['GUID']));
}
- else if ($link && $link != $feed['link'] && $link != $feed['url']) {
- $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $link));
+ else if ($item['LINK'] && $item['LINK'] != $feed['link'] && $item['LINK'] != $feed['url']) {
+ $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND link = '%s'", $feed['fid'], $item['LINK']));
}
else {
- $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $title));
+ $entry = db_fetch_object(db_query("SELECT iid FROM {aggregator_item} WHERE fid = %d AND title = '%s'", $feed['fid'], $item['TITLE']));
}
$item += array('AUTHOR' => '', 'DESCRIPTION' => '');
- aggregator_save_item(array('iid' => (isset($entry->iid) ? $entry->iid: ''), 'fid' => $feed['fid'], 'timestamp' => $timestamp, 'title' => $title, 'link' => $link, 'author' => $item['AUTHOR'], 'description' => $item['DESCRIPTION'], 'guid' => $guid));
+ aggregator_save_item(array('iid' => (isset($entry->iid) ? $entry->iid: ''), 'fid' => $feed['fid'], 'timestamp' => $item['TIMESTAMP'], 'title' => $item['TITLE'], 'link' => $item['LINK'], 'author' => $item['AUTHOR'], 'description' => $item['DESCRIPTION'], 'guid' => $item['GUID']));
}
// Remove all items that are older than flush item timer.