diff -Naur planet-6.x-1.3/planet.info 6.x/planet.info
--- planet-6.x-1.3/planet.info 2009-03-12 10:25:55.000000000 -0400
+++ 6.x/planet.info 2009-03-19 11:10:42.000000000 -0400
@@ -1,13 +1,6 @@
-; $Id: planet.info,v 1.4 2009/03/11 23:15:34 swe3tdave Exp $
+; $Id: planet.info,v 1.2 2007/05/30 03:22:01 daryl Exp $
name = Planet
description = Planet blog aggregator
package = Community - optional
version = 6.x
core = 6.x
-
-; Information added by drupal.org packaging script on 2009-03-12
-version = "6.x-1.3"
-core = "6.x"
-project = "planet"
-datestamp = "1236867955"
-
diff -Naur planet-6.x-1.3/planet.install 6.x/planet.install
--- planet-6.x-1.3/planet.install 2009-03-11 19:15:34.000000000 -0400
+++ 6.x/planet.install 2009-03-19 11:37:10.000000000 -0400
@@ -1,5 +1,5 @@
TRUE,
'default' => 0,
),
+ 'hash' => array(
+ 'type' => 'varchar',
+ 'length' => 32,
+ 'description' => t("A hash of the feed's headers."),
+ ),
+ 'error' => array(
+ 'type' => 'int',
+ 'not null' => TRUE,
+ 'default' => 0,
+ 'length' => 1,
+ 'description' => t("Whether the feed is throwing errors or not."),
+ ),
),
'primary key' => array('fid'),
+ 'indexes' => array(
+ 'error' => array('error'),
+ ),
);
$schema['planet_items'] = array(
@@ -67,6 +82,12 @@
'unsigned' => 1,
'not null' => FALSE,
),
+ 'iid' => array(
+ 'type' => 'varchar',
+ 'length' => 32,
+ 'not null' => TRUE,
+ 'description' => t("md5 of the feed item's title and body."),
+ ),
'guid' => array(
'type' => 'varchar',
'length' => 120,
@@ -120,3 +141,41 @@
db_query("DELETE FROM {node} WHERE type = '%s'", planet);
}
+
+/**
+ * Update 1: add the 'hash' column to the planet_feeds table.
+ *
+ * @return
+ *   The $ret array that was handed to db_add_field().
+ */
+function planet_update_1() {
+  // Same definition as the 'hash' field in the planet_feeds schema.
+  $hash_column = array(
+    'type' => 'varchar',
+    'length' => 32,
+    'description' => t("A hash of the feed's headers."),
+  );
+
+  $ret = array();
+  db_add_field($ret, 'planet_feeds', 'hash', $hash_column);
+  return $ret;
+}
+
+/**
+ * Update 2: add the 'iid' column to the planet_items table.
+ *
+ * The column is NOT NULL and has no default, so rows that already exist
+ * need a value before the NOT NULL constraint can be applied. The 'initial'
+ * key gives every existing row an empty string; without it the update can
+ * fail on tables that already contain items.
+ *
+ * @return
+ *   The $ret array that was handed to db_add_field().
+ */
+function planet_update_2() {
+  $ret = array();
+  db_add_field($ret, 'planet_items', 'iid', array(
+      'type' => 'varchar',
+      'length' => 32,
+      'not null' => TRUE,
+      // Value written to pre-existing rows; not part of the stored schema.
+      'initial' => '',
+      'description' => t("md5 of the feed item's title and body."),
+    )
+  );
+  return $ret;
+}
+
+/**
+ * Update 3: add the 'error' flag column to planet_feeds and index it.
+ *
+ * @return
+ *   The $ret array that was handed to db_add_field() and db_add_index().
+ */
+function planet_update_3() {
+  // Same definition as the 'error' field in the planet_feeds schema.
+  $error_column = array(
+    'type' => 'int',
+    'not null' => TRUE,
+    'default' => 0,
+    'length' => 1,
+    'description' => t("Whether the feed is throwing errors or not."),
+  );
+
+  $ret = array();
+  db_add_field($ret, 'planet_feeds', 'error', $error_column);
+  db_add_index($ret, 'planet_feeds', 'error', array('error'));
+
+  return $ret;
+}
\ Pas de fin de ligne à la fin du fichier.
diff -Naur planet-6.x-1.3/planet.module 6.x/planet.module
--- planet-6.x-1.3/planet.module 2009-03-11 19:15:34.000000000 -0400
+++ 6.x/planet.module 2009-03-19 11:11:19.000000000 -0400
@@ -1,5 +1,5 @@
uid;
- }
-
- if ($op == 'update' || $op == 'delete') {
- if (user_access('edit own blog', $account) && ($account->uid == $node->uid) || user_access('administer nodes', $account)) {
- return TRUE;
- }
- }
-}
-
function planet_menu() {
$items['admin/settings/planet'] = array(
@@ -486,508 +472,58 @@
if (!$fid) {
$fid = intval(arg(4));
}
-
- $feed = db_fetch_object(db_query('SELECT * FROM {planet_feeds} WHERE fid = %d', $fid));
-
- $headers = array();
- $result = planet_http_request($feed->link, $headers, 15);
-
- switch ($result->code) {
- case 304:
- drupal_set_message(t('No new content syndicated from %site.', array('%site' => ''. $feed->title .'')));
- break;
-
- case 301:
- if ($result->redirect_url) {
- $feed->link = $result->redirect_url;
- watchdog('planet', 'Updated URL for feed %title to %url.', array('%title' => ''. $feed->title .'', '%url' => ''. $feed->url .''), WATCHDOG_NOTICE, l(t('view'), 'planet/'.$feed->fid));
- db_query("UPDATE {planet_feeds} SET link = '%s' WHERE fid = %d", $feed->link, $feed->fid);
- }
- break;
-
- case 200:
- case 302:
- case 307:
- $xml_tree = planet_parse_xml($result->data);
-
- if ($xml_tree['parser_error']) {
- watchdog('planet', 'Failed to parse RSS feed %site: %error at line %line.', array('%site' => ''. $feed->title .'', '%error' => $xml_tree['parser_error'], '%line' => $xml_tree['parser_line']), WATCHDOG_ERROR);
- drupal_set_message(t('Failed to parse RSS feed %site: %error at line %line.', array('%site' => ''. $feed->title .'', '%error' => $xml_tree['parser_error'], '%line' => $xml_tree['parser_line'])), 'error');
- break;
- }
- else {
- drupal_set_message('Parsing feed '. $feed->title .' took '. $xml_tree['parser_time'] .' seconds.');
- }
-
- if (planet_parse_items($xml_tree, $feed) !== false) {
- if ($result->headers['Last-Modified']) {
- $modified = strtotime($result->headers['Last-Modified']);
- }
-
- /*
- ** Prepare data:
- */
- if ($xml_tree['RSS']) { // RSS 0.91, 0.92, 2.0
- $root = &$xml_tree['RSS'][0];
- $channel = &$root['CHANNEL'][0];
- $image = &$channel['IMAGE'][0];
- $description = &$channel['DESCRIPTION'][0]['VALUE'];
- $link = &$channel['LINK'][0]['VALUE'];
- }
- else if ($xml_tree['RDF:RDF']) {
- $root = &$xml_tree['RDF:RDF'][0];
- $channel = &$root['CHANNEL'][0];
- $image = &$root['IMAGE'][0];
- $description = &$channel['DESCRIPTION'][0]['VALUE'];
- $link = &$channel['LINK'][0]['VALUE'];
- }
- else if ($xml_tree['FEED']) { // Atom 0.3, 1.0
- $root = &$xml_tree['FEED'][0];
- $channel = &$root;
- $image = &$channel['LOGO'][0]['VALUE'];
- $description = ($channel['TAGLINE'][0]['VALUE'] ? $channel['TAGLINE'][0]['VALUE'] : '');
- // TODO: remove this Atom hack when we have field mapping or at least specialized parsers in place
- if (count($channel['LINK']) > 1) {
- $link = $feed->link;
- foreach ($channel['LINK'] as $l) {
- if ($l['REL'] == 'alternate') {
- $link = $l['HREF'];
- }
- }
- }
- else {
- $link = $channel['LINK'][0]['HREF'];
- }
- }
- else if ($xml_tree['CHANNEL']) { // RSS 1.1
- $root = &$xml_tree['CHANNEL'][0];
- $channel = &$root;
- $image = &$channel['IMAGE'][0];
- $description = &$channel['DESCRIPTION'][0]['VALUE'];
- $link = &$channel['LINK'][0]['VALUE'];
- }
- else if ($xml_tree['OPML']) {
- $root = &$xml_tree['OPML'][0];
- $channel = &$root;
- $image = NULL;
- $description = NULL;
- $link = NULL;
- }
- else {
- // unsupported format
- break;
- }
-
- if (!$feed->uid) {
- if ($channel['AUTHOR'][0]['VALUE']) {
- $feed->uid = $channel['AUTHOR'][0]['VALUE'];
- }
- if ($channel['AUTHOR'][0]['NAME'][0]['VALUE']) {
- $feed->uid = $channel['AUTHOR'][0]['NAME'][0]['VALUE'];
- }
- else if ($channel['DC:CREATOR']) {
- $feed->uid = $channel['DC:CREATOR'][0]['VALUE'];
- }
- else {
- $feed->uid = '';
- }
- }
-
- /*
- ** Generate image link
- */
- if (!$feed->image && $image['LINK'] && $image['URL'] && $image['TITLE']) {
- if (strlen($image['TITLE'][0]['VALUE']) > 250) {
- $image['TITLE'][0]['VALUE'] = trim(substr($image['TITLE'][0]['VALUE'], 0, 250)).'...';
- }
- $feed->image = '';
- }
-
- /*
- ** Update the feed data:
- */
- $feed->checked = time();
- $feed->link = $link;
- $feed->etag = $result->headers['ETag'];
- $feed->modified = $modified;
- if ($feed->body == '' && $description/* && valid_input_data($description)*/) {
- $feed->body = $feed->teaser = $description;
- }
- $feed->rss_data = &$xml_tree;
-
- /*
- ** Taxonomy module doesn't add taxonomy terms at load time... so we have to do it by hand :((
- */
- $terms = module_invoke('taxonomy', 'node_get_terms', $feed->nid, 'tid');
- $feed->taxonomy = array();
- foreach ($terms as $tid => $term) {
- if ($term->tid) {
- $feed->taxonomy[] = $term->tid;
- }
- }
- }
- default:
- }
-
-
- db_query('UPDATE {planet_feeds} SET checked = %d WHERE fid = %d', time(), $fid);
- return $feed->title;
- //print theme('page', 'refreshing '. $fid .'.');// and got '. print_r($feed, 1));
-}
-
-/**
- * Private function; Parse HTTP headers from data retreived with cURL
- * from: http://pl2.php.net/manual/en/function.curl-setopt.php#42009
- */
-function planet_parse_response($response) {
- /*
- ***original code extracted from examples at
- ***http://www.webreference.com/programming
- /php/cookbook/chap11/1/3.html
-
- ***returns an array in the following format which varies depending on headers returned
-
- [0] => the HTTP error or response code such as 404
- [1] => Array
- (
- [Server] => Microsoft-IIS/5.0
- [Date] => Wed, 28 Apr 2004 23:29:20 GMT
- [X-Powered-By] => ASP.NET
- [Connection] => close
- [Set-Cookie] => COOKIESTUFF
- [Expires] => Thu, 01 Dec 1994 16:00:00 GMT
- [Content-Type] => text/html
- [Content-Length] => 4040
- )
- [2] => Response body (string)
- */
-
- do {
- list($response_headers, $response) = explode("\r\n\r\n", $response, 2);
- $response_header_lines = explode("\r\n", $response_headers);
-
- // first line of headers is the HTTP response code
- $http_response_line = array_shift($response_header_lines);
- if (preg_match('@^HTTP/[0-9]\.[0-9] ([0-9]{3})@', $http_response_line, $matches)) {
- $response_code = $matches[1];
- }
- else {
- $response_code = "Error";
- }
- }
- while (substr($response_code, 0, 1) == "1");
-
- $response_body = $response;
-
- // put the rest of the headers in an array
- $response_header_array = array();
- foreach ($response_header_lines as $header_line) {
- list($header, $value) = explode(':', $header_line, 2);
- $response_header_array[$header] = trim($value);
- }
-
- return array($response_code, $response_header_array, $response_body, $http_response_line);
-}
-
-/**
- * Private function; Gets data from given URL :)
- */
-function planet_http_request($url, $headers = array(), $timeout = 15, $method = 'GET', $data = NULL, $follow = 3) {
- if (!function_exists('curl_init')) {
- return drupal_http_request($url, $headers, $method, $data, $follow);
- }
-
- // convert headers array to format used by cURL
- $temp = array();
- foreach ($headers as $header => $value) {
- $temp[] = $header .': '. $value;
- }
- $headers = $temp;
-
- $result = new StdClass();
-
- $ch = curl_init();
- curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
- curl_setopt($ch, CURLOPT_URL, $url);
- curl_setopt($ch, CURLOPT_HEADER, 1);
- curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
- curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 0);
- curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
-
- $data = curl_exec($ch);
- $info = curl_getinfo($ch);
-
- curl_close($ch);
- unset($ch);
-
- $response = planet_parse_response($data);
- $result->code = $response[0];
- $result->headers = $response[1];
- $result->data = $response[2];
- $error = $response[3];
- switch ($code) {
- case 200: // OK
- case 304: // Not modified
- break;
- case 301: // Moved permanently
- case 302: // Moved temporarily
- case 307: // Moved temporarily
- $location = $result->headers['Location'];
-
- if ($follow) {
- $result = planet_http_request($result->headers['Location'], $headers, $timeout, $method, $data, --$follow);
- $result->redirect_code = $result->code;
- }
- $result->redirect_url = $location;
- break;
- default:
- $result->error = $error;
- break;
- }
-
- $result->code = $response[0];
- return $result;
-}
-
-/**
- * Private function; Checks a news feed for new items.
- */
-
-
-/**
- * Private function;
- * Parse the W3C date/time format, a subset of ISO 8601. PHP date parsing
- * functions do not handle this format.
- * See http://www.w3.org/TR/NOTE-datetime for more information.
- * Origionally from MagpieRSS (http://magpierss.sourceforge.net/).
- *
- * @param $date_str A string with a potentially W3C DTF date.
- * @return A timestamp if parsed successfully or -1 if not.
- */
-function planet_parse_w3cdtf($date_str) {
- if (preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', $date_str, $match)) {
- list($year, $month, $day, $hours, $minutes, $seconds) = array($match[1], $match[2], $match[3], $match[4], $match[5], $match[6]);
- // calc epoch for current date assuming GMT
- $epoch = gmmktime($hours, $minutes, $seconds, $month, $day, $year);
- if ($match[10] != 'Z') { // Z is zulu time, aka GMT
- list($tz_mod, $tz_hour, $tz_min) = array($match[8], $match[9], $match[10]);
- // zero out the variables
- if (!$tz_hour) {
- $tz_hour = 0;
- }
- if (!$tz_min) {
- $tz_min = 0;
- }
- $offset_secs = (($tz_hour * 60) + $tz_min) * 60;
- // is timezone ahead of GMT? then subtract offset
- if ($tz_mod == '+') {
- $offset_secs *= -1;
- }
- $epoch += $offset_secs;
- }
- return $epoch;
- }
- else {
- return -1;
- }
-}
-
-/**
- * Private function;
- * from: http://pl2.php.net/manual/en/function.html-entity-decode.php#51055
- * Used as callback function for preg_replace_all() to decode numeric entities to UTF-8 chars
- *
- * @param $ord Number
- * @return UTF-8 string
- */
-function planet_replace_num_entity($ord) {
- $ord = $ord[1];
- if (preg_match('/^x([0-9a-f]+)$/i', $ord, $match)) {
- $ord = hexdec($match[1]);
- }
- else {
- $ord = intval($ord);
- }
-
- $no_bytes = 0;
- $byte = array();
-
- if ($ord == 128) {
- return chr(226) . chr(130) . chr(172);
- }
- else if ($ord == 129) {
- return chr(239) . chr(191) . chr(189);
- }
- else if ($ord == 130) {
- return chr(226) . chr(128) . chr(154);
- }
- else if ($ord == 131) {
- return chr(198) . chr(146);
- }
- else if ($ord == 132) {
- return chr(226) . chr(128) . chr(158);
- }
- else if ($ord == 133) {
- return chr(226) . chr(128) . chr(166);
- }
- else if ($ord == 134) {
- return chr(226) . chr(128) . chr(160);
- }
- else if ($ord == 135) {
- return chr(226) . chr(128) . chr(161);
- }
- else if ($ord == 136) {
- return chr(203) . chr(134);
- }
- else if ($ord == 137) {
- return chr(226) . chr(128) . chr(176);
- }
- else if ($ord == 138) {
- return chr(197) . chr(160);
- }
- else if ($ord == 139) {
- return chr(226) . chr(128) . chr(185);
- }
- else if ($ord == 140) {
- return chr(197) . chr(146);
- }
- else if ($ord == 141) {
- return chr(239) . chr(191) . chr(189);
- }
- else if ($ord == 142) {
- return chr(197) . chr(189);
- }
- else if ($ord == 143) {
- return chr(239) . chr(191) . chr(189);
- }
- else if ($ord == 144) {
- return chr(239) . chr(191) . chr(189);
- }
- else if ($ord == 145) {
- return chr(226) . chr(128) . chr(152);
- }
- else if ($ord == 146) {
- return chr(226) . chr(128) . chr(153);
- }
- else if ($ord == 147) {
- return chr(226) . chr(128) . chr(156);
- }
- else if ($ord == 148) {
- return chr(226) . chr(128) . chr(157);
- }
- else if ($ord == 149) {
- return chr(226) . chr(128) . chr(162);
- }
- else if ($ord == 150) {
- return chr(226) . chr(128) . chr(147);
- }
- else if ($ord == 151) {
- return chr(226) . chr(128) . chr(148);
- }
- else if ($ord == 152) {
- return chr(203) . chr(156);
- }
- else if ($ord == 153) {
- return chr(226) . chr(132) . chr(162);
- }
- else if ($ord == 154) {
- return chr(197) . chr(161);
- }
- else if ($ord == 155) {
- return chr(226) . chr(128) . chr(186);
- }
- else if ($ord == 156) {
- return chr(197) . chr(147);
- }
- else if ($ord == 157) {
- return chr(239) . chr(191) . chr(189);
- }
- else if ($ord == 158) {
- return chr(197) . chr(190);
- }
- else if ($ord == 159) {
- return chr(197) . chr(184);
- }
- else if ($ord == 160) {
- return chr(194) . chr(160);
- }
-
- if ($ord < 128) {
- return chr($ord);
- }
- else if ($ord < 2048) {
- $no_bytes = 2;
- }
- else if ($ord < 65536) {
- $no_bytes = 3;
- }
- else if ($ord < 1114112) {
- $no_bytes = 4;
+ // initialize simplepie
+ // we want to do this only once and not each time per feed, which would be slower
+ include_once './'. drupal_get_path('module', 'planet') .'/simplepie.inc';
+
+ $process_feed = db_fetch_object(db_query('SELECT * FROM {planet_feeds} WHERE fid = %d', $fid));
+
+ $feed = new SimplePie();
+ $feed->enable_cache(FALSE);
+ $feed->set_timeout(15);
+ // prevent SimplePie from using all of its data sanitization since we use Drupal's input formats to handle this
+ $feed->set_stupidly_fast(TRUE);
+ $feed->set_feed_url($process_feed->link);
+ // FeedBurner requires this check otherwise it won't work well with SimplePie
+ // also performance improvement
+ header('If-Modified-Since:'. $process_feed->checked);
+ $success = $feed->init();
+
+ if ($success && $feed->data) {
+ // get a unique hash of the headers in the feed, fast and easy way to compare if this feed is updated or not
+ $hash = md5(serialize($feed->data));
+
+ // hashes don't match so likely the feed is updated
+ if ($process_feed->hash != $hash) {
+ // above we define hook_view() which then performs check_url() on the $url in the feed node
+ // the problem is check_url() calls filter_xss_bad_protocol() which does its thing to prevent XSS
+ // but it returns the string through check_plain() which calls htmlspecialchars()
+ // this converts & in a url to &amp; and then causes SimplePie not to be able to parse it
+ // because of this, we decode this URL since we are passing it directly to SimplePie
+ // it is still encoded everywhere else it is output to prevent XSS
+ $process_feed->link = htmlspecialchars_decode($process_feed->link, ENT_QUOTES);
+
+ // turn each feed item into a node
+ planet_item_feed_parse($process_feed, $feed);
+ }
+
+ // finished processing this feed so we can mark it checked
+ db_query("UPDATE {planet_feeds} SET checked = %d, hash = '%s', error = 0 WHERE fid = %d", time(), $hash, $process_feed->fid);
+ }
+ else if (isset($feed->error)) {
+ db_query("UPDATE {planet_feeds} SET error = 1 WHERE fid = %d", $process_feed->fid);
+ watchdog('planet', 'The feed %feed could not be processed due to the following error: %error', array('%feed' => $process_feed->title, '%error' => $feed->error), WATCHDOG_ERROR, l('view', $process_feed->link));
}
else {
- return;
- }
-
- switch ($no_bytes) {
- case 2:
- $prefix = array(31, 192);
- break;
-
- case 3:
- $prefix = array(15, 224);
- break;
-
- case 4:
- $prefix = array(7, 240);
- break;
+ watchdog('planet', 'You shouldn\'t be here. Something has gone terribly wrong.');
}
-
- for ($i = 0; $i < $no_bytes; $i++) {
- $byte[$no_bytes - $i - 1] = (($ord & (63 * pow(2, 6 * $i))) / pow(2, 6 * $i)) & 63 | 128;
- }
-
- $byte[0] = ($byte[0] & $prefix[0]) | $prefix[1];
-
- $ret = '';
- for ($i = 0; $i < $no_bytes; $i++) {
- $ret .= chr($byte[$i]);
- }
-
- return $ret;
-}
-
-/**
- * Private function; Convert named entities to UTF-8 characters
- * from: http://pl2.php.net/manual/en/function.html-entity-decode.php#51722
- */
-function planet_replace_name_entities(&$text) {
- static $ttr;
- if (!$ttr) {
- $trans_tbl = get_html_translation_table(HTML_ENTITIES);
- foreach ($trans_tbl as $k => $v) {
- $ttr[$v] = utf8_encode($k);
- }
- $ttr['''] = "'";
- }
- return strtr($text, $ttr);
-}
-
-/**
- * Private function; Convert all entities to UTF-8 characters
- */
-function planet_replace_entities(&$text) {
- $result = planet_replace_name_entities($text);
- return preg_replace_callback('/([0-9a-fx]+);/mi', 'planet_replace_num_entity', $result);
+
+ return $process_feed->title;
}
/**
- * Private function; Clone object function to stay compatible with both php4 and php5
- * from: Drupal 4.7CVS
- * TODO: remove after moving to Drupal 4.7
+ * Private function; Checks a news feed for new items.
*/
-function planet_clone($object) {
- return version_compare(phpversion(), '5.0') < 0 ? $object : clone($object);
-}
/**
* Private function; Convert relative URLs
@@ -998,341 +534,6 @@
return preg_replace($src, $dst, $data);
}
-/**
- * Private function; Creates nodes from data found in given xml_tree
- */
-function planet_parse_items(&$xml_tree, &$feed) {
-
- if ($xml_tree['RSS']) { // RSS 0.91, 0.92, 2.0
- $items = &$xml_tree['RSS'][0]['CHANNEL'][0]['ITEM'];
- $link_field = 'VALUE';
- }
- else if ($xml_tree['RDF:RDF']) {
- $items = &$xml_tree['RDF:RDF'][0]['ITEM'];
- $link_field = 'VALUE';
- }
- else if ($xml_tree['FEED']) { // Atom 0.3, 1.0
- $items = &$xml_tree['FEED'][0]['ENTRY'];
- $link_field = 'HREF';
- }
- else if ($xml_tree['CHANNEL']) { // RSS 1.1
- $items = &$xml_tree['CHANNEL'][0]['ITEMS'][0]['ITEM'];
- $link_field = 'VALUE';
- }
- else {
- // unsupported format
- $items = array();
- return false;
- }
-
- /*
- ** We reverse the array such that we store the first item last,
- ** and the last item first. In the database, the newest item
- ** should be at the top.
- */
- $items_added = 0;
-
-
- for ($index = count($items) - 1; $index >= 0; $index--) {
- $item = &$items[$index];
- //print '
'. print_r($item, 1) .''; - $teaser = NULL; - $body = NULL; - - // Description field is needed early for case when no title is specified - if ($item['DESCRIPTION']) { // RSS 0.91, 0.92, 1.0, 1.1, 2.0 - $body = &$item['DESCRIPTION'][0]['VALUE']; - } - else if ($item['SUMMARY']) { // Atom 0.3, 1.0 - $body = &$item['SUMMARY'][0]['VALUE']; - } - - if ($item['CONTENT']) { // Atom 0.3, 1.0 - if (strlen($body) < strlen($item['CONTENT'][0]['VALUE'])) { - if ($body) { - $teaser = $body; - } - $body = &$item['CONTENT'][0]['VALUE']; - } - } - else if ($item['CONTENT:ENCODED']) { // Don't know where it came from but it can be found in RSS 2.0 feeds - if (strlen($body) < strlen($item['CONTENT:ENCODED'][0]['VALUE'])) { - if ($body) { - $teaser = $body; - } - $body = &$item['CONTENT:ENCODED'][0]['VALUE']; - } - } - - /* - ** Resolve the item's title. If no title is found, we use - ** up to 40 characters of the description ending at a word - ** boundary but not splitting potential entities. - */ - if (!($title = $item['TITLE'][0]['VALUE'])) { - $title = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($body, 40)); - } - - // If title was "escaped" then it may still contain entities, becuase each & from entity was also escabet to & before - // TODO: the same for content? - if ($item['TITLE'][0]['MODE'] == 'escaped') { - $title = planet_replace_entities($title); - } - $title = strip_tags($title); - - /* - ** Resolve the items link. 
- */ - if ($item['LINK']) { - // TODO: remove this Atom hack when we have field mapping or at least specialized parsers in place - if (count($item['LINK']) > 1) { - $link = $feed->link; - foreach ($item['LINK'] as $temp) { - if ($temp['REL'] == 'alternate') { - $link = $temp[$link_field]; - } - } - } - else { - $link = $item['LINK'][0][$link_field]; - } - } - elseif ($item['GUID'] && (strncmp($item['GUID'][0][$link_field], 'http://', 7) == 0) && $item['GUID'][0]['ISPERMALINK'] != 'false') { - $link = $item['GUID'][0][$link_field]; - } - else { - $link = $feed->link; - } - - /* - ** Resolve the items source. - */ - if ($item['SOURCE'][0]['VALUE'] && $item['SOURCE'][0]['URL']) { // RSS 2.0 - $source_title = &$item['SOURCE'][0]['VALUE']; - $source_link = &$item['SOURCE'][0]['URL']; - } - else if ($item['SOURCE'] || $item['ATOM:SOURCE']) { // ATOM 1.0 - if ($item['SOURCE'][0]['TITLE']) $source_title = &$item['SOURCE'][0]['TITLE'][0]['VALUE']; - else if ($item['SOURCE'][0]['ATOM:TITLE']) $source_title = &$item['SOURCE'][0]['ATOM:TITLE'][0]['VALUE']; - if ($item['SOURCE'][0]['LINK']) $source_link = &$item['SOURCE'][0]['LINK'][0]['VALUE']; - else if ($item['SOURCE'][0]['ATOM:LINK']) $source_link = &$item['SOURCE'][0]['ATOM:LINK'][0]['VALUE']; - } - else { - $source_title = ''; - $source_link = ''; - } - - /* - ** Try to resolve and parse the item's publication date. If no - ** date is found, we use the current date instead. 
- */ - // TODO: find nicer way for handling namespaces ;) - if ($item['PUBDATE']) $date = $item['PUBDATE'][0]['VALUE']; // RSS 2.0 - else if ($item['DC:DATE']) $date = $item['DC:DATE'][0]['VALUE']; // Dublin core - else if ($item['DATE']) $date = $item['DATE'][0]['VALUE']; // Dublin core - else if ($item['DCTERMS:ISSUED']) $date = $item['DCTERMS:ISSUED'][0]['VALUE']; // Dublin core - else if ($item['ISSUED']) $date = $item['ISSUED'][0]['VALUE']; // Dublin core - else if ($item['DCTERMS:CREATED']) $date = $item['DCTERMS:CREATED'][0]['VALUE']; // Dublin core - else if ($item['CREATED']) $date = $item['CREATED'][0]['VALUE']; // Dublin core - else if ($item['DCTERMS:MODIFIED']) $date = $item['DCTERMS:MODIFIED'][0]['VALUE']; // Dublin core - else if ($item['MODIFIED']) $date = $item['MODIFIED'][0]['VALUE']; // Dublin core - else if ($item['ATOM:UPDATED']) $date = $item['ATOM:UPDATED'][0]['VALUE']; // Atom - else if ($item['UPDATED']) $date = $item['UPDATED'][0]['VALUE']; // Atom - else $date = 'now'; - - if ($feed->item_date_source == FEEDS_ITEM_DATE_SNIFFED && $date) { - $timestamp = strtotime($date); // strtotime() returns -1 on failure - if ($timestamp < 0) { - $timestamp = planet_parse_w3cdtf($date); // also returns -1 on failure - if ($timestamp < 0) { - $timestamp = time(); // better than nothing - } - } - } - else { - $timestamp = time(); - } - - // Ignore items older than allowed for feed - if ($timestamp < $time_horizont) { - continue; - } - - /* - ** Save this item. Try to avoid duplicate entries as much as - ** possible. If we find a duplicate entry, we resolve it and - ** pass along it's ID such that we can update it if needed. 
- */ - // Try to use RSS:GUID/ATOM:ID as unique identifier - $guid = ''; - if ($item['GUID'][0]['VALUE']) { // RSS 2.0 - $guid = $item['GUID'][0]['VALUE']; - } - else if ($item['ATOM:ID'][0]['VALUE']) { // ATOM 0.3, 1.0 - $guid = $item['ATOM:ID'][0]['VALUE']; - } - else if ($item['ID'][0]['VALUE']) { // ATOM 0.3, 1.0 - $guid = $item['ID'][0]['VALUE']; - } - else { - // feed may contain duplicated links for different items, so we try to generate unique ID for each item - $guid = md5("$title - . " . $feed->fid); - } - // TODO: is there anyway to check if DC:IDENTIFIER is unique? - // http://dublincore.org/documents/usageguide/elements.shtml says it can be non-unique so useles for us :( - - $entry = NULL; - if ($guid && strlen($guid) > 0) { - $entry = db_fetch_object(db_query("SELECT nid FROM {planet_items} WHERE guid = '%s' AND fid = %d", $guid, $feed->fid)); - } - else if ($link && $link != $feed->link && $link != $feed->url) { - $entry = db_fetch_object(db_query("SELECT nid FROM {planet_items} WHERE guid = '%s' AND fid = %d", $link, $feed->fid)); - } - else { - $entry = db_fetch_object(db_query("SELECT ai.nid AS nid FROM {node} n, {planet_items} ai WHERE ai.fid = %d AND ai.nid = n.nid AND n.title = '%s'", $feed->fid, $title)); - } - - //print $guid .'
'. print_r($entry, 1) .''; - node_save($entry); - db_query('INSERT INTO {planet_items} (fid, nid, guid, link, created) VALUES(%d, %d, "%s", "%s", UNIX_TIMESTAMP(NOW()))', $feed->fid, $entry->nid, $guid, $link); - watchdog('planet', 'Adding '. $title); - drupal_set_message('Adding '. $title); - } - } - - return $items_added; -} - - -/** - * Private function; parses given XML data and returns array - */ -function planet_parse_xml(&$data) { - global $xml_tree, $xml_paths, $xml_path_cur; - $xml_tree = array(); - $xml_paths[] = &$xml_tree; - $xml_path_cur = 0; - - $_start = microtime(); - - // Some feeds already use CDATA but in "wrong way": http://www.rocketboom.com/vlog/quicktime_daily_enclosures.xml (ie.