plugins/FeedsHTTPFetcher.inc | 116 +++++++++++++++++++++++++++++++++++++++++- 1 files changed, 115 insertions(+), 1 deletions(-) diff --git a/plugins/FeedsHTTPFetcher.inc b/plugins/FeedsHTTPFetcher.inc index 7dbb38e..434dad2 100644 --- a/plugins/FeedsHTTPFetcher.inc +++ b/plugins/FeedsHTTPFetcher.inc @@ -30,7 +30,44 @@ class FeedsHTTPBatch extends FeedsImportBatch { feeds_include_library('http_request.inc', 'http_request'); $result = http_request_get($this->url); if (!in_array($result->code, array(200, 201, 202, 203, 204, 205, 206))) { - throw new Exception(t('Download of @url failed with code !code.', array('@url' => $this->url, '!code' => $result->code))); + if ($result->code == 301) { + $curl = FALSE; + if (http_request_use_curl()){ + $curl = TRUE; + // Redirected URL is already present in response information + $redirected = $result->headers['Location']; + } + else{ + // Get redirected URL without use of CURL functions + $redirected = $this->get_final_url(); + } + // Update the URL in the FeedsSource object and save it to the database + $src = feeds_source('feed', $this->feed_nid); + $config = $src->getConfig(); + $config[key($config)]['source'] = $redirected; + $src->setConfig($config); + $src->save(); + // Log the code + $messageStr = t('URL of feed !nid redirected to @redirected -- feed updated', array('!nid' => $this->feed_nid, '@redirected' => $redirected)); + watchdog('feeds_log', $messageStr); + if (!$curl){ + throw new Exception($messageStr); + } + } + else { + // We also want to unpublish nodes that produce bad status codes. We can't do this + // via node_load and node_save, however, because feeds_nodeapi width $op == 'update' + // expects $node to come from a form, which will set 'feeds'. Instead, just modify + // the status field directly in the database. + db_query("UPDATE {node} SET status = %d WHERE nid = %d", 0, $this->feed_nid); + // If a 302 error brought us here and CURL is enabled, we don't need to throw an exception, + // since CURL enables us to automatically follow the redirect and still retrieve + // content. Only throw an exception if the URL can't be followed and there is + // therefore no data: + if (!($result->code == 302 && $curl)){ + throw new Exception(t('Download of @url failed with code !code.', array('@url' => $this->url, '!code' => $result->code))); + } + } } return $result->data; } @@ -301,4 +338,81 @@ class PuSHEnvironment implements PuSHSubscriberEnvironmentInterface { feeds_dbg($msg); watchdog('FeedsHTTPFetcher', $msg, array(), $severity); } + + // These 3 methods were written by the blogger at w-shadow.com. + // This technique was chosen over using CURL, because not all servers have CURL enabled + // and some users may not have the power to enable it. + // To see the original post where this code is presented, visit: + // http://w-shadow.com/blog/2008/07/05/how-to-get-redirect-url-in-php/ + + /** + * Gets the address that the provided URL points to, or FALSE if there's no redirect. + * This code came from a blog post at w-shadow.com. + * See post: http://w-shadow.com/blog/2008/07/05/how-to-get-redirect-url-in-php/ + * + * @param url + * @return string + */ + public function get_redirect_url($url){ + $redirect_url = null; + $url_parts = @parse_url($url); + if (!$url_parts) return false; + if (!isset($url_parts['host'])) return false; //can't process relative URLs + if (!isset($url_parts['path'])) $url_parts['path'] = '/'; + $sock = fsockopen($url_parts['host'], (isset($url_parts['port']) ? (int)$url_parts['port'] : 80), $errno, $errstr, 30); + if (!$sock) return false; + $request = "HEAD " . $url_parts['path'] . (isset($url_parts['query']) ? '?'.$url_parts['query'] : '') . " HTTP/1.1\r\n"; + $request .= 'Host: ' . $url_parts['host'] . "\r\n"; + $request .= "Connection: Close\r\n\r\n"; + fwrite($sock, $request); + $response = ''; + while(!feof($sock)) $response .= fread($sock, 8192); + fclose($sock); + if (preg_match('/^Location: (.+?)$/m', $response, $matches)){ + if ( substr($matches[1], 0, 1) == "/" ) + return $url_parts['scheme'] . "://" . $url_parts['host'] . trim($matches[1]); + else + return trim($matches[1]); + } + else { + return false; + } + } + + /** + * Follows and collects all redirects, in order, for the given URL. + * This code came from a blog post at w-shadow.com. + * See post: http://w-shadow.com/blog/2008/07/05/how-to-get-redirect-url-in-php/ + * + * @param url + * @return array + */ + public function get_all_redirects($url){ + $redirects = array(); + while ($newurl = $this->get_redirect_url($url)){ + if (in_array($newurl, $redirects)){ + break; + } + $redirects[] = $newurl; + $url = $newurl; + } + return $redirects; + } + + /** + * Gets the final address that the URL leads to. + * This code came from a blog post at w-shadow.com. + * See post: http://w-shadow.com/blog/2008/07/05/how-to-get-redirect-url-in-php/ + * + * Returns the original url if it isn't really a redirect. + * + */ + public function get_final_url(){ + $redirects = $this->get_all_redirects($this->url); + if (count($redirects) > 0){ + return array_pop($redirects); + } else { + return $url; + } + } }