From a10bd4a020b8edc650b9b0f7462de04856a5ca1c Mon Sep 17 00:00:00 2001 From: jpg Date: Tue, 6 Aug 2019 19:03:16 +0200 Subject: [PATCH] Improved the handling of errors while parsing recursively. Updated tests. + various formatting fixes --- .travis.yml | 2 +- src/SitemapParser.php | 40 +++++++++++-------- .../Exceptions/SitemapParserException.php | 1 + .../Exceptions/TransferException.php | 13 ++++++ src/SitemapParser/UrlParser.php | 1 + tests/DownloadTest.php | 3 +- tests/ExceptionEncodingTest.php | 1 + tests/InvalidURLTest.php | 1 + tests/RecursiveTest.php | 12 +++++- tests/RobotsTxtTest.php | 4 +- tests/SitemapIndexTest.php | 4 +- tests/StrictTest.php | 4 +- tests/StringTest.php | 4 +- tests/URLSetTest.php | 4 +- 14 files changed, 63 insertions(+), 31 deletions(-) create mode 100644 src/SitemapParser/Exceptions/TransferException.php diff --git a/.travis.yml b/.travis.yml index 28e281f..23eff7e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,11 @@ sudo: false language: php php: + - 7.3 - 7.2 - 7.1 - 7.0 - 5.6 - - hhvm install: - composer install after_script: diff --git a/src/SitemapParser.php b/src/SitemapParser.php index 0c242d6..4bea782 100644 --- a/src/SitemapParser.php +++ b/src/SitemapParser.php @@ -1,9 +1,10 @@ userAgent = $userAgent; $this->config = $config; @@ -118,7 +119,7 @@ public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config * * @param string $url * @return void - * @throws SitemapParserException + * @throws Exceptions\SitemapParserException */ public function parseRecursive($url) { @@ -126,7 +127,12 @@ public function parseRecursive($url) while (count($todo = $this->getQueue()) > 0) { $sitemaps = $this->sitemaps; $urls = $this->urls; - $this->parse($todo[0]); + try { + $this->parse($todo[0]); + } catch (Exceptions\TransferException $e) { + // Keep crawling + continue; + } $this->sitemaps = array_merge_recursive($sitemaps, $this->sitemaps); $this->urls = array_merge_recursive($urls, $this->urls); } @@ -161,14 +167,15 @@ public function getQueue() * @param string $url URL to parse * @param string|null $urlContent URL body content (provide to skip download) * @return void - * @throws SitemapParserException + * @throws Exceptions\TransferException + * @throws Exceptions\SitemapParserException */ public function parse($url, $urlContent = null) { $this->clean(); $this->currentURL = $url; - $response = (is_string($urlContent)) ? $urlContent : $this->getContent(); $this->history[] = $this->currentURL; + $response = is_string($urlContent) ? $urlContent : $this->getContent(); if ($this->urlValidate($this->currentURL) && parse_url($this->currentURL, PHP_URL_PATH) === self::ROBOTSTXT_PATH) { $this->parseRobotstxt($response); return; @@ -201,13 +208,14 @@ protected function clean() * Request the body content of an URL * * @return string Raw body content - * @throws SitemapParserException + * @throws Exceptions\TransferException + * @throws Exceptions\SitemapParserException */ protected function getContent() { $this->currentURL = $this->urlEncode($this->currentURL); if (!$this->urlValidate($this->currentURL)) { - throw new SitemapParserException('Invalid URL'); + throw new Exceptions\SitemapParserException('Invalid URL'); } try { if (!isset($this->config['guzzle']['headers']['User-Agent'])) { @@ -217,9 +225,9 @@ protected function getContent() $res = $client->request('GET', $this->currentURL, $this->config['guzzle']); return $res->getBody(); } catch (GuzzleHttp\Exception\TransferException $e) { - if (stripos($e->getMessage(), 'cURL error 6:') === false && $e->getCode() != 404) { - throw new SitemapParserException($e->getMessage()); - } + throw new Exceptions\TransferException('Unable to fetch URL contents', 0, $e); + } catch (GuzzleHttp\Exception\GuzzleException $e) { + throw new Exceptions\SitemapParserException('GuzzleHttp exception', 0, $e); } } @@ -309,7 +317,7 @@ protected function generateXMLObject($xml) // strip XML comments from files // if they occur at the beginning of the file it will invalidate the XML // this occurs with certain versions of Yoast - $xml = preg_replace('/\s*\<\!\-\-((?!\-\-\>)[\s\S])*\-\-\>\s*/', '', (string) $xml); + $xml = preg_replace('/\s*\<\!\-\-((?!\-\-\>)[\s\S])*\-\-\>\s*/', '', (string)$xml); try { libxml_use_internal_errors(true); return new SimpleXMLElement($xml, LIBXML_NOCDATA); @@ -351,9 +359,9 @@ protected function isSitemapURL($url) { $path = parse_url($this->urlEncode($url), PHP_URL_PATH); return $this->urlValidate($url) && ( - mb_substr($path, -mb_strlen(self::XML_EXTENSION) - 1) == '.' . self::XML_EXTENSION || - mb_substr($path, -mb_strlen(self::XML_EXTENSION_COMPRESSED) - 1) == '.' . self::XML_EXTENSION_COMPRESSED - ); + mb_substr($path, -mb_strlen(self::XML_EXTENSION) - 1) == '.' . self::XML_EXTENSION || + mb_substr($path, -mb_strlen(self::XML_EXTENSION_COMPRESSED) - 1) == '.' . self::XML_EXTENSION_COMPRESSED + ); } /** diff --git a/src/SitemapParser/Exceptions/SitemapParserException.php b/src/SitemapParser/Exceptions/SitemapParserException.php index 8eec58b..8a1acea 100644 --- a/src/SitemapParser/Exceptions/SitemapParserException.php +++ b/src/SitemapParser/Exceptions/SitemapParserException.php @@ -1,4 +1,5 @@