Skip to content

Commit

Permalink
Improved the handling of errors while parsing recursively. Updated tests.
Browse files Browse the repository at this point in the history

+ various formatting fixes
  • Loading branch information
JanPetterMG committed Aug 6, 2019
1 parent 4dea4e1 commit a10bd4a
Show file tree
Hide file tree
Showing 14 changed files with 63 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
sudo: false
language: php
php:
- 7.3
- 7.2
- 7.1
- 7.0
- 5.6
- hhvm
install:
- composer install
after_script:
Expand Down
40 changes: 24 additions & 16 deletions src/SitemapParser.php
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
<?php

namespace vipnytt;

use GuzzleHttp;
use SimpleXMLElement;
use vipnytt\SitemapParser\Exceptions\SitemapParserException;
use vipnytt\SitemapParser\Exceptions;
use vipnytt\SitemapParser\UrlParser;

/**
Expand Down Expand Up @@ -101,13 +102,13 @@ class SitemapParser
*
* @param string $userAgent User-Agent to send with every HTTP(S) request
* @param array $config Configuration options
* @throws SitemapParserException
* @throws Exceptions\SitemapParserException
*/
public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config = [])
{
mb_language("uni");
if (!mb_internal_encoding(self::ENCODING)) {
throw new SitemapParserException('Unable to set internal character encoding to `' . self::ENCODING . '`');
throw new Exceptions\SitemapParserException('Unable to set internal character encoding to `' . self::ENCODING . '`');
}
$this->userAgent = $userAgent;
$this->config = $config;
Expand All @@ -118,15 +119,20 @@ public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config
*
* @param string $url
* @return void
* @throws SitemapParserException
* @throws Exceptions\SitemapParserException
*/
public function parseRecursive($url)
{
$this->addToQueue([$url]);
while (count($todo = $this->getQueue()) > 0) {
$sitemaps = $this->sitemaps;
$urls = $this->urls;
$this->parse($todo[0]);
try {
$this->parse($todo[0]);
} catch (Exceptions\TransferException $e) {
// Keep crawling
continue;
}
$this->sitemaps = array_merge_recursive($sitemaps, $this->sitemaps);
$this->urls = array_merge_recursive($urls, $this->urls);
}
Expand Down Expand Up @@ -161,14 +167,15 @@ public function getQueue()
* @param string $url URL to parse
* @param string|null $urlContent URL body content (provide to skip download)
* @return void
* @throws SitemapParserException
* @throws Exceptions\TransferException
* @throws Exceptions\SitemapParserException
*/
public function parse($url, $urlContent = null)
{
$this->clean();
$this->currentURL = $url;
$response = (is_string($urlContent)) ? $urlContent : $this->getContent();
$this->history[] = $this->currentURL;
$response = is_string($urlContent) ? $urlContent : $this->getContent();
if ($this->urlValidate($this->currentURL) && parse_url($this->currentURL, PHP_URL_PATH) === self::ROBOTSTXT_PATH) {
$this->parseRobotstxt($response);
return;
Expand Down Expand Up @@ -201,13 +208,14 @@ protected function clean()
* Request the body content of a URL
*
* @return string Raw body content
* @throws SitemapParserException
* @throws Exceptions\TransferException
* @throws Exceptions\SitemapParserException
*/
protected function getContent()
{
$this->currentURL = $this->urlEncode($this->currentURL);
if (!$this->urlValidate($this->currentURL)) {
throw new SitemapParserException('Invalid URL');
throw new Exceptions\SitemapParserException('Invalid URL');
}
try {
if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
Expand All @@ -217,9 +225,9 @@ protected function getContent()
$res = $client->request('GET', $this->currentURL, $this->config['guzzle']);
return $res->getBody();
} catch (GuzzleHttp\Exception\TransferException $e) {
if (stripos($e->getMessage(), 'cURL error 6:') === false && $e->getCode() != 404) {
throw new SitemapParserException($e->getMessage());
}
throw new Exceptions\TransferException('Unable to fetch URL contents', 0, $e);
} catch (GuzzleHttp\Exception\GuzzleException $e) {
throw new Exceptions\SitemapParserException('GuzzleHttp exception', 0, $e);
}
}

Expand Down Expand Up @@ -309,7 +317,7 @@ protected function generateXMLObject($xml)
// strip XML comments from files
// if they occur at the beginning of the file it will invalidate the XML
// this occurs with certain versions of Yoast
$xml = preg_replace('/\s*\<\!\-\-((?!\-\-\>)[\s\S])*\-\-\>\s*/', '', (string) $xml);
$xml = preg_replace('/\s*\<\!\-\-((?!\-\-\>)[\s\S])*\-\-\>\s*/', '', (string)$xml);
try {
libxml_use_internal_errors(true);
return new SimpleXMLElement($xml, LIBXML_NOCDATA);
Expand Down Expand Up @@ -351,9 +359,9 @@ protected function isSitemapURL($url)
{
$path = parse_url($this->urlEncode($url), PHP_URL_PATH);
return $this->urlValidate($url) && (
mb_substr($path, -mb_strlen(self::XML_EXTENSION) - 1) == '.' . self::XML_EXTENSION ||
mb_substr($path, -mb_strlen(self::XML_EXTENSION_COMPRESSED) - 1) == '.' . self::XML_EXTENSION_COMPRESSED
);
mb_substr($path, -mb_strlen(self::XML_EXTENSION) - 1) == '.' . self::XML_EXTENSION ||
mb_substr($path, -mb_strlen(self::XML_EXTENSION_COMPRESSED) - 1) == '.' . self::XML_EXTENSION_COMPRESSED
);
}

/**
Expand Down
1 change: 1 addition & 0 deletions src/SitemapParser/Exceptions/SitemapParserException.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Exceptions;

use Exception;
Expand Down
13 changes: 13 additions & 0 deletions src/SitemapParser/Exceptions/TransferException.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?php

namespace vipnytt\SitemapParser\Exceptions;

/**
 * TransferException class
 *
 * Exception type used to signal that the contents of a URL could not be
 * fetched. It extends SitemapParserException, so handlers that catch the
 * base type also catch this one; callers that want to treat a failed fetch
 * differently from other parser errors (for example, to skip the URL and
 * continue with the next one) can catch this subclass specifically.
 *
 * The class declares no members of its own; it exists only so that this
 * failure can be distinguished by type.
 *
 * @license https://opensource.org/licenses/MIT MIT license
 * @link https://github.com/VIPnytt/SitemapParser
 */
class TransferException extends SitemapParserException
{
}
1 change: 1 addition & 0 deletions src/SitemapParser/UrlParser.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser;

/**
Expand Down
3 changes: 1 addition & 2 deletions tests/DownloadTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ public function testDownload($url)
* Generate test data
* @return array
*/
public
function generateDataForTest()
public function generateDataForTest()
{
return [
[
Expand Down
1 change: 1 addition & 0 deletions tests/ExceptionEncodingTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
Expand Down
1 change: 1 addition & 0 deletions tests/InvalidURLTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
Expand Down
12 changes: 10 additions & 2 deletions tests/RecursiveTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
Expand Down Expand Up @@ -36,11 +37,18 @@ public function testRecursive($url)
* Generate test data
* @return array
*/
public
function generateDataForTest()
public function generateDataForTest()
{
return [
[
'https://edenapartmentsqueenanne.com/sitemap_index.xml',
'https://livingnongmo.org/sitemap.xml',
'https://loganwestom.com/sitemap_index.xml',
'https://sawyerflats.com/sitemap.xml',
'https://www.bellinghambaymarathon.org/sitemap_index.xml',
'https://www.coachforteens.com/sitemap_index.xml',
'https://www.hallerpostapts.com/sitemap_index.xml',
'https://www.nongmoproject.org/sitemap.xml',
'https://www.xml-sitemaps.com/robots.txt',
]
];
Expand Down
4 changes: 2 additions & 2 deletions tests/RobotsTxtTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
Expand All @@ -25,8 +26,7 @@ public function testRobotsTxt($url, $body, $result)
* Generate test data
* @return array
*/
public
function generateDataForTest()
public function generateDataForTest()
{
return [
[
Expand Down
4 changes: 2 additions & 2 deletions tests/SitemapIndexTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
Expand All @@ -25,8 +26,7 @@ public function testSitemapIndex($url, $body, $result)
* Generate test data
* @return array
*/
public
function generateDataForTest()
public function generateDataForTest()
{
return [
[
Expand Down
4 changes: 2 additions & 2 deletions tests/StrictTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
Expand All @@ -24,8 +25,7 @@ public function testStrict($url, $body)
* Generate test data
* @return array
*/
public
function generateDataForTest()
public function generateDataForTest()
{
return [
[
Expand Down
4 changes: 2 additions & 2 deletions tests/StringTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
Expand Down Expand Up @@ -37,8 +38,7 @@ public function testString($url)
* Generate test data
* @return array
*/
public
function generateDataForTest()
public function generateDataForTest()
{
return [
[
Expand Down
4 changes: 2 additions & 2 deletions tests/URLSetTest.php
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
Expand All @@ -25,8 +26,7 @@ public function testURLSet($url, $body, $result)
* Generate test data
* @return array
*/
public
function generateDataForTest()
public function generateDataForTest()
{
return [
[
Expand Down

0 comments on commit a10bd4a

Please sign in to comment.