diff --git a/README.md b/README.md index eaafb3b..304267e 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Then run `composer update`. - XML `.xml` - Compressed XML `.xml.gz` - Robots.txt rule sheet `robots.txt` -- Plain text +- Line separated list in plain text ## Getting Started @@ -113,17 +113,17 @@ try { } ``` -### Parsing of plain text strings -__Note: This is disabled by default__ to avoid false positives when parsing XML documents but get something else in return. +### Parsing of line separated text strings +__Note: This is disabled by default__ to avoid false positives when expecting XML, but get some plain text in return. -To disable `strict` standards, simply pass this configuration into the constructor: ````['strict' => false]````. +To disable `strict` standards, simply pass this configuration to constructor parameter #2: ````['strict' => false]````. ```php use vipnytt\SitemapParser; use vipnytt\SitemapParser\Exceptions\SitemapParserException; try { $parser = new SitemapParser('MyCustomUserAgent', ['strict' => false]); - $parser->parse('http://www.example.com/?format=sitemap'); + $parser->parse('https://www.xml-sitemaps.com/urllist.txt'); foreach ($parser->getSitemaps() as $url => $tags) { echo $url . '
'; } diff --git a/src/SitemapParser.php b/src/SitemapParser.php index 1ef90b4..3cdec8e 100644 --- a/src/SitemapParser.php +++ b/src/SitemapParser.php @@ -16,6 +16,41 @@ */ class SitemapParser { + /** + * Default encoding + */ + const ENCODING = 'UTF-8'; + + /** + * XML file extension + */ + const XML_EXTENSION = '.xml'; + + /** + * Compressed XML file extension + */ + const XML_EXTENSION_COMPRESSED = '.xml.gz'; + + /** + * XML Sitemap tag + */ + const XML_TAG_SITEMAP = 'sitemap'; + + /** + * XML URL tag + */ + const XML_TAG_URL = 'url'; + + /** + * Robots.txt path + */ + const ROBOTSTXT_PATH = '/robots.txt'; + + /** + * Robots.txt sitemap prefix + */ + const ROBOTSTXT_PREFIX = 'Sitemap:'; + /** * User-Agent to send with every HTTP(S) request * @var string @@ -74,8 +109,8 @@ public function __construct($userAgent = 'SitemapParser', $config = []) throw new SitemapParserException('The extension `mbstring` must be installed and loaded for this library'); } mb_language("uni"); - if (!mb_internal_encoding('UTF-8')) { - throw new SitemapParserException('Unable to set internal character encoding to UTF-8'); + if (!mb_internal_encoding(self::ENCODING)) { + throw new SitemapParserException('Unable to set internal character encoding to `' . self::ENCODING . '`'); } $this->userAgent = $userAgent; $this->config = $config; @@ -137,7 +172,7 @@ public function parse($url, $urlContent = null) $this->currentURL = $url; $response = (is_string($urlContent)) ? $urlContent : $this->getContent(); $this->history[] = $this->currentURL; - if (parse_url($this->currentURL, PHP_URL_PATH) == '/robots.txt') { + if (parse_url($this->currentURL, PHP_URL_PATH) === self::ROBOTSTXT_PATH) { $this->parseRobotstxt($response); return; } @@ -150,12 +185,8 @@ public function parse($url, $urlContent = null) $this->parseString($response); return; } - if (isset($sitemapJson->sitemap)) { - $this->parseJson('sitemap', $sitemapJson->sitemap); - } - if (isset($sitemapJson->url)) { - $this->parseJson('url', $sitemapJson->url); - } + $this->parseJson(self::XML_TAG_SITEMAP, $sitemapJson); + $this->parseJson(self::XML_TAG_URL, $sitemapJson); } /** @@ -196,17 +227,22 @@ protected function getContent() * Search for sitemaps in the robots.txt content * * @param string $robotstxt - * @return void + * @return bool */ protected function parseRobotstxt($robotstxt) { - preg_match_all('#Sitemap:*(.*)#', $robotstxt, $match); - if (isset($match[1])) { - foreach ($match[1] as $sitemap) { - $sitemap = trim($sitemap); - $this->addArray('sitemap', ['loc' => $sitemap]); + $array = array_map('trim', preg_split('/\R/', $robotstxt)); + foreach ($array as $line) { + if (mb_stripos($line, self::ROBOTSTXT_PREFIX) === 0) { + $url = mb_substr($line, mb_strlen(self::ROBOTSTXT_PREFIX)); + if (($pos = mb_stripos($url, '#')) !== false) { + $url = mb_substr($url, 0, $pos); + } + $url = preg_split('/\s+/', trim($url))[0]; + $this->addArray('sitemap', ['loc' => $url]); } } + return true; } /** @@ -220,10 +256,10 @@ protected function addArray($type, $array) { if (isset($array['loc']) && filter_var($array['loc'], FILTER_VALIDATE_URL) !== false) { switch ($type) { - case 'sitemap': + case self::XML_TAG_SITEMAP: $this->sitemaps[$array['loc']] = $array; return true; - case 'url': + case self::XML_TAG_URL: $this->urls[$array['loc']] = $array; return true; } @@ -248,7 +284,7 @@ protected function generateXMLObject($xml) } /** - * Parse plain text + * Parse line separated text string * * @param string $string * @return bool @@ -256,19 +292,16 @@ protected function generateXMLObject($xml) protected function parseString($string) { if (!isset($this->config['strict']) || $this->config['strict'] !== false) { - // Strings are not part of any sitemap standard + // Strings are not part of any documented sitemap standard return false; } - $offset = 0; - while (preg_match('/(\S+)/', $string, $match, PREG_OFFSET_CAPTURE, $offset)) { - $offset = $match[0][1] + strlen($match[0][0]); - if (filter_var($match[0][0], FILTER_VALIDATE_URL) !== false) { - if ($this->isSitemapURL($match[0][0])) { - $this->addArray('sitemap', ['loc' => $match[0][0]]); - continue; - } - $this->addArray('url', ['loc' => $match[0][0]]); + $array = array_map('trim', preg_split('/\R/', $string)); + foreach ($array as $line) { + if ($this->isSitemapURL($line)) { + $this->addArray(self::XML_TAG_SITEMAP, ['loc' => $line]); + continue; } + $this->addArray(self::XML_TAG_URL, ['loc' => $line]); } return true; } @@ -283,8 +316,8 @@ protected function isSitemapURL($url) { $path = parse_url($url, PHP_URL_PATH); return filter_var($url, FILTER_VALIDATE_URL) !== false && ( - substr($path, -4) === ".xml" || - substr($path, -7) === '.xml.gz' + substr($path, -strlen(self::XML_EXTENSION)) === self::XML_EXTENSION || + substr($path, -strlen(self::XML_EXTENSION_COMPRESSED)) === self::XML_EXTENSION_COMPRESSED ); } @@ -293,13 +326,17 @@ protected function isSitemapURL($url) * * @param string $type Sitemap or URL * @param \SimpleXMLElement $json object - * @return void + * @return bool */ protected function parseJson($type, $json) { - foreach ($json as $url) { + if (!isset($json->$type)) { + return false; + } + foreach ($json->$type as $url) { $this->addArray($type, (array)$url); } + return true; } /** diff --git a/tests/RobotsTxtTest.php b/tests/RobotsTxtTest.php index f10b266..77481aa 100644 --- a/tests/RobotsTxtTest.php +++ b/tests/RobotsTxtTest.php @@ -33,7 +33,8 @@ function generateDataForTest() << false]); $this->assertInstanceOf('vipnytt\SitemapParser', $parser); - $parser->parse($url, $body); - $this->assertEquals($result['sitemaps'], $parser->getSitemaps()); - $this->assertEquals($result['urls'], $parser->getURLs()); + $parser->parse($url); + $this->assertTrue(is_array($parser->getSitemaps())); + $this->assertTrue(is_array($parser->getURLs())); + $this->assertTrue(count($parser->getSitemaps()) > 1); + $this->assertTrue(count($parser->getURLs()) >= 1000); + foreach ($parser->getSitemaps() as $url => $tags) { + $this->assertTrue(is_string($url)); + $this->assertTrue(is_array($tags)); + $this->assertTrue($url === $tags['loc']); + $this->assertNotFalse(filter_var($url, FILTER_VALIDATE_URL)); + } + foreach ($parser->getURLs() as $url => $tags) { + $this->assertTrue(is_string($url)); + $this->assertTrue(is_array($tags)); + $this->assertTrue($url === $tags['loc']); + $this->assertNotFalse(filter_var($url, FILTER_VALIDATE_URL)); + } } /** @@ -29,38 +41,7 @@ function generateDataForTest() { return [ [ - 'http://www.example.com/sitemap.txt', - << [ - 'http://www.example.com/sitemap1.xml' => [ - 'loc' => 'http://www.example.com/sitemap1.xml', - ], - 'http://www.example.com/sitemap2.xml' => [ - 'loc' => 'http://www.example.com/sitemap2.xml', - ], - 'http://www.example.com/sitemap3.xml.gz' => [ - 'loc' => 'http://www.example.com/sitemap3.xml.gz', - ], - ], - 'urls' => [ - 'http://www.example.com/page1/' => [ - 'loc' => 'http://www.example.com/page1/', - ], - 'http://www.example.com/page2/' => [ - 'loc' => 'http://www.example.com/page2/', - ], - 'http://www.example.com/page3/file.gz' => [ - 'loc' => 'http://www.example.com/page3/file.gz', - ], - ], - ], + 'https://www.xml-sitemaps.com/urllist.txt', ] ]; }