Skip to content

Commit

Permalink
Improved robots.txt sitemap parser
Browse files Browse the repository at this point in the history
+ Minor code style improvements
  • Loading branch information
JanPetterMG committed Apr 5, 2016
1 parent ab90907 commit 462d3d3
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 78 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Then run `composer update`.
- XML `.xml`
- Compressed XML `.xml.gz`
- Robots.txt rule sheet `robots.txt`
- Plain text
- Line-separated list in plain text


## Getting Started
Expand Down Expand Up @@ -113,17 +113,17 @@ try {
}
```

### Parsing of plain text strings
__Note: This is disabled by default__ to avoid false positives when parsing XML documents but get something else in return.
### Parsing of line-separated text strings
__Note: This is disabled by default__ to avoid false positives when XML is expected but plain text is returned instead.

To disable `strict` standards, simply pass this configuration into the constructor: ````['strict' => false]````.
To disable `strict` standards, pass this configuration as the second constructor argument: ````['strict' => false]````.
```php
use vipnytt\SitemapParser;
use vipnytt\SitemapParser\Exceptions\SitemapParserException;

try {
$parser = new SitemapParser('MyCustomUserAgent', ['strict' => false]);
$parser->parse('http://www.example.com/?format=sitemap');
$parser->parse('https://www.xml-sitemaps.com/urllist.txt');
foreach ($parser->getSitemaps() as $url => $tags) {
echo $url . '<br>';
}
Expand Down
101 changes: 69 additions & 32 deletions src/SitemapParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,41 @@
*/
class SitemapParser
{
/**
* Default encoding
*/
const ENCODING = 'UTF-8';

/**
* XML file extension
*/
const XML_EXTENSION = '.xml';

/**
* Compressed XML file extension
*/
const XML_EXTENSION_COMPRESSED = '.xml.gz';

/**
* XML Sitemap tag
*/
const XML_TAG_SITEMAP = 'sitemap';

/**
* XML URL tag
*/
const XML_TAG_URL = 'url';

/**
* Robots.txt path
*/
const ROBOTSTXT_PATH = '/robots.txt';

/**
* Robots.txt sitemap prefix
*/
const ROBOTSTXT_PREFIX = 'Sitemap:';

/**
* User-Agent to send with every HTTP(S) request
* @var string
Expand Down Expand Up @@ -74,8 +109,8 @@ public function __construct($userAgent = 'SitemapParser', $config = [])
throw new SitemapParserException('The extension `mbstring` must be installed and loaded for this library');
}
mb_language("uni");
if (!mb_internal_encoding('UTF-8')) {
throw new SitemapParserException('Unable to set internal character encoding to UTF-8');
if (!mb_internal_encoding(self::ENCODING)) {
throw new SitemapParserException('Unable to set internal character encoding to `' . self::ENCODING . '`');
}
$this->userAgent = $userAgent;
$this->config = $config;
Expand Down Expand Up @@ -137,7 +172,7 @@ public function parse($url, $urlContent = null)
$this->currentURL = $url;
$response = (is_string($urlContent)) ? $urlContent : $this->getContent();
$this->history[] = $this->currentURL;
if (parse_url($this->currentURL, PHP_URL_PATH) == '/robots.txt') {
if (parse_url($this->currentURL, PHP_URL_PATH) === self::ROBOTSTXT_PATH) {
$this->parseRobotstxt($response);
return;
}
Expand All @@ -150,12 +185,8 @@ public function parse($url, $urlContent = null)
$this->parseString($response);
return;
}
if (isset($sitemapJson->sitemap)) {
$this->parseJson('sitemap', $sitemapJson->sitemap);
}
if (isset($sitemapJson->url)) {
$this->parseJson('url', $sitemapJson->url);
}
$this->parseJson(self::XML_TAG_SITEMAP, $sitemapJson);
$this->parseJson(self::XML_TAG_URL, $sitemapJson);
}

/**
Expand Down Expand Up @@ -196,17 +227,22 @@ protected function getContent()
* Search for sitemaps in the robots.txt content
*
* @param string $robotstxt
* @return void
* @return bool
*/
protected function parseRobotstxt($robotstxt)
{
preg_match_all('#Sitemap:*(.*)#', $robotstxt, $match);
if (isset($match[1])) {
foreach ($match[1] as $sitemap) {
$sitemap = trim($sitemap);
$this->addArray('sitemap', ['loc' => $sitemap]);
$array = array_map('trim', preg_split('/\R/', $robotstxt));
foreach ($array as $line) {
if (mb_stripos($line, self::ROBOTSTXT_PREFIX) === 0) {
$url = mb_substr($line, mb_strlen(self::ROBOTSTXT_PREFIX));
if (($pos = mb_stripos($url, '#')) !== false) {
$url = mb_substr($url, 0, $pos);
}
$url = preg_split('/\s+/', trim($url))[0];
$this->addArray('sitemap', ['loc' => $url]);
}
}
return true;
}

/**
Expand All @@ -220,10 +256,10 @@ protected function addArray($type, $array)
{
if (isset($array['loc']) && filter_var($array['loc'], FILTER_VALIDATE_URL) !== false) {
switch ($type) {
case 'sitemap':
case self::XML_TAG_SITEMAP:
$this->sitemaps[$array['loc']] = $array;
return true;
case 'url':
case self::XML_TAG_URL:
$this->urls[$array['loc']] = $array;
return true;
}
Expand All @@ -248,27 +284,24 @@ protected function generateXMLObject($xml)
}

/**
* Parse plain text
* Parse line separated text string
*
* @param string $string
* @return bool
*/
protected function parseString($string)
{
if (!isset($this->config['strict']) || $this->config['strict'] !== false) {
// Strings are not part of any sitemap standard
// Strings are not part of any documented sitemap standard
return false;
}
$offset = 0;
while (preg_match('/(\S+)/', $string, $match, PREG_OFFSET_CAPTURE, $offset)) {
$offset = $match[0][1] + strlen($match[0][0]);
if (filter_var($match[0][0], FILTER_VALIDATE_URL) !== false) {
if ($this->isSitemapURL($match[0][0])) {
$this->addArray('sitemap', ['loc' => $match[0][0]]);
continue;
}
$this->addArray('url', ['loc' => $match[0][0]]);
$array = array_map('trim', preg_split('/\R/', $string));
foreach ($array as $line) {
if ($this->isSitemapURL($line)) {
$this->addArray(self::XML_TAG_SITEMAP, ['loc' => $line]);
continue;
}
$this->addArray(self::XML_TAG_URL, ['loc' => $line]);
}
return true;
}
Expand All @@ -283,8 +316,8 @@ protected function isSitemapURL($url)
{
$path = parse_url($url, PHP_URL_PATH);
return filter_var($url, FILTER_VALIDATE_URL) !== false && (
substr($path, -4) === ".xml" ||
substr($path, -7) === '.xml.gz'
substr($path, -strlen(self::XML_EXTENSION)) === self::XML_EXTENSION ||
substr($path, -strlen(self::XML_EXTENSION_COMPRESSED)) === self::XML_EXTENSION_COMPRESSED
);
}

Expand All @@ -293,13 +326,17 @@ protected function isSitemapURL($url)
*
* @param string $type Sitemap or URL
* @param \SimpleXMLElement $json object
* @return void
* @return bool
*/
protected function parseJson($type, $json)
{
foreach ($json as $url) {
if (!isset($json->$type)) {
return false;
}
foreach ($json->$type as $url) {
$this->addArray($type, (array)$url);
}
return true;
}

/**
Expand Down
3 changes: 2 additions & 1 deletion tests/RobotsTxtTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ function generateDataForTest()
<<<ROBOTSTXT
User-agent: *
Disallow: /
Sitemap: http://www.example.com/sitemap.xml
#Sitemap:http://www.example.com/sitemap.xml.gz
Sitemap:http://www.example.com/sitemap.xml#comment
ROBOTSTXT
,
$result = [
Expand Down
6 changes: 4 additions & 2 deletions tests/StrictTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ function generateDataForTest()
'http://www.example.com/sitemap.txt',
<<<TEXT
http://www.example.com/sitemap1.xml
http://www.example.com/sitemap2.xml http://www.example.com/sitemap3.xml.gz
http://www.example.com/sitemap2.xml
http://www.example.com/sitemap3.xml.gz
http://www.example.com/page1/
http://www.example.com/page2/ http://www.example.com/page3/file.gz
http://www.example.com/page2/
http://www.example.com/page3/file.gz
TEXT
]
];
Expand Down
57 changes: 19 additions & 38 deletions tests/StringTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,28 @@ class StringTest extends \PHPUnit_Framework_TestCase
/**
* @dataProvider generateDataForTest
* @param string $url URL
* @param string $body URL body content
* @param array $result Test result to match
*/
public function testString($url, $body, $result)
public function testString($url)
{
$parser = new SitemapParser('SitemapParser', ['strict' => false]);
$this->assertInstanceOf('vipnytt\SitemapParser', $parser);
$parser->parse($url, $body);
$this->assertEquals($result['sitemaps'], $parser->getSitemaps());
$this->assertEquals($result['urls'], $parser->getURLs());
$parser->parse($url);
$this->assertTrue(is_array($parser->getSitemaps()));
$this->assertTrue(is_array($parser->getURLs()));
$this->assertTrue(count($parser->getSitemaps()) > 1);
$this->assertTrue(count($parser->getURLs()) >= 1000);
foreach ($parser->getSitemaps() as $url => $tags) {
$this->assertTrue(is_string($url));
$this->assertTrue(is_array($tags));
$this->assertTrue($url === $tags['loc']);
$this->assertNotFalse(filter_var($url, FILTER_VALIDATE_URL));
}
foreach ($parser->getURLs() as $url => $tags) {
$this->assertTrue(is_string($url));
$this->assertTrue(is_array($tags));
$this->assertTrue($url === $tags['loc']);
$this->assertNotFalse(filter_var($url, FILTER_VALIDATE_URL));
}
}

/**
Expand All @@ -29,38 +41,7 @@ function generateDataForTest()
{
return [
[
'http://www.example.com/sitemap.txt',
<<<TEXT
http://www.example.com/sitemap1.xml
http://www.example.com/sitemap2.xml http://www.example.com/sitemap3.xml.gz
http://www.example.com/page1/
http://www.example.com/page2/ http://www.example.com/page3/file.gz
TEXT
,
$result = [
'sitemaps' => [
'http://www.example.com/sitemap1.xml' => [
'loc' => 'http://www.example.com/sitemap1.xml',
],
'http://www.example.com/sitemap2.xml' => [
'loc' => 'http://www.example.com/sitemap2.xml',
],
'http://www.example.com/sitemap3.xml.gz' => [
'loc' => 'http://www.example.com/sitemap3.xml.gz',
],
],
'urls' => [
'http://www.example.com/page1/' => [
'loc' => 'http://www.example.com/page1/',
],
'http://www.example.com/page2/' => [
'loc' => 'http://www.example.com/page2/',
],
'http://www.example.com/page3/file.gz' => [
'loc' => 'http://www.example.com/page3/file.gz',
],
],
],
'https://www.xml-sitemaps.com/urllist.txt',
]
];
}
Expand Down

0 comments on commit 462d3d3

Please sign in to comment.