diff --git a/src/XRobotsTagParser.php b/src/XRobotsTagParser.php index 137f7bd..d72fd7e 100644 --- a/src/XRobotsTagParser.php +++ b/src/XRobotsTagParser.php @@ -15,7 +15,6 @@ namespace vipnytt; -use DateTime; use vipnytt\robot\URLParser; use vipnytt\robot\UserAgentParser; @@ -35,6 +34,18 @@ class XRobotsTagParser const DIRECTIVE_NO_TRANSLATE = 'notranslate'; const DIRECTIVE_UNAVAILABLE_AFTER = 'unavailable_after'; + // TODO: Shuld be RFC-850, but disabled due to an rule parsing bug + const DATE_FORMAT_DEFAULT = 'd M Y H:i:s T'; + + private $supportedDateFormats = [ + self::DATE_FORMAT_DEFAULT, + DATE_RFC1123, + DATE_RFC850, + 'd M Y H:i:s T' + ]; + + private $strict = false; + private $url = ''; private $userAgent = self::USERAGENT_DEFAULT; @@ -51,10 +62,13 @@ class XRobotsTagParser * * @param string $url * @param string $userAgent - * @param array $headers + * @param bool $strict + * @param array|null $headers */ - public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $headers = []) + public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $strict = false, $headers = null) { + $this->strict = $strict; + // Parse URL $urlParser = new URLParser(trim($url)); if (!$urlParser->isValid()) { @@ -62,7 +76,7 @@ public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $headers } $this->url = $urlParser->encode(); // Get headers - $this->setHeaders($headers); + $this->useHeaders($headers); // Parse rules $this->parse(); // Set User-Agent @@ -73,20 +87,19 @@ public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $headers /** * Request HTTP headers * - * @param array $customHeaders - use these headers - * @return void + * @param array|null|false $customHeaders - use these headers + * @return bool */ - private function setHeaders($customHeaders = []) + private function useHeaders($customHeaders = null) { - $this->headers = $customHeaders; - if (is_array($this->headers) && !empty($this->headers)) { - return; - } - $this->headers = get_headers($this->url); - if (is_array($this->headers) && !empty($this->headers)) { + if ($customHeaders === false) { trigger_error('Unable to fetch HTTP headers', E_USER_ERROR); - return; + return false; + } elseif (!is_array($customHeaders) || empty($customHeaders)) { + return $this->useHeaders(get_headers($this->url)); } + $this->headers = $customHeaders; + return true; } /** @@ -117,20 +130,14 @@ private function detectDirectives() { $rules = explode(',', $this->currentRule); foreach ($rules as $rule) { - $part = explode(':', $rule, 3); - $part[0] = trim($part[0]); - $part[1] = isset($part[1]) ? trim($part[1]) : ''; - $part[2] = isset($part[2]) ? trim($part[2]) : ''; - if ($rules[0] === $rule && count($part) >= 2 && !in_array($part[0], $this->directiveArray())) { - $this->currentUserAgent = $part[0]; - if (in_array($part[1], $this->directiveArray())) { - $this->currentDirective = $part[1]; - $this->currentValue = $part[2]; - $this->addRule(); - } - } elseif (in_array($part[0], $this->directiveArray())) { - $this->currentDirective = $part[0]; - $this->currentValue = $part[1]; + $pair = array_map('trim', explode(':', $rule, 2)); + if ($rules[0] === $rule && count($pair) == 2 && !in_array($pair[0], $this->directiveArray())) { + $this->currentUserAgent = $pair[0]; + $pair = array_map('trim', explode(':', $pair[1], 2)); + } + if (in_array($pair[0], $this->directiveArray())) { + $this->currentDirective = $pair[0]; + $this->currentValue = isset($pair[1]) ? $pair[1] : null; $this->addRule(); } } @@ -176,13 +183,23 @@ private function addRule() $this->rules[$this->currentUserAgent][$this->currentDirective] = true; break; case self::DIRECTIVE_NONE: + $this->rules[$this->currentUserAgent][self::DIRECTIVE_NONE] = true; + if ($this->strict) break; $this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_INDEX] = true; $this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_FOLLOW] = true; break; case self::DIRECTIVE_UNAVAILABLE_AFTER: - $dateTime = new DateTime(); - $dateTime->createFromFormat(DATE_RFC850, $this->currentValue); - $this->rules[$this->currentUserAgent][self::DIRECTIVE_UNAVAILABLE_AFTER] = $dateTime->getTimestamp(); + if ($this->strict) $this->supportedDateFormats = [self::DATE_FORMAT_DEFAULT]; + foreach (array_unique($this->supportedDateFormats) as $format) { + $dateTime = date_create_from_format($format, $this->currentValue); + if ($dateTime === false) continue; + $this->rules[$this->currentUserAgent][self::DIRECTIVE_UNAVAILABLE_AFTER] = $dateTime->format(self::DATE_FORMAT_DEFAULT); + if ($this->strict) break; + if (time() >= $dateTime->getTimestamp()) { + $this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_INDEX] = true; + } + break; + } break; } } diff --git a/test/cases/DirectiveNoneTest.php b/test/cases/DirectiveNoneTest.php deleted file mode 100644 index aec7a25..0000000 --- a/test/cases/DirectiveNoneTest.php +++ /dev/null @@ -1,52 +0,0 @@ -assertInstanceOf('vipnytt\XRobotsTagParser', $parser); - - $this->assertContains('none', $parser->getRules()); - $this->assertContains('none', $parser->getRules()); - $this->assertContains('none', $parser->getRules()); - - $this->assertContains('noindex', $parser->export()['']); - $this->assertContains('noindex', $parser->export()['']); - $this->assertContains('noindex', $parser->export()['']); - - $this->assertContains('nofollow', $parser->export()['googlebot']); - $this->assertContains('nofollow', $parser->export()['googlebot']); - $this->assertContains('nofollow', $parser->export()['googlebot']); - } - - /** - * Generate test data - * @return array - */ - public function generateDataForTest() - { - return [ - [ - 'http://example.com/', - 'googlebot', - [ - 'X-Robots-Tag: none', - 'X-Robots-Tag: googlebot: none' - ] - ] - ]; - } -} diff --git a/test/cases/MultiDirectivesTest.php b/test/cases/MultiDirectivesTest.php deleted file mode 100644 index 2d7c867..0000000 --- a/test/cases/MultiDirectivesTest.php +++ /dev/null @@ -1,71 +0,0 @@ -assertInstanceOf('vipnytt\XRobotsTagParser', $parser); - - $this->assertContains('noindex', $parser->getRules()); - $this->assertContains('noindex', $parser->export()['']); - $this->assertContains('noindex', $parser->export()['googlebot']); - $this->assertContains('nofollow', $parser->getRules()); - $this->assertContains('nofollow', $parser->export()['']); - $this->assertContains('nofollow', $parser->export()['googlebot']); - $this->assertContains('noarchive', $parser->getRules()); - $this->assertContains('noarchive', $parser->export()['']); - $this->assertContains('noarchive', $parser->export()['googlebot']); - $this->assertContains('nosnippet', $parser->getRules()); - $this->assertContains('nosnippet', $parser->export()['']); - $this->assertContains('nosnippet', $parser->export()['googlebot']); - $this->assertContains('noodp', $parser->getRules()); - $this->assertContains('noodp', $parser->export()['']); - $this->assertContains('noodp', $parser->export()['googlebot']); - $this->assertContains('notranslate', $parser->getRules()); - $this->assertContains('notranslate', $parser->export()['']); - $this->assertContains('notranslate', $parser->export()['googlebot']); - $this->assertContains('noimageindex', $parser->getRules()); - $this->assertContains('noimageindex', $parser->export()['']); - $this->assertContains('noimageindex', $parser->export()['googlebot']); - } - - /** - * Generate test data - * @return array - */ - public function generateDataForTest() - { - return [ - [ - 'http://example.com/', - 'googlebot', - [ - 'X-Robots-Tag: all', - 'X-Robots-Tag: noindex', - 'X-Robots-Tag: nofollow', - 'X-Robots-Tag: none', - 'X-Robots-Tag: noarchive', - 'X-Robots-Tag: nosnippet', - 'X-Robots-Tag: noodp', - 'X-Robots-Tag: notranslate', - 'X-Robots-Tag: noimageindex', - 'X-Robots-Tag: unavailable_after: 25 Jun 2010 15:00:00 PST', - 'X-Robots-Tag: googlebot: all, none, nofollow,nosnippet,notranslate unavailable_after: 25 Jun 2010 15:00:00 PST, noindex, noarchive, noodp,noimageindex' - ] - ] - ]; - } -} diff --git a/test/cases/MultiTest.php b/test/cases/MultiTest.php new file mode 100644 index 0000000..5358045 --- /dev/null +++ b/test/cases/MultiTest.php @@ -0,0 +1,81 @@ +assertInstanceOf('vipnytt\XRobotsTagParser', $parser); + + $this->assertContains(['noindex' => true], $parser->getRules()); + $this->assertContains(['noindex' => true], $parser->export()['']); + $this->assertContains(['noindex' => true], $parser->export()['googlebot']); + + $this->assertContains(['nofollow' => true], $parser->getRules()); + $this->assertContains(['nofollow' => true], $parser->export()['']); + $this->assertContains(['nofollow' => true], $parser->export()['googlebot']); + + $this->assertContains(['noarchive' => true], $parser->getRules()); + $this->assertContains(['noarchive' => true], $parser->export()['']); + $this->assertContains(['noarchive' => true], $parser->export()['googlebot']); + + $this->assertContains(['nosnippet' => true], $parser->getRules()); + $this->assertContains(['nosnippet' => true], $parser->export()['']); + $this->assertContains(['nosnippet' => true], $parser->export()['googlebot']); + + $this->assertContains(['noodp' => true], $parser->getRules()); + $this->assertContains(['noodp' => true], $parser->export()['']); + $this->assertContains(['noodp' => true], $parser->export()['googlebot']); + + $this->assertContains(['notranslate' => true], $parser->getRules()); + $this->assertContains(['notranslate' => true], $parser->export()['']); + $this->assertContains(['notranslate' => true], $parser->export()['googlebot']); + + $this->assertContains(['noimageindex' => true], $parser->getRules()); + $this->assertContains(['noimageindex' => true], $parser->export()['']); + $this->assertContains(['noimageindex' => true], $parser->export()['googlebot']); + } + + /** + * Generate test data + * @return array + */ + public function generateDataForTest() + { + return [ + [ + 'http://example.com/', + 'googlebot', + false, + [ + 'HTTP/1.1 200 OK', + 'Date: Tue, 25 May 2010 21:42:43 GMT', + 'X-Robots-Tag: all', + 'X-Robots-Tag: noindex', + 'X-Robots-Tag: nofollow', + 'X-Robots-Tag: none', + 'X-Robots-Tag: noarchive', + 'X-Robots-Tag: nosnippet', + 'X-Robots-Tag: noodp', + 'X-Robots-Tag: notranslate', + 'X-Robots-Tag: noimageindex', + 'X-Robots-Tag: unavailable_after: 25 Jun 2010 15:00:00 PST', + 'X-Robots-Tag: googlebot: all, none, nofollow,nosnippet,notranslate unavailable_after: 25 Jun 2010 15:00:00 PST, noindex, noarchive, noodp,noimageindex' + ] + ] + ]; + } +} diff --git a/test/cases/NoneTest.php b/test/cases/NoneTest.php new file mode 100644 index 0000000..224869b --- /dev/null +++ b/test/cases/NoneTest.php @@ -0,0 +1,54 @@ +assertInstanceOf('vipnytt\XRobotsTagParser', $parser); + + $this->assertContains(['none' => true], $parser->getRules()); + $this->assertContains(['noindex' => true], $parser->getRules()); + $this->assertContains(['nofollow' => true], $parser->getRules()); + + $this->assertContains(['none' => true], $parser->export()['']); + $this->assertContains(['noindex' => true], $parser->export()['']); + $this->assertContains(['nofollow' => true], $parser->export()['']); + + $this->assertContains(['none' => true], $parser->export()['googlebot']); + $this->assertContains(['noindex' => true], $parser->export()['googlebot']); + $this->assertContains(['nofollow' => true], $parser->export()['googlebot']); + } + + /** + * Generate test data + * @return array + */ + public function generateDataForTest() + { + return [ + [ + 'http://example.com/', + 'googlebot', + false, + [ + 'X-Robots-Tag: none', + 'X-Robots-Tag: googlebot: none' + ] + ] + ]; + } +} diff --git a/test/cases/UnavailableAfterStrictTest.php b/test/cases/UnavailableAfterStrictTest.php new file mode 100644 index 0000000..63e5048 --- /dev/null +++ b/test/cases/UnavailableAfterStrictTest.php @@ -0,0 +1,50 @@ +assertInstanceOf('vipnytt\XRobotsTagParser', $parser); + + // TODO: Disabled due to an RFC-850 parsing bug + //$this->assertEquals(['unavailable_after' => '01 Jul 2000 07:00:00 PST'], $parser->getRules()); + //$this->assertEquals(['unavailable_after' => '31 Dec 2050 23:00:00 PST'], $parser->export()['']); + //$this->assertEquals(['unavailable_after' => '01 Jul 2000 07:00:00 PST'], $parser->export()['googlebot']); + //$this->assertArrayNotHasKey('unavailable_after', $parser->export()['bingbot']); + } + + /** + * Generate test data + * @return array + */ + public function generateDataForTest() + { + return [ + [ + 'http://example.com/', + 'googlebot', + true, + [ + 'X-Robots-Tag: unavailable_after: Saturday, 31-Dec-50 23:00:00 PST', + 'X-Robots-Tag: googlebot: unavailable_after: Saturday, 01-Jul-00 07:00:00 PST', + 'X-Robots-Tag: bingbot: unavailable_after: 31 Dec 2050 23:00:00 PST' + ] + ] + ]; + } +} diff --git a/test/cases/UnavailableAfterTest.php b/test/cases/UnavailableAfterTest.php new file mode 100644 index 0000000..b4213cc --- /dev/null +++ b/test/cases/UnavailableAfterTest.php @@ -0,0 +1,56 @@ +assertInstanceOf('vipnytt\XRobotsTagParser', $parser); + + $this->assertEquals(['unavailable_after' => '01 Jul 2000 07:00:00 PST', 'noindex' => true], $parser->getRules()); + $this->assertEquals(['unavailable_after' => '31 Dec 2050 23:00:00 PST'], $parser->export()['']); + $this->assertEquals(['unavailable_after' => '01 Jul 2000 07:00:00 PST', 'noindex' => true], $parser->export()['googlebot']); + } + + /** + * Generate test data + * @return array + */ + public function generateDataForTest() + { + return [ + /*[ + 'http://example.com/', + 'googlebot', + false, + [ + 'X-Robots-Tag: unavailable_after: Saturday, 31-Dec-50 23:00:00 PST', + 'X-Robots-Tag: googlebot: unavailable_after: Saturday, 01-Jul-00 07:00:00 PST' + ] + ],*/ + [ + 'http://example.com/', + 'googlebot', + false, + [ + 'X-Robots-Tag: unavailable_after: 31 Dec 2050 23:00:00 PST', + 'X-Robots-Tag: googlebot: unavailable_after: 01 Jul 2000 07:00:00 PST' + ] + ] + ]; + } +} diff --git a/test/cases/UserAgentTest.php b/test/cases/UserAgentTest.php index 457c57b..f035034 100644 --- a/test/cases/UserAgentTest.php +++ b/test/cases/UserAgentTest.php @@ -4,7 +4,7 @@ use vipnytt\robot\UserAgentParser; -class UserAgentTests extends \PHPUnit_Framework_TestCase +class UserAgentTest extends \PHPUnit_Framework_TestCase { /** * Character case diff --git a/test/cases/exportTest.php b/test/cases/exportTest.php index d33fc15..db95e6d 100644 --- a/test/cases/exportTest.php +++ b/test/cases/exportTest.php @@ -12,21 +12,22 @@ class exportTest extends \PHPUnit_Framework_TestCase * @dataProvider generateDataForTest * @param string $url * @param string $bot - * @param string $headers + * @param bool $strict + * @param array|null $headers */ - public function testExport($url, $bot, $headers) + public function testExport($url, $bot, $strict, $headers) { - $parser = new XRobotsTagParser($url, $bot, $headers); + $parser = new XRobotsTagParser($url, $bot, $strict, $headers); $this->assertInstanceOf('vipnytt\XRobotsTagParser', $parser); - $this->assertContains('noindex', $parser->export()['googlebot']); - $this->assertContains('noarchive', $parser->export()['googlebot']); + $this->assertContains(['noindex' => true], $parser->export()['googlebot']); + $this->assertContains(['noarchive' => true], $parser->export()['googlebot']); - $this->assertContains('noindex', $parser->export()['bingbot']); - $this->assertContains('noodp', $parser->export()['bingbot']); + $this->assertContains(['noindex' => true], $parser->export()['bingbot']); + $this->assertContains(['noodp' => true], $parser->export()['bingbot']); - $this->assertContains('noindex', $parser->export()['']); - $this->assertContains('noodp', $parser->export()['']); + $this->assertContains(['noindex' => true], $parser->export()['']); + $this->assertContains(['noodp' => true], $parser->export()['']); } /** @@ -39,6 +40,7 @@ public function generateDataForTest() [ 'http://example.com/', 'googlebot', + false, [ 'X-Robots-Tag: googlebot: noindex, noarchive', 'X-Robots-Tag: bingbot: noindex, noodp', diff --git a/test/cases/getRulesTest.php b/test/cases/getRulesTest.php index f693b45..ab4fe3a 100644 --- a/test/cases/getRulesTest.php +++ b/test/cases/getRulesTest.php @@ -12,16 +12,17 @@ class getRulesTest extends \PHPUnit_Framework_TestCase * @dataProvider generateDataForTest * @param string $url * @param string $bot - * @param string $headers + * @param bool $strict + * @param array|null $headers */ - public function testGetRules($url, $bot, $headers) + public function testGetRules($url, $bot, $strict, $headers) { - $parser = new XRobotsTagParser($url, $bot, $headers); + $parser = new XRobotsTagParser($url, $bot, $strict, $headers); $this->assertInstanceOf('vipnytt\XRobotsTagParser', $parser); - $this->assertContains('noindex', $parser->getRules()); - $this->assertContains('noarchive', $parser->getRules()); - $this->assertContains('noodp', $parser->getRules()); + $this->assertContains(['noindex' => true], $parser->getRules()); + $this->assertContains(['noarchive' => true], $parser->getRules()); + $this->assertContains(['noodp' => true], $parser->getRules()); } /** @@ -34,6 +35,7 @@ public function generateDataForTest() [ 'http://example.com/', 'googlebot', + false, [ 'X-Robots-Tag: googlebot: noindex, noarchive', 'X-Robots-Tag: bingbot: noindex, noodp',