Skip to content

Commit

Permalink
Various improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
JanPetterMG committed Feb 7, 2016
1 parent fcbd57a commit 6b36354
Show file tree
Hide file tree
Showing 10 changed files with 309 additions and 170 deletions.
79 changes: 48 additions & 31 deletions src/XRobotsTagParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

namespace vipnytt;

use DateTime;
use vipnytt\robot\URLParser;
use vipnytt\robot\UserAgentParser;

Expand All @@ -35,6 +34,18 @@ class XRobotsTagParser
const DIRECTIVE_NO_TRANSLATE = 'notranslate';
const DIRECTIVE_UNAVAILABLE_AFTER = 'unavailable_after';

// TODO: Should be RFC-850, but disabled due to a rule parsing bug
const DATE_FORMAT_DEFAULT = 'd M Y H:i:s T';

private $supportedDateFormats = [
self::DATE_FORMAT_DEFAULT,
DATE_RFC1123,
DATE_RFC850,
'd M Y H:i:s T'
];

private $strict = false;

private $url = '';
private $userAgent = self::USERAGENT_DEFAULT;

Expand All @@ -51,18 +62,21 @@ class XRobotsTagParser
*
* @param string $url
* @param string $userAgent
* @param array $headers
* @param bool $strict
* @param array|null $headers
*/
public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $headers = [])
public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $strict = false, $headers = null)
{
$this->strict = $strict;

// Parse URL
$urlParser = new URLParser(trim($url));
if (!$urlParser->isValid()) {
trigger_error('Invalid URL', E_USER_WARNING);
}
$this->url = $urlParser->encode();
// Get headers
$this->setHeaders($headers);
$this->useHeaders($headers);
// Parse rules
$this->parse();
// Set User-Agent
Expand All @@ -73,20 +87,19 @@ public function __construct($url, $userAgent = self::USERAGENT_DEFAULT, $headers
/**
* Request HTTP headers
*
* @param array $customHeaders - use these headers
* @return void
* @param array|null|false $customHeaders - use these headers
* @return bool
*/
private function setHeaders($customHeaders = [])
private function useHeaders($customHeaders = null)
{
$this->headers = $customHeaders;
if (is_array($this->headers) && !empty($this->headers)) {
return;
}
$this->headers = get_headers($this->url);
if (is_array($this->headers) && !empty($this->headers)) {
if ($customHeaders === false) {
trigger_error('Unable to fetch HTTP headers', E_USER_ERROR);
return;
return false;
} elseif (!is_array($customHeaders) || empty($customHeaders)) {
return $this->useHeaders(get_headers($this->url));
}
$this->headers = $customHeaders;
return true;
}

/**
Expand Down Expand Up @@ -117,20 +130,14 @@ private function detectDirectives()
{
$rules = explode(',', $this->currentRule);
foreach ($rules as $rule) {
$part = explode(':', $rule, 3);
$part[0] = trim($part[0]);
$part[1] = isset($part[1]) ? trim($part[1]) : '';
$part[2] = isset($part[2]) ? trim($part[2]) : '';
if ($rules[0] === $rule && count($part) >= 2 && !in_array($part[0], $this->directiveArray())) {
$this->currentUserAgent = $part[0];
if (in_array($part[1], $this->directiveArray())) {
$this->currentDirective = $part[1];
$this->currentValue = $part[2];
$this->addRule();
}
} elseif (in_array($part[0], $this->directiveArray())) {
$this->currentDirective = $part[0];
$this->currentValue = $part[1];
$pair = array_map('trim', explode(':', $rule, 2));
if ($rules[0] === $rule && count($pair) == 2 && !in_array($pair[0], $this->directiveArray())) {
$this->currentUserAgent = $pair[0];
$pair = array_map('trim', explode(':', $pair[1], 2));
}
if (in_array($pair[0], $this->directiveArray())) {
$this->currentDirective = $pair[0];
$this->currentValue = isset($pair[1]) ? $pair[1] : null;
$this->addRule();
}
}
Expand Down Expand Up @@ -176,13 +183,23 @@ private function addRule()
$this->rules[$this->currentUserAgent][$this->currentDirective] = true;
break;
case self::DIRECTIVE_NONE:
$this->rules[$this->currentUserAgent][self::DIRECTIVE_NONE] = true;
if ($this->strict) break;
$this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_INDEX] = true;
$this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_FOLLOW] = true;
break;
case self::DIRECTIVE_UNAVAILABLE_AFTER:
$dateTime = new DateTime();
$dateTime->createFromFormat(DATE_RFC850, $this->currentValue);
$this->rules[$this->currentUserAgent][self::DIRECTIVE_UNAVAILABLE_AFTER] = $dateTime->getTimestamp();
if ($this->strict) $this->supportedDateFormats = [self::DATE_FORMAT_DEFAULT];
foreach (array_unique($this->supportedDateFormats) as $format) {
$dateTime = date_create_from_format($format, $this->currentValue);
if ($dateTime === false) continue;
$this->rules[$this->currentUserAgent][self::DIRECTIVE_UNAVAILABLE_AFTER] = $dateTime->format(self::DATE_FORMAT_DEFAULT);
if ($this->strict) break;
if (time() >= $dateTime->getTimestamp()) {
$this->rules[$this->currentUserAgent][self::DIRECTIVE_NO_INDEX] = true;
}
break;
}
break;
}
}
Expand Down
52 changes: 0 additions & 52 deletions test/cases/DirectiveNoneTest.php

This file was deleted.

71 changes: 0 additions & 71 deletions test/cases/MultiDirectivesTest.php

This file was deleted.

81 changes: 81 additions & 0 deletions test/cases/MultiTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
<?php

namespace vipnytt\XRobotsTagParser\tests;

use vipnytt\XRobotsTagParser;

class MultiTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Feeds a header set carrying several X-Robots-Tag directives to the
     * parser and asserts each boolean directive against getRules() and
     * against the '' and 'googlebot' entries of export().
     *
     * NOTE(review): assertContains() searches for the needle as an *element*
     * of the haystack. Confirm that this matches the array shape returned by
     * getRules()/export(); if those are flat `directive => value` maps, a
     * key-based assertion may be what is actually intended.
     *
     * @dataProvider generateDataForTest
     * @param string $url
     * @param string $bot
     * @param bool $strict
     * @param array|null $headers
     */
    public function testMultipleDirectives($url, $bot, $strict, $headers)
    {
        $parser = new XRobotsTagParser($url, $bot, $strict, $headers);
        $this->assertInstanceOf('vipnytt\XRobotsTagParser', $parser);

        // Directives are checked in this exact order, each against the same
        // three sources (getRules(), export()[''], export()['googlebot']).
        $directives = [
            'noindex',
            'nofollow',
            'noarchive',
            'nosnippet',
            'noodp',
            'notranslate',
            'noimageindex',
        ];

        foreach ($directives as $directive) {
            $expected = [$directive => true];

            $this->assertContains($expected, $parser->getRules());
            $this->assertContains($expected, $parser->export()['']);
            $this->assertContains($expected, $parser->export()['googlebot']);
        }
    }

    /**
     * Data provider for testMultipleDirectives().
     *
     * Supplies a single data set: URL, user-agent, strict flag and the raw
     * header lines handed to the parser.
     *
     * @return array
     */
    public function generateDataForTest()
    {
        // NOTE(review): in the last header there is no comma between
        // "notranslate" and "unavailable_after" - confirm this is intentional.
        $headers = [
            'HTTP/1.1 200 OK',
            'Date: Tue, 25 May 2010 21:42:43 GMT',
            'X-Robots-Tag: all',
            'X-Robots-Tag: noindex',
            'X-Robots-Tag: nofollow',
            'X-Robots-Tag: none',
            'X-Robots-Tag: noarchive',
            'X-Robots-Tag: nosnippet',
            'X-Robots-Tag: noodp',
            'X-Robots-Tag: notranslate',
            'X-Robots-Tag: noimageindex',
            'X-Robots-Tag: unavailable_after: 25 Jun 2010 15:00:00 PST',
            'X-Robots-Tag: googlebot: all, none, nofollow,nosnippet,notranslate unavailable_after: 25 Jun 2010 15:00:00 PST, noindex, noarchive, noodp,noimageindex'
        ];

        $dataSet = ['http://example.com/', 'googlebot', false, $headers];

        return [$dataSet];
    }
}
54 changes: 54 additions & 0 deletions test/cases/NoneTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<?php

namespace vipnytt\XRobotsTagParser\tests;

use vipnytt\XRobotsTagParser;

class NoneTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Directive: NONE
     *
     * Parses headers that only carry the `none` directive and asserts
     * `none`, `noindex` and `nofollow` against getRules() and against the
     * '' and 'googlebot' entries of export().
     *
     * NOTE(review): assertContains() searches for the needle as an *element*
     * of the haystack. Confirm that this matches the array shape returned by
     * getRules()/export(); if those are flat `directive => value` maps, a
     * key-based assertion may be what is actually intended.
     *
     * @dataProvider generateDataForTest
     * @param string $url
     * @param string $bot
     * @param bool $strict
     * @param array|null $headers
     */
    public function testNone($url, $bot, $strict, $headers)
    {
        $parser = new XRobotsTagParser($url, $bot, $strict, $headers);
        $this->assertInstanceOf('vipnytt\XRobotsTagParser', $parser);

        // The three expectations, asserted in this order against every source.
        $expectations = [
            ['none' => true],
            ['noindex' => true],
            ['nofollow' => true],
        ];

        // Rules as returned by getRules() for the constructed parser.
        foreach ($expectations as $expected) {
            $this->assertContains($expected, $parser->getRules());
        }

        // Exported rules: first the '' entry, then the 'googlebot' entry.
        foreach (['', 'googlebot'] as $agent) {
            foreach ($expectations as $expected) {
                $this->assertContains($expected, $parser->export()[$agent]);
            }
        }
    }

    /**
     * Data provider for testNone().
     *
     * Supplies a single data set: URL, user-agent, strict flag and the raw
     * header lines handed to the parser.
     *
     * @return array
     */
    public function generateDataForTest()
    {
        $headers = [
            'X-Robots-Tag: none',
            'X-Robots-Tag: googlebot: none'
        ];

        $dataSet = ['http://example.com/', 'googlebot', false, $headers];

        return [$dataSet];
    }
}
Loading

0 comments on commit 6b36354

Please sign in to comment.