Skip to content

Commit

Permalink
Add support for local files with the file:// scheme. (#22)
Browse files Browse the repository at this point in the history
Accepting a local file URL (rather than passing the content of a file to the parse method) lets one keep using the recursive or queue-based parsing approach and simply replace the URL with a local file. This is easier during development and testing.
I also added a test that creates a local file in the system temp directory.

Co-authored-by: Grzegorz Drozd <[email protected]>
  • Loading branch information
GrzegorzDrozd and Grzegorz Drozd authored Nov 8, 2023
1 parent 4d144ba commit 151bcee
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 1 deletion.
10 changes: 10 additions & 0 deletions src/SitemapParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,16 @@ protected function getContent()
throw new Exceptions\SitemapParserException('Invalid URL');
}
try {
if (strpos($this->currentURL, 'file://') === 0) {
$path = parse_url($this->currentURL, PHP_URL_PATH);
if (!$this->urlValidatePath($path)) {
throw new Exceptions\SitemapParserException('Invalid file path');
}
if (!file_exists($path) && PHP_OS === 'WINNT') {
return file_get_contents(urldecode($path));
}
return file_get_contents($path);
}
if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
$this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
}
Expand Down
22 changes: 21 additions & 1 deletion src/SitemapParser/UrlParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,12 @@ protected function urlValidate($url)
return (
filter_var($url, FILTER_VALIDATE_URL) &&
($parsed = parse_url($url)) !== false &&
$this->urlValidateHost($parsed['host']) &&
$this->urlValidateScheme($parsed['scheme']) &&
(
(in_array($parsed['scheme'], ['http', 'https'], true) && $this->urlValidateHost($parsed['host']))
||
(in_array($parsed['scheme'], ['file'], true) && $this->urlValidatePath($parsed['path']))
) &&
$this->urlValidateAgainstBlackList($url)
);
}
Expand Down Expand Up @@ -88,10 +92,26 @@ protected static function urlValidateScheme($scheme)
return in_array($scheme, [
'http',
'https',
'file'
]
);
}

/**
 * Check if local file exists at given path.
 *
 * Values that are not non-empty strings are rejected up front instead of
 * being handed to file_exists(): the path may come straight from
 * parse_url(), which does not always produce a string, and file_exists()
 * raises a deprecation/TypeError for such input rather than returning false.
 *
 * The parameter is intentionally left without a native type declaration so
 * the method keeps accepting any value and stays loadable on PHP versions
 * that predate the `mixed` type.
 *
 * @param mixed $path Filesystem path, e.g. the path component of a file:// URL.
 * @return bool True when something exists at the path, false otherwise.
 */
public function urlValidatePath($path)
{
    if (!is_string($path) || $path === '') {
        return false;
    }
    if (file_exists($path)) {
        return true;
    }
    // try to reverse url encoding for windows paths:
    return PHP_OS === 'WINNT' && file_exists(urldecode($path));
}

protected function urlValidateAgainstBlackList($url)
{
if (empty($this->config['url_black_list'])) {
Expand Down
36 changes: 36 additions & 0 deletions tests/LocalFileTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

namespace vipnytt\SitemapParser\Tests;

use PHPUnit\Framework\TestCase;
use vipnytt\SitemapParser;

/**
 * Covers parsing of sitemaps referenced through a file:// URL.
 *
 * The class is named after its file (LocalFileTest.php) so that PSR-4
 * autoloading and PHPUnit test discovery can resolve it, and so it does not
 * reuse the name of the recursive-parsing test class.
 */
class LocalFileTest extends TestCase
{
    /**
     * Writes a sitemap index to a temporary file, parses it via a file:// URL
     * and expects the single <sitemap> entry to be reported by getSitemaps().
     */
    public function testLocalFileXMLFile()
    {
        $parser = new SitemapParser('SitemapParser');
        $this->assertInstanceOf('vipnytt\SitemapParser', $parser);

        $tmpfname = tempnam(sys_get_temp_dir(), "sitemap_parser_test_file");
        // tempnam() returns false on failure; fail early with a clear message.
        $this->assertNotFalse($tmpfname, 'Unable to create a temporary file');

        $fileContent = <<<XMLSITEMAP
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>http://www.example.com/sitemap.xml</loc>
<lastmod>2004-10-01T18:23:17+00:00</lastmod>
</sitemap>
</sitemapindex>
XMLSITEMAP;

        try {
            $this->assertNotFalse(
                file_put_contents($tmpfname, $fileContent),
                'Unable to write the temporary sitemap file'
            );

            $parser->parse('file:///'.$tmpfname);

            $this->assertEquals([
                'http://www.example.com/sitemap.xml' => [
                    'loc' => 'http://www.example.com/sitemap.xml',
                    'lastmod' => '2004-10-01T18:23:17+00:00',
                    'namespaces' => [],
                ],
            ], $parser->getSitemaps());
        } finally {
            // Always remove the fixture, even when parsing or an assertion
            // fails, so repeated runs do not litter the temp directory.
            if (is_file($tmpfname)) {
                unlink($tmpfname);
            }
        }
    }
}

0 comments on commit 151bcee

Please sign in to comment.