From 151bceefa12d7f60d722a9c094bee119e80572e8 Mon Sep 17 00:00:00 2001 From: Grzegorz Drozd <1885137+GrzegorzDrozd@users.noreply.github.com> Date: Wed, 8 Nov 2023 19:43:58 +0100 Subject: [PATCH] Add support for local files with file:// schema. (#22) Using a local file rather than the content of a file (for the parse method) allows one to use a recursive parsing or queue parsing approach and just replace the URL with a local file. This makes development and testing easier. I also added a test that creates a local file in the temp directory. Co-authored-by: Grzegorz Drozd --- src/SitemapParser.php | 10 +++++++++ src/SitemapParser/UrlParser.php | 22 +++++++++++++++++++- tests/LocalFileTest.php | 36 +++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 tests/LocalFileTest.php diff --git a/src/SitemapParser.php b/src/SitemapParser.php index bafbf32..f755e1c 100644 --- a/src/SitemapParser.php +++ b/src/SitemapParser.php @@ -224,6 +224,16 @@ protected function getContent() throw new Exceptions\SitemapParserException('Invalid URL'); } try { + if (strpos($this->currentURL, 'file://') === 0) { + $path = parse_url($this->currentURL, PHP_URL_PATH); + if (!$this->urlValidatePath($path)) { + throw new Exceptions\SitemapParserException('Invalid file path'); + } + if (!file_exists($path) && PHP_OS === 'WINNT') { + return file_get_contents(urldecode($path)); + } + return file_get_contents($path); + } if (!isset($this->config['guzzle']['headers']['User-Agent'])) { $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent; } diff --git a/src/SitemapParser/UrlParser.php b/src/SitemapParser/UrlParser.php index 62fb24e..6647a5b 100644 --- a/src/SitemapParser/UrlParser.php +++ b/src/SitemapParser/UrlParser.php @@ -54,8 +54,12 @@ protected function urlValidate($url) return ( filter_var($url, FILTER_VALIDATE_URL) && ($parsed = parse_url($url)) !== false && - $this->urlValidateHost($parsed['host']) && $this->urlValidateScheme($parsed['scheme']) && + ( + (in_array($parsed['scheme'], 
['http', 'https'], true) && $this->urlValidateHost($parsed['host'])) + || + (in_array($parsed['scheme'], ['file'], true) && $this->urlValidatePath($parsed['path'])) + ) && $this->urlValidateAgainstBlackList($url) ); } @@ -88,10 +92,26 @@ protected static function urlValidateScheme($scheme) return in_array($scheme, [ 'http', 'https', + 'file' ] ); } + /** + * Check whether a local file exists at the given path; on Windows (PHP_OS === 'WINNT') a URL-decoded copy of the path is tried as a fallback. + * + * @param mixed $path Candidate filesystem path; left undeclared in the signature (the `mixed` declaration needs PHP 8.0+), and any non-string value is rejected. + * @return bool True when the file exists, false otherwise (including for non-string input). + */ + public function urlValidatePath($path) { + $result = is_string($path) && file_exists($path); + if ($result === false && is_string($path) && PHP_OS === 'WINNT') { + // try to reverse url encoding for windows paths: + return file_exists(urldecode($path)); + } + return $result; + } + protected function urlValidateAgainstBlackList($url) { if (empty($this->config['url_black_list'])) { diff --git a/tests/LocalFileTest.php b/tests/LocalFileTest.php new file mode 100644 index 0000000..e06eddb --- /dev/null +++ b/tests/LocalFileTest.php @@ -0,0 +1,36 @@ +assertInstanceOf('vipnytt\SitemapParser', $parser); + + $tmpfname = tempnam(sys_get_temp_dir(), "sitemap_parser_test_file"); + $fileContent = << + + + http://www.example.com/sitemap.xml + 2004-10-01T18:23:17+00:00 + + +XMLSITEMAP; + file_put_contents($tmpfname, $fileContent); + $parser->parse('file:///'.$tmpfname); + $this->assertEquals([ + 'http://www.example.com/sitemap.xml' => [ + 'loc' => 'http://www.example.com/sitemap.xml', + 'lastmod' => '2004-10-01T18:23:17+00:00', + 'namespaces' => [], + ], + ], $parser->getSitemaps()); + } + +}