diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 11b2e93..3b8bd01 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,7 +44,7 @@ jobs: poetry install --with dev - name: Run tests run: | - poetry run pytest + poetry run pytest --cov=./ publish: needs: test diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_crawler.py b/tests/test_crawler.py index e2d84fa..4310cd0 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -2,8 +2,10 @@ import responses -from tiny_web_crawler.crawler import Spider +import requests +from tiny_web_crawler.crawler import Spider, DEFAULT_SCHEME +from tests.utils import setup_mock_response def test_is_valid_url() -> None: assert Spider.is_valid_url("http://example.com") is True @@ -12,22 +14,41 @@ def test_is_valid_url() -> None: def test_format_url() -> None: spider = Spider("http://example.com", 10) - assert spider.format_url( - "/test", "http://example.com") == "http://example.com/test" + + assert ( + spider.format_url("/test", "http://example.com") + == "http://example.com/test" + ) + assert ( spider.format_url("http://example.com/test", "http://example.com") == "http://example.com/test" ) + assert ( + spider.format_url('path1/path2', 'http://example.com') + == 'http://example.com/path1/path2' + ) + + assert ( + spider.format_url('/path1/path2', 'http://example.com') + == 'http://example.com/path1/path2' + ) + + assert ( + spider.format_url('path.com', 'http://example.com') + == DEFAULT_SCHEME + 'path.com' + ) + @responses.activate def test_fetch_url() -> None: - responses.add( - responses.GET, - "http://example.com", + setup_mock_response( + url="http://example.com", body="
link", - status=200, + status=200 ) + spider = Spider(root_url="http://example.com", max_links=2) resp = spider.fetch_url("http://example.com") @@ -35,15 +56,85 @@ def test_fetch_url() -> None: assert resp.text == "link" +@responses.activate +def test_fetch_url_connection_error(capsys) -> None: # type: ignore + spider = Spider("http://connection.error") + + # Fetch url whose response isn't mocked to raise ConnectionError + resp = spider.fetch_url("http://connection.error") + + captured = capsys.readouterr() + assert "Connection error occurred:" in captured.out + assert resp is None + + +@responses.activate +def test_fetch_url_http_error(capsys) -> None: # type: ignore + error_codes = [403, 404, 408] + + spider = Spider("http://http.error") + + for error_code in error_codes: + setup_mock_response( + url=f"http://http.error/{error_code}", + body="link", + status=error_code + ) + resp = spider.fetch_url(f"http://http.error/{error_code}") + + captured = capsys.readouterr() + + assert "HTTP error occurred:" in captured.out + assert resp is None + + +@responses.activate +def test_fetch_url_timeout_error(capsys) -> None: # type: ignore + setup_mock_response( + url="http://timeout.error", + body=requests.exceptions.Timeout(), + status=408 + ) + + spider = Spider("http://timeout.error") + + # Fetch url whose response isn't mocked to raise ConnectionError + resp = spider.fetch_url("http://timeout.error") + + captured = capsys.readouterr() + assert "Timeout error occurred:" in captured.out + assert resp is None + + +@responses.activate +def test_fetch_url_requests_exception(capsys) -> None: # type: ignore + setup_mock_response( + url="http://requests.exception", + body=requests.exceptions.RequestException(), + status=404 + ) + + spider = Spider("http://requests.exception") + + # Fetch url whose response isn't mocked to raise ConnectionError + resp = spider.fetch_url("http://requests.exception") + + captured = capsys.readouterr() + assert "Request error occurred:" in captured.out + assert resp is None + + @responses.activate def test_crawl() -> None: - # Mock HTTP response - responses.add( - responses.GET, - "http://example.com", + setup_mock_response( + url="http://example.com", body="link", - status=200, - content_type="text/html", + status=200 + ) + setup_mock_response( + url="http://example.com/test", + body="link", + status=200 ) spider = Spider("http://example.com", 10) @@ -54,6 +145,113 @@ def test_crawl() -> None: "http://example.com/test" ] + spider.crawl("http://example.com/test") + + assert "http://example.com/test" in spider.crawl_result + assert spider.crawl_result["http://example.com/test"]["urls"] == [ + "http://example.com" + ] + + +@responses.activate +def test_crawl_invalid_url(capsys) -> None: # type: ignore + spider = Spider("http://example.com") + + spider.crawl("invalid_url") + + captured = capsys.readouterr() + assert "Invalid url to crawl:" in captured.out + assert spider.crawl_result == {} + + +@responses.activate +def test_crawl_already_crawled_url(capsys) -> None: # type: ignore + setup_mock_response( + url="http://example.com", + body="link", + status=200 + ) + + spider = Spider("http://example.com") + + spider.crawl("http://example.com") + spider.crawl("http://example.com") + + captured = capsys.readouterr() + assert "URL already crawled:" in captured.out + assert spider.crawl_result == {'http://example.com': + {'urls': ['http://example.com'] + } + } + + +@responses.activate +def test_crawl_unfetchable_url() -> None: + setup_mock_response( + url="http://example.com", + body="link", + status=404 + ) + + spider = Spider("http://example.com") + + spider.crawl("http://example.com") + assert spider.crawl_result == {} + + +@responses.activate +def test_crawl_found_invalid_url(capsys) -> None: # type: ignore + setup_mock_response( + url="http://example.com", + body="link", + status=200 + ) + + spider = Spider("http://example.com") + spider.crawl("http://example.com") + + captured = capsys.readouterr() + assert "Invalid url:" in captured.out + assert spider.crawl_result == {'http://example.com': + {'urls': [] + } + } + + +@responses.activate +def test_crawl_found_duplicate_url() -> None: + setup_mock_response( + url="http://example.com", + body="link1" + +"link2", + status=200 + ) + + spider = Spider("http://example.com") + spider.crawl("http://example.com") + + assert spider.crawl_result == {'http://example.com': + {'urls': ['http://duplicate.com'] + } + } + + +@responses.activate +def test_crawl_no_urls_in_page() -> None: + setup_mock_response( + url="http://example.com", + body="", + status=200 + ) + + spider = Spider("http/example.com") + spider.crawl("http://example.com") + + assert spider.crawl_result == {'http://example.com': + {'urls': [] + } + } + @responses.activate def test_save_results() -> None: @@ -68,13 +266,11 @@ def test_save_results() -> None: @responses.activate def test_url_regex() -> None: - # Mock HTTP response - responses.add( - responses.GET, - "http://example.com", - body="linklink", - status=200, - content_type="text/html", + setup_mock_response( + url="http://example.com", + body="link" + +"link", + status=200 ) # This regex matches strings starting with "http://example.com/" @@ -91,20 +287,15 @@ def test_url_regex() -> None: @responses.activate def test_include_body() -> None: - # Mock HTTP response - responses.add( - responses.GET, - "http://example.com", + setup_mock_response( + url="http://example.com", body="link", - status=200, - content_type="text/html", + status=200 ) - responses.add( - responses.GET, - "http://example.com/test", + setup_mock_response( + url="http://example.com/test", body="