-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Increased test coverage #29
Changes from 6 commits
b75e54f
bcb1e19
dfb05ff
922d1fe
762dae5
5f58ba8
2561134
b809542
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,8 +2,9 @@ | |
|
||
import responses | ||
|
||
from tiny_web_crawler.crawler import Spider | ||
import requests | ||
|
||
from tiny_web_crawler.crawler import Spider, DEFAULT_SCHEME | ||
|
||
def test_is_valid_url() -> None: | ||
assert Spider.is_valid_url("http://example.com") is True | ||
|
@@ -12,13 +13,32 @@ def test_is_valid_url() -> None: | |
|
||
def test_format_url() -> None: | ||
spider = Spider("http://example.com", 10) | ||
assert spider.format_url( | ||
"/test", "http://example.com") == "http://example.com/test" | ||
|
||
assert ( | ||
spider.format_url("/test", "http://example.com") | ||
== "http://example.com/test" | ||
) | ||
|
||
assert ( | ||
spider.format_url("http://example.com/test", "http://example.com") | ||
== "http://example.com/test" | ||
) | ||
|
||
assert ( | ||
spider.format_url('path1/path2', 'http://example.com') | ||
== 'http://example.com/path1/path2' | ||
) | ||
|
||
assert ( | ||
spider.format_url('/path1/path2', 'http://example.com') | ||
== 'http://example.com/path1/path2' | ||
) | ||
|
||
assert ( | ||
spider.format_url('path.com', 'http://example.com') | ||
== DEFAULT_SCHEME + 'path.com' | ||
) | ||
|
||
|
||
@responses.activate | ||
def test_fetch_url() -> None: | ||
|
@@ -35,6 +55,81 @@ def test_fetch_url() -> None: | |
assert resp.text == "link" | ||
|
||
|
||
@responses.activate | ||
def test_fetch_url_connection_error(capsys) -> None: # type: ignore | ||
spider = Spider("http://connection.error") | ||
|
||
# Fetch url whose response isn't mocked to raise ConnectionError | ||
resp = spider.fetch_url("http://connection.error") | ||
|
||
captured = capsys.readouterr() | ||
assert "Connection error occurred:" in captured.out | ||
assert resp is None | ||
|
||
|
||
@responses.activate | ||
def test_fetch_url_http_error(capsys) -> None: # type: ignore | ||
error_codes = [403, 404, 408] | ||
|
||
def setup_mock_response(error_code:int) -> None: | ||
responses.add( | ||
responses.GET, | ||
f"http://http.error/{error_code}", | ||
body="<html><body><a href='http://http.error'>link</a></body></html>", | ||
status=error_code | ||
) | ||
|
||
|
||
spider = Spider("http://http.error") | ||
|
||
for error_code in error_codes: | ||
setup_mock_response(error_code) | ||
resp = spider.fetch_url(f"http://http.error/{error_code}") | ||
|
||
captured = capsys.readouterr() | ||
|
||
assert "HTTP error occurred:" in captured.out | ||
assert resp is None | ||
|
||
|
||
@responses.activate | ||
def test_fetch_url_timeout_error(capsys) -> None: # type: ignore | ||
responses.add( | ||
responses.GET, | ||
"http://timeout.error", | ||
body=requests.exceptions.Timeout(), | ||
status=408 | ||
) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. responses.add(
responses.GET,
"http://timeout.error",
body=requests.exceptions.Timeout(),
status=408
) This is getting repeated in every test case. We can move this into a reusable function. And maybe move it to something like this @contextmanager
def capture_output(capsys: pytest.CaptureFixture) -> Generator[None, None, str]:
yield
captured = capsys.readouterr()
return captured.out
def setup_mock_response(url: str, status: int, body: str = "<html><body><a href='http://http.error'>link</a></body></html>") -> None:
responses.add(
responses.GET,
url,
body=body,
status=status
) and we can call the tests like this @responses.activate
def test_fetch_url_handles_404_error(capsys: pytest.CaptureFixture) -> None:
setup_mock_response("http://http.error/404", 404)
spider = Spider("http://http.error")
with capture_output(capsys) as output:
resp404 = spider.fetch_url("http://http.error/404")
assert And repeat this pattern for testing other exceptions. Or even better to use parametrize fixtures @pytest.mark.parametrize("url, status", [
("http://http.error/404", 404),
("http://http.error/408", 408),
("http://http.error/403", 403),
])
def test_fetch_url_http_errors(mock_responses, capsys_output, url, status): LMK your thoughts. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh I get it I didn't understand it was meant to be for every test, give me a second and I'll implement it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is very simple. Basically instead of calling There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like capsys doesn't capture the stdout from inside the context manager. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. LMK once you fix other things and we can merge this. |
||
spider = Spider("http://timeout.error") | ||
|
||
    # Fetch url whose mocked response raises a Timeout | ||
resp = spider.fetch_url("http://timeout.error") | ||
|
||
captured = capsys.readouterr() | ||
assert "Timeout error occurred:" in captured.out | ||
assert resp is None | ||
|
||
|
||
@responses.activate | ||
def test_fetch_url_requests_exception(capsys) -> None: # type: ignore | ||
responses.add( | ||
responses.GET, | ||
"http://requests.exception", | ||
body=requests.exceptions.RequestException(), | ||
status=404 | ||
) | ||
|
||
spider = Spider("http://requests.exception") | ||
|
||
    # Fetch url whose mocked response raises a RequestException | ||
resp = spider.fetch_url("http://requests.exception") | ||
|
||
captured = capsys.readouterr() | ||
assert "Request error occurred:" in captured.out | ||
assert resp is None | ||
|
||
|
||
@responses.activate | ||
def test_crawl() -> None: | ||
# Mock HTTP response | ||
|
@@ -45,6 +140,13 @@ def test_crawl() -> None: | |
status=200, | ||
content_type="text/html", | ||
) | ||
responses.add( | ||
responses.GET, | ||
"http://example.com/test", | ||
body="<html><body><a href='http://example.com'>link</a></body></html>", | ||
status=200, | ||
content_type="text/html", | ||
) | ||
|
||
spider = Spider("http://example.com", 10) | ||
spider.crawl("http://example.com") | ||
|
@@ -54,6 +156,123 @@ def test_crawl() -> None: | |
"http://example.com/test" | ||
] | ||
|
||
spider.crawl("http://example.com/test") | ||
|
||
assert "http://example.com/test" in spider.crawl_result | ||
assert spider.crawl_result["http://example.com/test"]["urls"] == [ | ||
"http://example.com" | ||
] | ||
|
||
|
||
@responses.activate | ||
def test_crawl_invalid_url(capsys) -> None: # type: ignore | ||
spider = Spider("http://example.com") | ||
|
||
spider.crawl("invalid_url") | ||
|
||
captured = capsys.readouterr() | ||
assert "Invalid url to crawl:" in captured.out | ||
assert spider.crawl_result == {} | ||
|
||
|
||
@responses.activate | ||
def test_crawl_already_crawled_url(capsys) -> None: # type: ignore | ||
responses.add( | ||
responses.GET, | ||
"http://example.com", | ||
body="<html><body><a href='http://example.com'>link</a></body></html>", | ||
status=200, | ||
content_type="text/html", | ||
) | ||
|
||
spider = Spider("http://example.com") | ||
|
||
spider.crawl("http://example.com") | ||
spider.crawl("http://example.com") | ||
|
||
captured = capsys.readouterr() | ||
assert "URL already crawled:" in captured.out | ||
assert spider.crawl_result == {'http://example.com': | ||
{'urls': ['http://example.com'] | ||
} | ||
} | ||
|
||
|
||
@responses.activate | ||
def test_crawl_unfetchable_url() -> None: | ||
responses.add( | ||
responses.GET, | ||
"http://example.com", | ||
body="<html><body><a href='http://example.com'>link</a></body></html>", | ||
status=404, | ||
content_type="text/html", | ||
) | ||
|
||
spider = Spider("http://example.com") | ||
|
||
spider.crawl("http://example.com") | ||
assert spider.crawl_result == {} | ||
|
||
|
||
@responses.activate | ||
def test_crawl_found_invalid_url(capsys) -> None: # type: ignore | ||
responses.add( | ||
responses.GET, | ||
"http://example.com", | ||
body="<html><body><a href='^invalidurl^'>link</a></body></html>", | ||
status=200, | ||
content_type="text/html", | ||
) | ||
|
||
spider = Spider("http://example.com") | ||
spider.crawl("http://example.com") | ||
|
||
captured = capsys.readouterr() | ||
assert "Invalid url:" in captured.out | ||
assert spider.crawl_result == {'http://example.com': | ||
{'urls': [] | ||
} | ||
} | ||
|
||
|
||
@responses.activate | ||
def test_crawl_found_duplicate_url() -> None: | ||
responses.add( | ||
responses.GET, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can reuse the helper There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @indrajithi I added the helper function everywhere, I can't get the context manager to work. It seems like capsys doesn't capture the stdout from inside the context manager. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should I open an issue for it? To create the context manager for the tests? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No issues. Let us ignore it for now. This looks good. We can pick displaying coverage in readme in a separate issue. I will merge this one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I've already created an issue for it if you can just assign me there :) #30 |
||
"http://example.com", | ||
body="<html><body><a href='http://duplicate.com'>link1</a>" | ||
+"<a href='http://duplicate.com'>link2</a></body></html>", | ||
status=200, | ||
content_type="text/html", | ||
) | ||
|
||
spider = Spider("http://example.com") | ||
spider.crawl("http://example.com") | ||
|
||
assert spider.crawl_result == {'http://example.com': | ||
{'urls': ['http://duplicate.com'] | ||
} | ||
} | ||
|
||
|
||
@responses.activate | ||
def test_crawl_no_urls_in_page() -> None: | ||
responses.add( | ||
responses.GET, | ||
"http://example.com", | ||
body="<html><body></body></html>", | ||
status=200, | ||
content_type="text/html", | ||
) | ||
|
||
    spider = Spider("http://example.com") | ||
spider.crawl("http://example.com") | ||
|
||
assert spider.crawl_result == {'http://example.com': | ||
{'urls': [] | ||
} | ||
} | ||
|
||
|
||
@responses.activate | ||
def test_save_results() -> None: | ||
|
@@ -72,7 +291,8 @@ def test_url_regex() -> None: | |
responses.add( | ||
responses.GET, | ||
"http://example.com", | ||
body="<html><body><a href='http://example.com/123'>link</a><a href='http://example.com/test'>link</a></body></html>", | ||
body="<html><body><a href='http://example.com/123'>link</a>" | ||
+"<a href='http://example.com/test'>link</a></body></html>", | ||
status=200, | ||
content_type="text/html", | ||
) | ||
|
@@ -136,3 +356,22 @@ def test_start(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None: | |
assert spider.crawl_result["http://example.com"]["urls"] == [ | ||
"http://example.com/test" | ||
] | ||
|
||
|
||
@patch.object(Spider, "crawl") | ||
@patch.object(Spider, "save_results") | ||
def test_start_with_save_to_file(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None: | ||
spider = Spider("http://example.com", 10, save_to_file="file.txt") | ||
mock_crawl.side_effect = lambda url: spider.crawl_result.update( | ||
{url: {"urls": ["http://example.com/test"]}} | ||
) | ||
|
||
spider.start() | ||
|
||
assert mock_crawl.call_count == 1 | ||
assert "http://example.com" in spider.crawl_result | ||
assert spider.crawl_result["http://example.com"]["urls"] == [ | ||
"http://example.com/test" | ||
] | ||
|
||
mock_save_results.assert_called_once() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test is too long.
We can break this based on SRP. like
test_fetch_url_handles_404_error
,test_fetch_url_handles_408_error
etc.We can extract out common code
def setup_mock_response(url, status, body="<html><body><a href='http://http.error'>link</a></body></html>")
and re use it.type: ignore (can we use type hints here. Maybe
pytest.CaptureFixture
?)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah you're right I'll shorten it 👍
I don't understand what you mean in the 3rd thing tho. Are you referring to the places where I wrote
# type: ignore
? That's because otherwise mypy would complain about the capsys argument, which is only passed when running the tests.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just pushed the changes let me know what you think 👍