
Increased test coverage #29

Merged · 8 commits · Jun 17, 2024
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -44,7 +44,7 @@ jobs:
          poetry install --with dev
      - name: Run tests
        run: |
-          poetry run pytest
+          poetry run pytest --cov=./

  publish:
    needs: test
247 changes: 243 additions & 4 deletions tests/test_crawler.py
@@ -2,8 +2,9 @@

import responses

-from tiny_web_crawler.crawler import Spider
+import requests
+
+from tiny_web_crawler.crawler import Spider, DEFAULT_SCHEME

def test_is_valid_url() -> None:
    assert Spider.is_valid_url("http://example.com") is True
@@ -12,13 +13,32 @@ def test_is_valid_url() -> None:

def test_format_url() -> None:
    spider = Spider("http://example.com", 10)
-    assert spider.format_url(
-        "/test", "http://example.com") == "http://example.com/test"
+
+    assert (
+        spider.format_url("/test", "http://example.com")
+        == "http://example.com/test"
+    )
+
+    assert (
+        spider.format_url("http://example.com/test", "http://example.com")
+        == "http://example.com/test"
+    )
+
+    assert (
+        spider.format_url('path1/path2', 'http://example.com')
+        == 'http://example.com/path1/path2'
+    )
+
+    assert (
+        spider.format_url('/path1/path2', 'http://example.com')
+        == 'http://example.com/path1/path2'
+    )
+
+    assert (
+        spider.format_url('path.com', 'http://example.com')
+        == DEFAULT_SCHEME + 'path.com'
+    )


@responses.activate
def test_fetch_url() -> None:
@@ -35,6 +55,81 @@ def test_fetch_url() -> None:
    assert resp.text == "link"


@responses.activate
def test_fetch_url_connection_error(capsys) -> None:  # type: ignore
    spider = Spider("http://connection.error")

    # Fetch a url whose response isn't mocked, so a ConnectionError is raised
    resp = spider.fetch_url("http://connection.error")

    captured = capsys.readouterr()
    assert "Connection error occurred:" in captured.out
    assert resp is None


@responses.activate
def test_fetch_url_http_error(capsys) -> None:  # type: ignore
    error_codes = [403, 404, 408]

    def setup_mock_response(error_code: int) -> None:
        responses.add(
            responses.GET,
            f"http://http.error/{error_code}",
            body="<html><body><a href='http://http.error'>link</a></body></html>",
            status=error_code
        )

Collaborator:

This test is too long.

1. We can break this up based on SRP, e.g. test_fetch_url_handles_404_error, test_fetch_url_handles_408_error, etc.

2. We can extract out the common code, def setup_mock_response(url, status, body="<html><body><a href='http://http.error'>link</a></body></html>"), and reuse it.

3. type: ignore (can we use type hints here instead? Maybe pytest.CaptureFixture?)

Collaborator Author (@Mews, Jun 17, 2024):

Yeah you're right, I'll shorten it 👍
I don't understand what you mean by the 3rd point though. Are you referring to the places where I wrote # type: ignore? That's because mypy would otherwise complain about the capsys argument, which is only passed when running the tests.

Collaborator Author (@Mews):

Just pushed the changes, let me know what you think 👍
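As an aside on point 3 above: pytest exposes type hints for its fixtures, so annotating capsys directly should satisfy mypy without the ignores. A minimal sketch, assuming pytest >= 6.2 (not part of the PR):

```python
import pytest
import responses

from tiny_web_crawler.crawler import Spider


@responses.activate
def test_fetch_url_connection_error(capsys: pytest.CaptureFixture[str]) -> None:
    # capsys is typed via pytest's public API, so no "# type: ignore" is needed.
    spider = Spider("http://connection.error")

    # The URL is not mocked, so responses raises a ConnectionError.
    resp = spider.fetch_url("http://connection.error")

    captured = capsys.readouterr()
    assert "Connection error occurred:" in captured.out
    assert resp is None
```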


    spider = Spider("http://http.error")

    for error_code in error_codes:
        setup_mock_response(error_code)
        resp = spider.fetch_url(f"http://http.error/{error_code}")

        captured = capsys.readouterr()

        assert "HTTP error occurred:" in captured.out
        assert resp is None


@responses.activate
def test_fetch_url_timeout_error(capsys) -> None:  # type: ignore
    responses.add(
        responses.GET,
        "http://timeout.error",
        body=requests.exceptions.Timeout(),
        status=408
    )

Collaborator:

    responses.add(
        responses.GET,
        "http://timeout.error",
        body=requests.exceptions.Timeout(),
        status=408
    )

This is getting repeated in every test case. We can move it into a reusable function, and maybe move it to test/utils.py so that we can reuse it in other places as well.

Something like this:

@contextmanager
def capture_output(capsys: pytest.CaptureFixture) -> Generator[None, None, str]:
    yield
    captured = capsys.readouterr()
    return captured.out

def setup_mock_response(url: str, status: int, body: str = "<html><body><a href='http://http.error'>link</a></body></html>") -> None:
    responses.add(
        responses.GET,
        url,
        body=body,
        status=status
    )

And we can call the tests like this:

@responses.activate
def test_fetch_url_handles_404_error(capsys: pytest.CaptureFixture) -> None:
    setup_mock_response("http://http.error/404", 404)

    spider = Spider("http://http.error")

    with capture_output(capsys) as output:
        resp404 = spider.fetch_url("http://http.error/404")

    assert

And repeat this pattern for testing the other exceptions.

Or, even better, use parametrized fixtures:

@pytest.mark.parametrize("url, status", [
    ("http://http.error/404", 404),
    ("http://http.error/408", 408),
    ("http://http.error/403", 403),
])
def test_fetch_url_http_errors(mock_responses, capsys_output, url, status):

LMK your thoughts.

Collaborator Author (@Mews):

Oh I get it, I didn't understand it was meant to be for every test; give me a second and I'll implement it.
I've never used pytest parametrize, so I might not be the most indicated person to implement that. Maybe we can just open an issue for it and it can be dealt with later?

Collaborator (@indrajithi, Jun 17, 2024):

It is very simple. Basically, instead of writing test_fetch_url_handles_404_error, test_fetch_url_handles_408_error, etc., we generalize them into one test, test_fetch_url_http_errors, and pass the fixture params to it with the url to call and the response we expect. You can try it; if it doesn't work, I can pick it up later as part of the refactoring I am currently doing.

Collaborator Author (@Mews):

It seems like capsys doesn't capture the stdout from inside the context manager.

Collaborator:

Ok. LMK once you fix the other things and we can merge this.
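For context on the capture problem reported above: a @contextmanager generator can only bind what it yields to the `with ... as` target, and a value returned after the yield is discarded, so `output` ends up as None. A minimal sketch of one workaround, yielding a mutable holder instead (a hypothetical helper, not part of this PR):

```python
from contextlib import contextmanager
from typing import Generator, List

import pytest


@contextmanager
def capture_output(capsys: pytest.CaptureFixture[str]) -> Generator[List[str], None, None]:
    # Yield a mutable holder; it is filled in once the `with` block exits.
    holder: List[str] = []
    yield holder
    holder.append(capsys.readouterr().out)


def test_capture_output_example(capsys: pytest.CaptureFixture[str]) -> None:
    with capture_output(capsys) as output:
        print("HTTP error occurred: 404")
    assert "HTTP error occurred:" in output[0]
```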

    spider = Spider("http://timeout.error")

    # The mocked response raises a Timeout when fetched
    resp = spider.fetch_url("http://timeout.error")

    captured = capsys.readouterr()
    assert "Timeout error occurred:" in captured.out
    assert resp is None
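Picking up the parametrize suggestion from the thread above, a minimal runnable sketch of how the HTTP-error cases could collapse into one test; the helper is inlined here and the names follow the reviewer's comment, so treat this as an assumption rather than the merged code:

```python
import pytest
import responses

from tiny_web_crawler.crawler import Spider


def setup_mock_response(url: str, status: int,
                        body: str = "<html><body><a href='http://http.error'>link</a></body></html>") -> None:
    # Register a mocked GET response for the given url and status code.
    responses.add(responses.GET, url, body=body, status=status)


@pytest.mark.parametrize("url, status", [
    ("http://http.error/403", 403),
    ("http://http.error/404", 404),
    ("http://http.error/408", 408),
])
@responses.activate
def test_fetch_url_http_errors(capsys: pytest.CaptureFixture[str], url: str, status: int) -> None:
    setup_mock_response(url, status)
    spider = Spider("http://http.error")

    resp = spider.fetch_url(url)

    captured = capsys.readouterr()
    assert "HTTP error occurred:" in captured.out
    assert resp is None
```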


@responses.activate
def test_fetch_url_requests_exception(capsys) -> None:  # type: ignore
    responses.add(
        responses.GET,
        "http://requests.exception",
        body=requests.exceptions.RequestException(),
        status=404
    )

    spider = Spider("http://requests.exception")

    # The mocked response raises a RequestException when fetched
    resp = spider.fetch_url("http://requests.exception")

    captured = capsys.readouterr()
    assert "Request error occurred:" in captured.out
    assert resp is None


@responses.activate
def test_crawl() -> None:
    # Mock HTTP response
@@ -45,6 +140,13 @@ def test_crawl() -> None:
        status=200,
        content_type="text/html",
    )
    responses.add(
        responses.GET,
        "http://example.com/test",
        body="<html><body><a href='http://example.com'>link</a></body></html>",
        status=200,
        content_type="text/html",
    )

    spider = Spider("http://example.com", 10)
    spider.crawl("http://example.com")
@@ -54,6 +156,123 @@ def test_crawl() -> None:
        "http://example.com/test"
    ]

    spider.crawl("http://example.com/test")

    assert "http://example.com/test" in spider.crawl_result
    assert spider.crawl_result["http://example.com/test"]["urls"] == [
        "http://example.com"
    ]


@responses.activate
def test_crawl_invalid_url(capsys) -> None:  # type: ignore
    spider = Spider("http://example.com")

    spider.crawl("invalid_url")

    captured = capsys.readouterr()
    assert "Invalid url to crawl:" in captured.out
    assert spider.crawl_result == {}


@responses.activate
def test_crawl_already_crawled_url(capsys) -> None:  # type: ignore
    responses.add(
        responses.GET,
        "http://example.com",
        body="<html><body><a href='http://example.com'>link</a></body></html>",
        status=200,
        content_type="text/html",
    )

    spider = Spider("http://example.com")

    spider.crawl("http://example.com")
    spider.crawl("http://example.com")

    captured = capsys.readouterr()
    assert "URL already crawled:" in captured.out
    assert spider.crawl_result == {
        'http://example.com': {'urls': ['http://example.com']}
    }


@responses.activate
def test_crawl_unfetchable_url() -> None:
    responses.add(
        responses.GET,
        "http://example.com",
        body="<html><body><a href='http://example.com'>link</a></body></html>",
        status=404,
        content_type="text/html",
    )

    spider = Spider("http://example.com")

    spider.crawl("http://example.com")
    assert spider.crawl_result == {}


@responses.activate
def test_crawl_found_invalid_url(capsys) -> None:  # type: ignore
    responses.add(
        responses.GET,
        "http://example.com",
        body="<html><body><a href='^invalidurl^'>link</a></body></html>",
        status=200,
        content_type="text/html",
    )

    spider = Spider("http://example.com")
    spider.crawl("http://example.com")

    captured = capsys.readouterr()
    assert "Invalid url:" in captured.out
    assert spider.crawl_result == {
        'http://example.com': {'urls': []}
    }


@responses.activate
def test_crawl_found_duplicate_url() -> None:
    responses.add(
        responses.GET,
Collaborator:

We can reuse the helper setup_mock_response in all these places.

Collaborator Author (@Mews):

@indrajithi I added the helper function everywhere, but I can't get the context manager to work. It seems like capsys doesn't capture the stdout from inside the context manager.

Collaborator Author (@Mews):

Should I open an issue for creating the context manager for the tests?

Collaborator:

No issues. Let us ignore it for now. This looks good. We can pick up displaying coverage in the readme in a separate issue. I will merge this one.

Collaborator Author (@Mews, Jun 17, 2024):

> No issues. Let us ignore it for now. This looks good. We can pick up displaying coverage in the readme in a separate issue. I will merge this one.

I've already created an issue for it, if you can just assign me there :) #30

"http://example.com",
body="<html><body><a href='http://duplicate.com'>link1</a>"
+"<a href='http://duplicate.com'>link2</a></body></html>",
status=200,
content_type="text/html",
)

spider = Spider("http://example.com")
spider.crawl("http://example.com")

assert spider.crawl_result == {'http://example.com':
{'urls': ['http://duplicate.com']
}
}
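To illustrate the reuse asked for in the thread above, this is how the test could look with the shared helper; a sketch only, with the helper inlined and its proposed home in test/utils.py assumed:

```python
import responses

from tiny_web_crawler.crawler import Spider


# Shared helper as proposed in the review thread (assumed to live in test/utils.py)
def setup_mock_response(url: str, status: int,
                        body: str = "<html><body><a href='http://http.error'>link</a></body></html>") -> None:
    responses.add(responses.GET, url, body=body, status=status, content_type="text/html")


@responses.activate
def test_crawl_found_duplicate_url() -> None:
    # Same test as above, using the shared helper instead of an inline responses.add
    setup_mock_response(
        "http://example.com",
        200,
        body="<html><body><a href='http://duplicate.com'>link1</a>"
        + "<a href='http://duplicate.com'>link2</a></body></html>",
    )

    spider = Spider("http://example.com")
    spider.crawl("http://example.com")

    # The duplicate link should be recorded only once
    assert spider.crawl_result == {
        'http://example.com': {'urls': ['http://duplicate.com']}
    }
```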


@responses.activate
def test_crawl_no_urls_in_page() -> None:
    responses.add(
        responses.GET,
        "http://example.com",
        body="<html><body></body></html>",
        status=200,
        content_type="text/html",
    )

    spider = Spider("http://example.com")
    spider.crawl("http://example.com")

    assert spider.crawl_result == {
        'http://example.com': {'urls': []}
    }


@responses.activate
def test_save_results() -> None:
@@ -72,7 +291,8 @@ def test_url_regex() -> None:
    responses.add(
        responses.GET,
        "http://example.com",
-        body="<html><body><a href='http://example.com/123'>link</a><a href='http://example.com/test'>link</a></body></html>",
+        body="<html><body><a href='http://example.com/123'>link</a>"
+        +"<a href='http://example.com/test'>link</a></body></html>",
        status=200,
        content_type="text/html",
    )
@@ -136,3 +356,22 @@ def test_start(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None:
    assert spider.crawl_result["http://example.com"]["urls"] == [
        "http://example.com/test"
    ]


@patch.object(Spider, "crawl")
@patch.object(Spider, "save_results")
def test_start_with_save_to_file(mock_save_results: MagicMock, mock_crawl: MagicMock) -> None:
    spider = Spider("http://example.com", 10, save_to_file="file.txt")
    mock_crawl.side_effect = lambda url: spider.crawl_result.update(
        {url: {"urls": ["http://example.com/test"]}}
    )

    spider.start()

    assert mock_crawl.call_count == 1
    assert "http://example.com" in spider.crawl_result
    assert spider.crawl_result["http://example.com"]["urls"] == [
        "http://example.com/test"
    ]

    mock_save_results.assert_called_once()