From 728a3a9483830f894fc810bbfd6d66873c186b93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 20 Nov 2024 11:13:38 +0100
Subject: [PATCH] Fix max_pages

---
Notes (not part of the commit message):
* parse_serp now only schedules the next SERP request while page_number is
  below self.args.max_pages.
* test_pagination builds the spider with max_pages=3 and checks that parsing
  page 3 produces one item and no follow-up request.

 tests/test_serp.py                    | 16 ++++++++++++++--
 zyte_spider_templates/spiders/serp.py |  9 +++++----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/tests/test_serp.py b/tests/test_serp.py
index 3aabfe9..699fee5 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -319,7 +319,9 @@ def test_search_queries():
 
 def test_pagination():
     crawler = get_crawler()
-    spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=3
+    )
 
     def run_parse_serp(total_results, page=1):
         url = "https://www.google.com/search?q=foo+bar"
@@ -388,6 +390,14 @@ def run_parse_serp(total_results, page=1):
     assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20"
     assert requests[0].cb_kwargs["page_number"] == 3
 
+    # Do not go over max_pages
+    items, requests = run_parse_serp(
+        total_results=31,
+        page=3,
+    )
+    assert len(items) == 1
+    assert len(requests) == 0
+
 
 def test_get_serp_request():
     crawler = get_crawler()
@@ -404,7 +414,9 @@ def test_get_serp_request():
 
 def test_parse_serp():
     crawler = get_crawler()
-    spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=43
+    )
     url = "https://www.google.com/search?q=foo+bar"
     response = ZyteAPITextResponse.from_api_response(
         api_response={
diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index dea6922..ed0d1e7 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -126,9 +126,10 @@ def start_requests(self) -> Iterable[Request]:
     def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
         serp = Serp.from_dict(response.raw_api_response["serp"])
 
-        next_start = page_number * self._results_per_page
-        if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
-            next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
-            yield self.get_serp_request(next_url, page_number=page_number + 1)
+        if page_number < self.args.max_pages:
+            next_start = page_number * self._results_per_page
+            if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
+                next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
+                yield self.get_serp_request(next_url, page_number=page_number + 1)
 
         yield serp