Google Search: results per page (#79)

Gallaecio authored Nov 22, 2024
1 parent 55663e0 commit 317f69b
Showing 2 changed files with 101 additions and 39 deletions.
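
In short, the commit adds an optional results_per_page argument to the Google Search spider; when set, it is appended to every SERP URL as Google's num query parameter and used in place of the default page size of 10 when computing pagination offsets. A minimal usage sketch (hypothetical project setup; only the class name and argument names come from the diff below, the rest is assumed):

from scrapy.crawler import CrawlerProcess

from zyte_spider_templates.spiders.serp import GoogleSearchSpider

process = CrawlerProcess()  # assumes project settings that enable Zyte API
process.crawl(
    GoogleSearchSpider,
    search_queries="foo",
    results_per_page=20,  # new argument; appended to SERP URLs as num=20
    max_pages=3,
)
process.start()
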
122 changes: 85 additions & 37 deletions tests/test_serp.py
@@ -1,3 +1,5 @@
from urllib.parse import quote_plus

import pytest
from pydantic import ValidationError
from scrapy import Request
@@ -16,6 +18,44 @@
from .utils import assertEqualSpiderMetadata


def run_parse_serp(spider, total_results=99999, page=1, query="foo"):
    url = f"https://www.google.com/search?q={quote_plus(query)}"
    if page > 1:
        url = add_or_replace_parameter(url, "start", (page - 1) * 10)
    response = ZyteAPITextResponse.from_api_response(
        api_response={
            "serp": {
                "organicResults": [
                    {
                        "description": "…",
                        "name": "…",
                        "url": f"https://example.com/{rank}",
                        "rank": rank,
                    }
                    for rank in range(1, 11)
                ],
                "metadata": {
                    "dateDownloaded": "2024-10-25T08:59:45Z",
                    "displayedQuery": query,
                    "searchedQuery": query,
                    "totalOrganicResults": total_results,
                },
                "pageNumber": page,
                "url": url,
            },
            "url": url,
        },
    )
    items = []
    requests = []
    for item_or_request in spider.parse_serp(response, page_number=page):
        if isinstance(item_or_request, Request):
            requests.append(item_or_request)
        else:
            items.append(item_or_request)
    return items, requests
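
Note that the helper hard-codes Google's default of 10 organic results per page when computing the start offset. A quick standalone check of that arithmetic (plain Python, for illustration only):

# Page 1 gets no explicit start parameter; pages 2 and 3 get start=10, start=20.
assert [(page - 1) * 10 for page in (1, 2, 3)] == [0, 10, 20]
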


def test_parameters():
    with pytest.raises(ValidationError):
        GoogleSearchSpider()
@@ -264,6 +304,20 @@ def test_metadata():
"title": "Max Pages",
"type": "integer",
},
"results_per_page": {
"anyOf": [
{
"minimum": 1,
"type": "integer",
},
{
"type": "null",
},
],
"default": None,
"description": "Maximum number of results per page.",
"title": "Results Per Page",
},
"item_type": {
"anyOf": [{"type": "string"}, {"type": "null"}],
"default": None,
@@ -347,67 +401,37 @@ def test_pagination():
        crawler, search_queries="foo bar", max_pages=3
    )

    def run_parse_serp(total_results, page=1):
        url = "https://www.google.com/search?q=foo+bar"
        if page > 1:
            url = add_or_replace_parameter(url, "start", (page - 1) * 10)
        response = ZyteAPITextResponse.from_api_response(
            api_response={
                "serp": {
                    "organicResults": [
                        {
                            "description": "…",
                            "name": "…",
                            "url": f"https://example.com/{rank}",
                            "rank": rank,
                        }
                        for rank in range(1, 11)
                    ],
                    "metadata": {
                        "dateDownloaded": "2024-10-25T08:59:45Z",
                        "displayedQuery": "foo bar",
                        "searchedQuery": "foo bar",
                        "totalOrganicResults": total_results,
                    },
                    "pageNumber": page,
                    "url": url,
                },
                "url": url,
            },
        )
        items = []
        requests = []
        for item_or_request in spider.parse_serp(response, page_number=page):
            if isinstance(item_or_request, Request):
                requests.append(item_or_request)
            else:
                items.append(item_or_request)
        return items, requests

    items, requests = run_parse_serp(
        spider,
        total_results=10,
    )
    assert len(items) == 1
    assert len(requests) == 0

    items, requests = run_parse_serp(
        spider,
        total_results=11,
        query="foo bar",
    )
    assert len(items) == 1
    assert len(requests) == 1
    assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=10"
    assert requests[0].cb_kwargs["page_number"] == 2

    items, requests = run_parse_serp(
        spider,
        total_results=20,
        page=2,
        query="foo bar",
    )
    assert len(items) == 1
    assert len(requests) == 0

    items, requests = run_parse_serp(
        spider,
        total_results=21,
        page=2,
        query="foo bar",
    )
    assert len(items) == 1
    assert len(requests) == 1
@@ -416,6 +440,7 @@ def run_parse_serp(total_results, page=1):

    # Do not go over max_pages
    items, requests = run_parse_serp(
        spider,
        total_results=31,
        page=3,
    )
@@ -483,6 +508,29 @@ def test_parse_serp():
        spider.parse_serp(response)  # type: ignore[call-arg]


def test_results_per_page():
    crawler = get_crawler()
    spider = GoogleSearchSpider.from_crawler(
        crawler, search_queries="foo", results_per_page=1, max_pages=2
    )
    requests = list(spider.start_requests())
    assert len(requests) == 1
    assert requests[0].url == "https://www.google.com/search?q=foo&num=1"

    items, requests = run_parse_serp(spider)
    assert len(items) == 1
    assert len(requests) == 1
    assert requests[0].url == "https://www.google.com/search?q=foo&start=1&num=1"


def test_results_per_page_min():
    crawler = get_crawler()
    with pytest.raises(ValidationError):
        GoogleSearchSpider.from_crawler(
            crawler, search_queries="foo", results_per_page=0
        )


def test_item_type():
    crawler = get_crawler()
    spider = GoogleSearchSpider.from_crawler(
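
As a side note, the URL asserted at the end of test_results_per_page can be reproduced with add_or_replace_parameter, presumably w3lib's, which the helper at the top of this file also uses; a standalone sketch:

from w3lib.url import add_or_replace_parameter

url = "https://www.google.com/search?q=foo"
url = add_or_replace_parameter(url, "start", "1")  # pagination offset
url = add_or_replace_parameter(url, "num", "1")    # results per page
assert url == "https://www.google.com/search?q=foo&start=1&num=1"
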
18 changes: 16 additions & 2 deletions zyte_spider_templates/spiders/serp.py
@@ -59,6 +59,15 @@ class SerpMaxPagesParam(BaseModel):
    )


class SerpResultsPerPageParam(BaseModel):
    results_per_page: Optional[int] = Field(
        title="Results Per Page",
        description="Maximum number of results per page.",
        ge=1,
        default=None,
    )


@document_enum
class SerpItemType(str, Enum):
    article: str = "article"
@@ -126,6 +135,7 @@ class GoogleDomainParam(BaseModel):
class GoogleSearchSpiderParams(
    MaxRequestsParam,
    SerpItemTypeParam,
    SerpResultsPerPageParam,
    SerpMaxPagesParam,
    SearchQueriesParam,
    GoogleDomainParam,
@@ -144,7 +154,7 @@ class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider):
"""

name = "google_search"
_results_per_page = 10
_default_results_per_page = 10

metadata: Dict[str, Any] = {
**BaseSpider.metadata,
@@ -167,6 +177,8 @@ def update_settings(cls, settings: BaseSettings) -> None:
        )

    def get_serp_request(self, url: str, *, page_number: int):
        if self.args.results_per_page:
            url = add_or_replace_parameter(url, "num", str(self.args.results_per_page))
        return Request(
            url=url,
            callback=self.parse_serp,
@@ -195,7 +207,9 @@ def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
        serp = Serp.from_dict(response.raw_api_response["serp"])

        if page_number < self.args.max_pages:
            next_start = page_number * self._results_per_page
            next_start = page_number * (
                self.args.results_per_page or self._default_results_per_page
            )
            if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
                next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
                yield self.get_serp_request(next_url, page_number=page_number + 1)
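
The parse_serp change above replaces the hard-coded page size with the effective one. A minimal sketch of that arithmetic outside Scrapy (names mirror the diff; this is an illustration, not the spider's actual code):

def next_start(page_number, results_per_page=None, default_results_per_page=10):
    # Fall back to Google's default page size, as parse_serp does.
    return page_number * (results_per_page or default_results_per_page)

assert next_start(1) == 10    # default: page 2 starts at result 10
assert next_start(1, 1) == 1  # results_per_page=1: start=1, as the tests expect
assert next_start(2, 1) == 2
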
