Google Search: Allow extracting items from SERP results #78

Merged 12 commits on Nov 22, 2024
5 changes: 5 additions & 0 deletions docs/reference/index.rst
@@ -44,5 +44,10 @@ Parameter mixins

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
   :exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
   :exclude-members: model_computed_fields
23 changes: 22 additions & 1 deletion tests/test_serp.py
@@ -5,7 +5,11 @@
from scrapy_zyte_api.responses import ZyteAPITextResponse
from w3lib.url import add_or_replace_parameter

from zyte_spider_templates.spiders.serp import GoogleSearchSpider
from zyte_spider_templates.spiders.serp import (
    ITEM_TYPE_CLASSES,
    GoogleSearchSpider,
    SerpItemType,
)

from . import get_crawler
from .utils import assertEqualSpiderMetadata
@@ -445,3 +449,20 @@ def test_parse_serp():
    # The page_number parameter is required.
    with pytest.raises(TypeError):
        spider.parse_serp(response)


def test_item_type_mappings():
    # Ensure that all SerpItemType keys and values match.
    for entry in SerpItemType:
        assert entry.name == entry.value

    # Ensure that the ITEM_TYPE_CLASSES dict maps all values from the
    # corresponding enum except for serp.
    actual_keys = set(ITEM_TYPE_CLASSES)
    expected_keys = set(
        entry.value for entry in SerpItemType if entry != SerpItemType.serp
    )
    assert actual_keys == expected_keys

    # Also ensure that no dict value is repeated.
    assert len(actual_keys) == len(set(ITEM_TYPE_CLASSES.values()))
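
A possible shape for the tests that the "TODO: Add a test for this" comments in parse_serp (below) call for; this is only a sketch, not part of the PR, and it assumes that ZyteAPITextResponse.from_api_response accepts the minimal payload shown and that Serp.from_dict tolerates these fields:

def test_parse_serp_follows_results():
    from scrapy import Request  # assumed import for this sketch

    crawler = get_crawler()
    spider = GoogleSearchSpider.from_crawler(
        crawler, search_queries="foo bar", item_type="product"
    )
    url = "https://www.google.com/search?q=foo+bar"
    response = ZyteAPITextResponse.from_api_response(
        {
            "url": url,
            "serp": {
                "url": url,
                "organicResults": [
                    {"url": "https://example.com/a", "name": "A", "rank": 1},
                ],
                "metadata": {"totalOrganicResults": 1},
                "pageNumber": 1,
            },
        }
    )
    items = list(spider.parse_serp(response, page_number=1))
    # With a non-serp item_type, every yielded request should follow an
    # organic result into parse_result.
    requests = [item for item in items if isinstance(item, Request)]
    assert len(requests) == 1
    assert requests[0].url == "https://example.com/a"
    assert requests[0].callback == spider.parse_result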
123 changes: 116 additions & 7 deletions zyte_spider_templates/spiders/serp.py
@@ -1,12 +1,22 @@
from enum import Enum
from typing import Any, Dict, Iterable, List, Optional, Union

from pydantic import BaseModel, Field, field_validator
from scrapy import Request
from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from w3lib.url import add_or_replace_parameter
from zyte_common_items import Serp

from zyte_common_items import (  # TODO: Add ForumThread to zyte-common-items
    Article,
    ArticleList,
    # ForumThread,
    JobPosting,
    Product,
    ProductList,
    Serp,
)

from ..documentation import document_enum
from ..params import MaxRequestsParam
from ._google_domains import GoogleDomain
from .base import BaseSpider
@@ -48,6 +58,83 @@ class SerpMaxPagesParam(BaseModel):
)


@document_enum
class SerpItemType(str, Enum):
    article: str = "article"
    """
    Article data from result URLs.
    """

    articleList: str = "articleList"
    """
    Article list data from result URLs.
    """

    # forumThread: str = "forumThread"
    # """
    # Thread data from result URLs.
    # """

    jobPosting: str = "jobPosting"
    """
    Job posting data from result URLs.
    """

    product: str = "product"
    """
    Product data from result URLs.
    """

    productList: str = "productList"
    """
    Product list data from result URLs.
    """

    serp: str = "serp"
    """
    Search engine results page data.
    """

# NOTE: serp is excluded on purpose: when item_type is serp, the spider yields
# the SERP itself and does not follow result URLs, so no item class is needed.
# test_item_type_mappings (tests/test_serp.py) checks that this dict stays in
# sync with the enum class above.
ITEM_TYPE_CLASSES = {
    SerpItemType.article: Article,
    SerpItemType.articleList: ArticleList,
    # SerpItemType.forumThread: ForumThread,
    SerpItemType.jobPosting: JobPosting,
    SerpItemType.product: Product,
    SerpItemType.productList: ProductList,
}


class SerpItemTypeParam(BaseModel):
    item_type: SerpItemType = Field(
        title="Item type",
        description="Data type of the output items.",
        default=SerpItemType.serp,
        json_schema_extra={
            "enumMeta": {
                # TODO: Add a test to make sure this is in sync with the enum
                # class above.
                # TODO: Try automating the generation of this metadata from
                # the enum type above.
                SerpItemType.serp: {
                    "title": "serp",
                    "description": (
                        "Yield the data of result pages; do not follow "
                        "result links."
                    ),
                },
                SerpItemType.product: {
                    "title": "product",
                    "description": (
                        "Follow result links and yield product details data "
                        "from them."
                    ),
                },
            },
        },
    )
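
A possible angle on the metadata-automation TODO above, sketched rather than implemented in the PR: the per-member docstrings of SerpItemType are plain string literals in the class body, so they can be recovered from the source and turned into enumMeta entries. The enum_member_docs helper below is hypothetical:

import ast
import inspect


def enum_member_docs(enum_cls):
    # Map each member name to the string literal that follows its assignment
    # in the class body (the docstring convention used in SerpItemType).
    class_node = ast.parse(inspect.getsource(enum_cls)).body[0]
    docs = {}
    last_name = None
    for node in class_node.body:
        if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
            last_name = node.target.id
        elif (
            last_name
            and isinstance(node, ast.Expr)
            and isinstance(node.value, ast.Constant)
            and isinstance(node.value.value, str)
        ):
            docs[last_name] = inspect.cleandoc(node.value.value)
            last_name = None
    return docs


# Hypothetical usage: build enumMeta from the enum instead of by hand.
member_docs = enum_member_docs(SerpItemType)
enum_meta = {
    member: {"title": member.value, "description": member_docs.get(member.name, "")}
    for member in SerpItemType
}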


class GoogleDomainParam(BaseModel):
    domain: GoogleDomain = Field(
        title="Domain",
@@ -58,6 +145,7 @@ class GoogleDomainParam(BaseModel):

class GoogleSearchSpiderParams(
    MaxRequestsParam,
    SerpItemTypeParam,  # TODO: Update the test_metadata expectations
    SerpMaxPagesParam,
    SearchQueriesParam,
    GoogleDomainParam,
@@ -126,9 +214,30 @@ def start_requests(self) -> Iterable[Request]:
    def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
        serp = Serp.from_dict(response.raw_api_response["serp"])

        next_start = page_number * self._results_per_page
        if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
            next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
            yield self.get_serp_request(next_url, page_number=page_number + 1)
        if page_number < self.args.max_pages:  # TODO: Add a test for this
            next_start = page_number * self._results_per_page
            if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
                next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
                yield self.get_serp_request(next_url, page_number=page_number + 1)
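        # Example of the arithmetic above (illustration, not from the PR):
        # with Google's 10 results per page and max_pages=3, parsing page 1
        # yields next_start=10, so page 2 is only requested while
        # totalOrganicResults > 10 and page_number < max_pages.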

        if self.args.item_type == SerpItemType.serp:
            yield serp
            return

        # TODO: Add a test for this
        for result in serp.organicResults:
            # scrapy-poet reads the "inject" meta key and provides the listed
            # item class to the callback through DynamicDeps.
            yield response.follow(
                # TODO: Why does result.url not work? Bug in zyte-common-items?
                result["url"],
                callback=self.parse_result,
                meta={
                    "crawling_logs": {"page_type": self.args.item_type.value},
                    "inject": [ITEM_TYPE_CLASSES[self.args.item_type]],
                },
            )

        yield serp

    def parse_result(
        self, response: DummyResponse, dynamic: DynamicDeps
    ) -> Iterable[Any]:
        yield next(iter(dynamic.values()))
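
For context, a minimal usage sketch (not part of the PR) showing how the new item_type parameter combines with the existing ones when running the template; it assumes a project already set up for scrapy-zyte-api:

from scrapy.crawler import CrawlerProcess

from zyte_spider_templates.spiders.serp import GoogleSearchSpider

# Assumption: real projects also need the scrapy-zyte-api handler settings;
# only the API key placeholder is shown here.
process = CrawlerProcess(settings={"ZYTE_API_KEY": "YOUR_API_KEY"})
process.crawl(
    GoogleSearchSpider,
    search_queries="wireless headphones",
    max_pages=2,
    item_type="product",  # follow result links and yield Product items
)
process.start()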