Google Search: Allow extracting items from SERP results #78

Merged 12 commits on Nov 22, 2024
5 changes: 5 additions & 0 deletions docs/reference/index.rst
@@ -44,5 +44,10 @@ Parameter mixins

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
   :exclude-members: model_computed_fields

.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType

.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
   :exclude-members: model_computed_fields
23 changes: 22 additions & 1 deletion tests/test_serp.py
@@ -5,7 +5,11 @@
from scrapy_zyte_api.responses import ZyteAPITextResponse
from w3lib.url import add_or_replace_parameter

from zyte_spider_templates.spiders.serp import GoogleSearchSpider
from zyte_spider_templates.spiders.serp import (
    ITEM_TYPE_CLASSES,
    GoogleSearchSpider,
    SerpItemType,
)

from . import get_crawler
from .utils import assertEqualSpiderMetadata
@@ -445,3 +449,20 @@ def test_parse_serp():
    # The page_number parameter is required.
    with pytest.raises(TypeError):
        spider.parse_serp(response)


def test_item_type_mappings():
    # Ensure that all SerpItemType keys and values match.
    for entry in SerpItemType:
        assert entry.name == entry.value

    # Ensure that the ITEM_TYPE_CLASSES dict maps all values from the
    # corresponding enum except for serp.
    actual_keys = set(ITEM_TYPE_CLASSES)
    expected_keys = set(
        entry.value for entry in SerpItemType if entry != SerpItemType.serp
    )
    assert actual_keys == expected_keys

    # Also ensure that no dict value is repeated.
    assert len(actual_keys) == len(set(ITEM_TYPE_CLASSES.values()))
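
A possible shape for the tests that the "TODO: Add a test for this" comments in parse_serp (below) call for; this is only a sketch, not part of the PR, and it assumes that ZyteAPITextResponse.from_api_response accepts the minimal payload shown and that Serp.from_dict tolerates these fields:

def test_parse_serp_follows_results():
    from scrapy import Request  # assumed import for this sketch

    crawler = get_crawler()
    spider = GoogleSearchSpider.from_crawler(
        crawler, search_queries="foo bar", item_type="product"
    )
    url = "https://www.google.com/search?q=foo+bar"
    response = ZyteAPITextResponse.from_api_response(
        {
            "url": url,
            "serp": {
                "url": url,
                "organicResults": [
                    {"url": "https://example.com/a", "name": "A", "rank": 1},
                ],
                "metadata": {"totalOrganicResults": 1},
                "pageNumber": 1,
            },
        }
    )
    items = list(spider.parse_serp(response, page_number=1))
    # With a non-serp item_type, every yielded request should follow an
    # organic result into parse_result.
    requests = [item for item in items if isinstance(item, Request)]
    assert len(requests) == 1
    assert requests[0].url == "https://example.com/a"
    assert requests[0].callback == spider.parse_result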
123 changes: 116 additions & 7 deletions zyte_spider_templates/spiders/serp.py
@@ -1,12 +1,22 @@
from enum import Enum
from typing import Any, Dict, Iterable, List, Optional, Union

from pydantic import BaseModel, Field, field_validator
from scrapy import Request
from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from w3lib.url import add_or_replace_parameter
from zyte_common_items import Serp

from zyte_common_items import (  # TODO: Add ForumThread to zyte-common-items
    Article,
    ArticleList,
    # ForumThread,
    JobPosting,
    Product,
    ProductList,
    Serp,
)

from ..documentation import document_enum
from ..params import MaxRequestsParam
from ._google_domains import GoogleDomain
from .base import BaseSpider
@@ -48,6 +58,83 @@ class SerpMaxPagesParam(BaseModel):
)


@document_enum
class SerpItemType(str, Enum):
    article: str = "article"
    """
    Article data from result URLs.
    """

    articleList: str = "articleList"
    """
    Article list data from result URLs.
    """

    # forumThread: str = "forumThread"
    # """
    # Thread data from result URLs.
    # """

    jobPosting: str = "jobPosting"
    """
    Job posting data from result URLs.
    """

    product: str = "product"
    """
    Product data from result URLs.
    """

    productList: str = "productList"
    """
    Product list data from result URLs.
    """

    serp: str = "serp"
    """
    Search engine results page data.
    """

# NOTE: serp is excluded on purpose: when item_type is serp, the spider yields
# the SERP itself and does not follow result URLs, so no item class is needed.
# test_item_type_mappings (tests/test_serp.py) checks that this dict stays in
# sync with the enum class above.
ITEM_TYPE_CLASSES = {
    SerpItemType.article: Article,
    SerpItemType.articleList: ArticleList,
    # SerpItemType.forumThread: ForumThread,
    SerpItemType.jobPosting: JobPosting,
    SerpItemType.product: Product,
    SerpItemType.productList: ProductList,
}


class SerpItemTypeParam(BaseModel):
    item_type: SerpItemType = Field(
        title="Item type",
        description="Data type of the output items.",
        default=SerpItemType.serp,
        json_schema_extra={
            "enumMeta": {
                # TODO: Add a test to make sure this is in sync with the enum
                # class above.
                # TODO: Try automating the generation of this metadata from
                # the enum type above.
                SerpItemType.serp: {
                    "title": "serp",
                    "description": (
                        "Yield the data of result pages; do not follow "
                        "result links."
                    ),
                },
                SerpItemType.product: {
                    "title": "product",
                    "description": (
                        "Follow result links and yield product details data "
                        "from them."
                    ),
                },
            },
        },
    )
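
A possible angle on the metadata-automation TODO above, sketched rather than implemented in the PR: the per-member docstrings of SerpItemType are plain string literals in the class body, so they can be recovered from the source and turned into enumMeta entries. The enum_member_docs helper below is hypothetical:

import ast
import inspect


def enum_member_docs(enum_cls):
    # Map each member name to the string literal that follows its assignment
    # in the class body (the docstring convention used in SerpItemType).
    class_node = ast.parse(inspect.getsource(enum_cls)).body[0]
    docs = {}
    last_name = None
    for node in class_node.body:
        if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
            last_name = node.target.id
        elif (
            last_name
            and isinstance(node, ast.Expr)
            and isinstance(node.value, ast.Constant)
            and isinstance(node.value.value, str)
        ):
            docs[last_name] = inspect.cleandoc(node.value.value)
            last_name = None
    return docs


# Hypothetical usage: build enumMeta from the enum instead of by hand.
member_docs = enum_member_docs(SerpItemType)
enum_meta = {
    member: {"title": member.value, "description": member_docs.get(member.name, "")}
    for member in SerpItemType
}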


class GoogleDomainParam(BaseModel):
    domain: GoogleDomain = Field(
        title="Domain",
@@ -58,6 +145,7 @@ class GoogleDomainParam(BaseModel):

class GoogleSearchSpiderParams(
    MaxRequestsParam,
    SerpItemTypeParam,  # TODO: Update the test_metadata expectations
    SerpMaxPagesParam,
    SearchQueriesParam,
    GoogleDomainParam,
@@ -126,9 +214,30 @@ def start_requests(self) -> Iterable[Request]:
    def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
        serp = Serp.from_dict(response.raw_api_response["serp"])

        next_start = page_number * self._results_per_page
        if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
            next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
            yield self.get_serp_request(next_url, page_number=page_number + 1)
        if page_number < self.args.max_pages:  # TODO: Add a test for this
            next_start = page_number * self._results_per_page
            if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
                next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
                yield self.get_serp_request(next_url, page_number=page_number + 1)
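        # Example of the arithmetic above (illustration, not from the PR):
        # with Google's 10 results per page and max_pages=3, parsing page 1
        # yields next_start=10, so page 2 is only requested while
        # totalOrganicResults > 10 and page_number < max_pages.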

        if self.args.item_type == SerpItemType.serp:
            yield serp
            return

        # TODO: Add a test for this
        for result in serp.organicResults:
            # scrapy-poet reads the "inject" meta key and provides the listed
            # item class to the callback through DynamicDeps.
            yield response.follow(
                # TODO: Why does result.url not work? Bug in zyte-common-items?
                result["url"],
                callback=self.parse_result,
                meta={
                    "crawling_logs": {"page_type": self.args.item_type.value},
                    "inject": [ITEM_TYPE_CLASSES[self.args.item_type]],
                },
            )

        yield serp

    def parse_result(
        self, response: DummyResponse, dynamic: DynamicDeps
    ) -> Iterable[Any]:
        yield next(iter(dynamic.values()))
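
For context, a minimal usage sketch (not part of the PR) showing how the new item_type parameter combines with the existing ones when running the template; it assumes a project already set up for scrapy-zyte-api:

from scrapy.crawler import CrawlerProcess

from zyte_spider_templates.spiders.serp import GoogleSearchSpider

# Assumption: real projects also need the scrapy-zyte-api handler settings;
# only the API key placeholder is shown here.
process = CrawlerProcess(settings={"ZYTE_API_KEY": "YOUR_API_KEY"})
process.crawl(
    GoogleSearchSpider,
    search_queries="wireless headphones",
    max_pages=2,
    item_type="product",  # follow result links and yield Product items
)
process.start()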