From 727c506263715f341a64a1e50246842fd46c622c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 6 Nov 2024 14:28:30 +0100
Subject: [PATCH 01/11] WIP

---
 zyte_spider_templates/spiders/serp.py | 73 ++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index dea6922..37b7e4e 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -1,12 +1,15 @@
+from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional, Union
 
 from pydantic import BaseModel, Field, field_validator
 from scrapy import Request
 from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
 from w3lib.url import add_or_replace_parameter
-from zyte_common_items import Serp
+from zyte_common_items import Product, Serp
 
+from ..documentation import document_enum
 from ..params import MaxRequestsParam
 from ._google_domains import GoogleDomain
 from .base import BaseSpider
@@ -48,6 +51,55 @@ class SerpMaxPagesParam(BaseModel):
     )
 
 
+@document_enum
+class SerpItemType(str, Enum):
+    serp: str = "serp"
+    """
+    Yield the data of result pages, do not follow result links.
+    """
+
+    product: str = "product"
+    """
+    Follow result links and yield product details data from them.
+    """
+
+    # TODO: extend with additional item types.
+
+
+# NOTE: serp is excluded on purposed, since it is not used below.
+# TODO: Add a test to make sure that this is in sync with the enum class above.
+ITEM_TYPE_CLASSES = {
+    SerpItemType.product: Product,
+}
+
+
+class SerpItemTypeParam(BaseModel):
+    item_type: SerpItemType = Field(
+        title="Item type",
+        description="Data type of the output items.",
+        default=SerpItemType.serp,
+        json_schema_extra={
+            "enumMeta": {
+                # TODO: Add a test to make sure this is in sync with the enum class above.
+                # TODO: Try automating the generation of this metadata from the enum type above.
+                SerpItemType.serp: {
+                    "title": "serp",
+                    "description": (
+                        "Yield the data of result pages, do not follow result " "links."
+                    ),
+                },
+                SerpItemType.product: {
+                    "title": "product",
+                    "description": (
+                        "Follow result links and yield product details data "
+                        "from them."
+                    ),
+                },
+            },
+        },
+    )
+
+
 class GoogleDomainParam(BaseModel):
     domain: GoogleDomain = Field(
         title="Domain",
@@ -131,4 +183,21 @@ def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
             next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
             yield self.get_serp_request(next_url, page_number=page_number + 1)
 
-        yield serp
+        if self.args.item_type == SerpItemType.serp:
+            yield serp
+            return
+
+        for result in serp.organicResults:
+            yield response.follow(
+                result.url,
+                callback=self.parse_result,
+                meta={
+                    "crawling_logs": {"page_type": self.args.item_type.value},
+                    "inject": [ITEM_TYPE_CLASSES[self.args.item_type]],
+                },
+            )
+
+    def parse_result(
+        self, response: DummyResponse, dynamic: DynamicDeps
+    ) -> Iterable[Any]:
+        yield next(iter(dynamic.values()))

From 9525ca6e9e176478bd04a52ba3c8d5487d6ee28f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 6 Nov 2024 14:32:21 +0100
Subject: [PATCH 02/11] WIP

---
 zyte_spider_templates/spiders/serp.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index 37b7e4e..682547b 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -110,6 +110,7 @@ class GoogleDomainParam(BaseModel):
 
 class GoogleSearchSpiderParams(
     MaxRequestsParam,
+    SerpItemTypeParam,
     SerpMaxPagesParam,
     SearchQueriesParam,
     GoogleDomainParam,

From 9a1f471eb8e65ae3d417f8ecbe5de52ca0e4820b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 6 Nov 2024 14:41:06 +0100
Subject: [PATCH 03/11] WIP

---
 zyte_spider_templates/spiders/serp.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index 682547b..c9f1c00 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -51,6 +51,8 @@ class SerpMaxPagesParam(BaseModel):
     )
 
 
+# TODO: Make sure this is covered in the docs the same way as the e-commerce
+# crawl strategy.
 @document_enum
 class SerpItemType(str, Enum):
     serp: str = "serp"
@@ -179,18 +181,22 @@ def start_requests(self) -> Iterable[Request]:
 
     def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
         serp = Serp.from_dict(response.raw_api_response["serp"])
 
-        next_start = page_number * self._results_per_page
-        if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
-            next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
-            yield self.get_serp_request(next_url, page_number=page_number + 1)
+        if page_number < self.args.max_pages:  # TODO: Add a test for this
+            next_start = page_number * self._results_per_page
+            if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
+                next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
+                yield self.get_serp_request(next_url, page_number=page_number + 1)
 
         if self.args.item_type == SerpItemType.serp:
            yield serp
            return
 
+        # TODO: Add a test for this
         for result in serp.organicResults:
             yield response.follow(
-                result.url,
+                result[
+                    "url"
+                ],  # TODO: Why does result.url not work? Bug in zyte-common-items?
                 callback=self.parse_result,

From 6507f12329369038e67a51e3ab3359754b765f27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 6 Nov 2024 15:10:26 +0100
Subject: [PATCH 04/11] WIP

---
 docs/reference/index.rst              |  5 +++
 tests/test_serp.py                    | 23 ++++++++++++-
 zyte_spider_templates/spiders/serp.py | 49 ++++++++++++++++++++++-----
 3 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index dd368dd..d0c3c05 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -44,5 +44,10 @@ Parameter mixins
 
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
 
+.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
+   :exclude-members: model_computed_fields
+
+.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType
+
 .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
    :exclude-members: model_computed_fields
diff --git a/tests/test_serp.py b/tests/test_serp.py
index 92b19d2..dd30a27 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -5,7 +5,11 @@
 from scrapy_zyte_api.responses import ZyteAPITextResponse
 from w3lib.url import add_or_replace_parameter
 
-from zyte_spider_templates.spiders.serp import GoogleSearchSpider
+from zyte_spider_templates.spiders.serp import (
+    ITEM_TYPE_CLASSES,
+    GoogleSearchSpider,
+    SerpItemType,
+)
 
 from . import get_crawler
 from .utils import assertEqualSpiderMetadata
@@ -445,3 +449,20 @@ def test_parse_serp():
     # The page_number parameter is required.
     with pytest.raises(TypeError):
         spider.parse_serp(response)
+
+
+def test_item_type_mappings():
+    # Ensure that all SerpItemType keys and values match.
+    for entry in SerpItemType:
+        assert entry.name == entry.value
+
+    # Ensure that the ITEM_TYPE_CLASSES dict maps all values from the
+    # corresponding enum except for serp.
+    actual_keys = set(ITEM_TYPE_CLASSES)
+    expected_keys = set(
+        entry.value for entry in SerpItemType if entry != SerpItemType.serp
+    )
+    assert actual_keys == expected_keys
+
+    # Also ensure that no dict value is repeated.
+    assert len(actual_keys) == len(set(ITEM_TYPE_CLASSES.values()))
diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index c9f1c00..7f48b8f 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -7,7 +7,14 @@
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
 from w3lib.url import add_or_replace_parameter
-from zyte_common_items import Product, Serp
+from zyte_common_items import (  # TODO: Add ForumThread to zyte-common-items; ForumThread,
+    Article,
+    ArticleList,
+    JobPosting,
+    Product,
+    ProductList,
+    Serp,
+)
 
 from ..documentation import document_enum
 from ..params import MaxRequestsParam
@@ -51,27 +58,53 @@ class SerpMaxPagesParam(BaseModel):
     )
 
 
-# TODO: Make sure this is covered in the docs the same way as the e-commerce
-# crawl strategy.
 @document_enum
 class SerpItemType(str, Enum):
-    serp: str = "serp"
+    article: str = "article"
+    """
+    Article data from result URLs.
+    """
+
+    articleList: str = "articleList"
+    """
+    Article list data from result URLs.
+    """
+
+    # forumThread: str = "forumThread"
     """
-    Yield the data of result pages, do not follow result links.
+    Thread data from result URLs.
     """
 
+    jobPosting: str = "jobPosting"
+    """
+    Job posting data from result URLs.
+    """
""" product: str = "product" """ - Follow result links and yield product details data from them. + Product data from result URLs. + """ + + productList: str = "productList" + """ + Product list data from result URLs. """ - # TODO: extend with additional item types. + serp: str = "serp" + """ + Search engine results page data. + """ # NOTE: serp is excluded on purposed, since it is not used below. # TODO: Add a test to make sure that this is in sync with the enum class above. ITEM_TYPE_CLASSES = { + SerpItemType.article: Article, + SerpItemType.articleList: ArticleList, + # SerpItemType.forumThread: ForumThread, + SerpItemType.jobPosting: JobPosting, SerpItemType.product: Product, + SerpItemType.productList: ProductList, } @@ -112,7 +145,7 @@ class GoogleDomainParam(BaseModel): class GoogleSearchSpiderParams( MaxRequestsParam, - SerpItemTypeParam, + SerpItemTypeParam, # TODO: Update the test_metadata expectations SerpMaxPagesParam, SearchQueriesParam, GoogleDomainParam, From 2e9bebab396119dbb795db130cfb9ad3b242cc17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 11 Nov 2024 18:11:51 +0100 Subject: [PATCH 05/11] WIP --- setup.py | 2 +- tests/test_serp.py | 23 +++++++++-- tox.ini | 2 +- zyte_spider_templates/spiders/serp.py | 59 +++++++++------------------ 4 files changed, 41 insertions(+), 45 deletions(-) diff --git a/setup.py b/setup.py index 76788ff..b5597a1 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ "scrapy-poet>=0.24.0", "scrapy-spider-metadata>=0.2.0", "scrapy-zyte-api[provider]>=0.23.0", - "zyte-common-items>=0.23.0", + "zyte-common-items>=0.26.0", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/test_serp.py b/tests/test_serp.py index dd30a27..b2623dd 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -263,6 +263,25 @@ def test_metadata(): "title": "Max Pages", "type": "integer", }, + "item_type": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "default": None, + "description": ( + "If specified, result URLs are followed to extract " + "the specified item type. Spider output items will be " + "of the specified item type, not search engine " + "results page items." + ), + "enum": [ + "article", + "articleList", + "forumThread", + "jobPosting", + "product", + "productList", + ], + "title": "Item type", + }, "max_requests": { "anyOf": [{"type": "integer"}, {"type": "null"}], "default": 100, @@ -459,9 +478,7 @@ def test_item_type_mappings(): # Ensure that the ITEM_TYPE_CLASSES dict maps all values from the # corresponding enum except for serp. actual_keys = set(ITEM_TYPE_CLASSES) - expected_keys = set( - entry.value for entry in SerpItemType if entry != SerpItemType.serp - ) + expected_keys = set(entry.value for entry in SerpItemType) assert actual_keys == expected_keys # Also ensure that no dict value is repeated. 
diff --git a/tox.ini b/tox.ini
index 3fa9108..55de9d8 100644
--- a/tox.ini
+++ b/tox.ini
@@ -26,7 +26,7 @@ deps =
     scrapy-poet==0.24.0
     scrapy-spider-metadata==0.2.0
     scrapy-zyte-api[provider]==0.23.0
-    zyte-common-items==0.23.0
+    zyte-common-items==0.26.0
 
 [testenv:mypy]
 deps =
diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index 7f48b8f..d61a818 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -7,9 +7,10 @@
 from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
 from w3lib.url import add_or_replace_parameter
-from zyte_common_items import (  # TODO: Add ForumThread to zyte-common-items; ForumThread,
+from zyte_common_items import (
     Article,
     ArticleList,
+    ForumThread,
     JobPosting,
     Product,
     ProductList,
@@ -62,46 +63,39 @@ class SerpMaxPagesParam(BaseModel):
 class SerpItemType(str, Enum):
     article: str = "article"
     """
-    Article data from result URLs.
+    Article data.
     """
 
     articleList: str = "articleList"
     """
-    Article list data from result URLs.
+    Article list data.
     """
 
-    # forumThread: str = "forumThread"
+    forumThread: str = "forumThread"
     """
-    Thread data from result URLs.
+    Forum thread data.
     """
 
     jobPosting: str = "jobPosting"
     """
-    Job posting data from result URLs.
+    Job posting data.
     """
 
     product: str = "product"
     """
-    Product data from result URLs.
+    Product data.
     """
 
     productList: str = "productList"
     """
-    Product list data from result URLs.
+    Product list data.
     """
 
-    serp: str = "serp"
-    """
-    Search engine results page data.
-    """
-
-# NOTE: serp is excluded on purposed, since it is not used below.
-# TODO: Add a test to make sure that this is in sync with the enum class above.
 ITEM_TYPE_CLASSES = {
     SerpItemType.article: Article,
     SerpItemType.articleList: ArticleList,
-    # SerpItemType.forumThread: ForumThread,
+    SerpItemType.forumThread: ForumThread,
     SerpItemType.jobPosting: JobPosting,
     SerpItemType.product: Product,
     SerpItemType.productList: ProductList,
 }
@@ -108,32 +102,17 @@ class SerpItemType(str, Enum):
 
 
 class SerpItemTypeParam(BaseModel):
-    item_type: SerpItemType = Field(
+    item_type: Optional[SerpItemType] = Field(
         title="Item type",
-        description="Data type of the output items.",
-        default=SerpItemType.serp,
-        json_schema_extra={
-            "enumMeta": {
-                # TODO: Add a test to make sure this is in sync with the enum class above.
-                # TODO: Try automating the generation of this metadata from the enum type above.
-                SerpItemType.serp: {
-                    "title": "serp",
-                    "description": (
-                        "Yield the data of result pages, do not follow result " "links."
-                    ),
-                },
-                SerpItemType.product: {
-                    "title": "product",
-                    "description": (
-                        "Follow result links and yield product details data "
-                        "from them."
-                    ),
-                },
-            },
-        },
+        description=(
+            "If specified, result URLs are followed to extract the specified "
+            "item type. Spider output items will be of the specified item "
+            "type, not search engine results page items."
+        ),
+        default=None,
     )
 
 
 class GoogleDomainParam(BaseModel):
     domain: GoogleDomain = Field(
         title="Domain",
@@ -145,7 +124,7 @@ class GoogleDomainParam(BaseModel):
 
 class GoogleSearchSpiderParams(
     MaxRequestsParam,
-    SerpItemTypeParam,  # TODO: Update the test_metadata expectations
+    SerpItemTypeParam,
     SerpMaxPagesParam,
     SearchQueriesParam,
     GoogleDomainParam,
@@ -220,7 +199,7 @@ def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
             next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
             yield self.get_serp_request(next_url, page_number=page_number + 1)
 
-        if self.args.item_type == SerpItemType.serp:
+        if self.args.item_type is None:
             yield serp
             return

From cf97bcbe48afde2f7d9f8a56bbf9a43f54248f06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3%A1n=20Chaves?=
Date: Tue, 12 Nov 2024 10:36:45 +0100
Subject: [PATCH 06/11] Update test expectations after fixing max_pages

---
 tests/test_serp.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/tests/test_serp.py b/tests/test_serp.py
index b2623dd..8c48f08 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -342,7 +342,9 @@ def test_search_queries():
 
 def test_pagination():
     crawler = get_crawler()
-    spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=3
+    )
 
     def run_parse_serp(total_results, page=1):
         url = "https://www.google.com/search?q=foo+bar"
@@ -411,6 +413,14 @@ def run_parse_serp(total_results, page=1):
     assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20"
     assert requests[0].cb_kwargs["page_number"] == 3
 
+    # Do not go over max_pages
+    items, requests = run_parse_serp(
+        total_results=31,
+        page=3,
+    )
+    assert len(items) == 1
+    assert len(requests) == 0
+
 
 def test_get_serp_request():
     crawler = get_crawler()
@@ -427,7 +437,9 @@ def test_get_serp_request():
 
 def test_parse_serp():
     crawler = get_crawler()
-    spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar")
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=43
+    )
     url = "https://www.google.com/search?q=foo+bar"
     response = ZyteAPITextResponse.from_api_response(
         api_response={

From d7a7cb6cc7bde19d3c441cc9a02dd01b9b50cb27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3%A1n=20Chaves?=
Date: Tue, 12 Nov 2024 10:45:04 +0100
Subject: [PATCH 07/11] Remove obsolete TODO

---
 zyte_spider_templates/spiders/serp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index d61a818..5384418 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -193,7 +193,7 @@ def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
         serp = Serp.from_dict(response.raw_api_response["serp"])
 
-        if page_number < self.args.max_pages:  # TODO: Add a test for this
+        if page_number < self.args.max_pages:
             next_start = page_number * self._results_per_page
             if serp.organicResults and serp.metadata.totalOrganicResults > next_start:
                 next_url = add_or_replace_parameter(serp.url, "start", str(next_start))

From 7e6d6f195f3b49319660e97f7bf3a0a9b033a379 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3%A1n=20Chaves?=
Date: Tue, 12 Nov 2024 11:26:11 +0100
Subject: [PATCH 08/11] Solve remaining to-do items

---
 setup.py                              |  2 +-
 tests/test_serp.py                    | 53 +++++++++++++++++++++++++++
 tox.ini                               |  2 +-
 zyte_spider_templates/spiders/serp.py |  5 +----
 4 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index b5597a1..4e02cb1 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
         "scrapy-poet>=0.24.0",
         "scrapy-spider-metadata>=0.2.0",
         "scrapy-zyte-api[provider]>=0.23.0",
-        "zyte-common-items>=0.26.0",
+        "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@fix-result-inheritance",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/tests/test_serp.py b/tests/test_serp.py
index 8c48f08..0a9fad1 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -4,6 +4,7 @@
 from scrapy_spider_metadata import get_spider_metadata
 from scrapy_zyte_api.responses import ZyteAPITextResponse
 from w3lib.url import add_or_replace_parameter
+from zyte_common_items import Product
 
 from zyte_spider_templates.spiders.serp import (
     ITEM_TYPE_CLASSES,
@@ -482,6 +483,58 @@ def test_parse_serp():
     # The page_number parameter is required.
     with pytest.raises(TypeError):
         spider.parse_serp(response)
+
+
+def test_item_type():
+    crawler = get_crawler()
+    spider = GoogleSearchSpider.from_crawler(
+        crawler, search_queries="foo bar", max_pages=43, item_type="product"
+    )
+    url = "https://www.google.com/search?q=foo+bar"
+    response = ZyteAPITextResponse.from_api_response(
+        api_response={
+            "serp": {
+                "organicResults": [
+                    {
+                        "description": "…",
+                        "name": "…",
+                        "url": f"https://example.com/{rank}",
+                        "rank": rank,
+                    }
+                    for rank in range(1, 11)
+                ],
+                "metadata": {
+                    "dateDownloaded": "2024-10-25T08:59:45Z",
+                    "displayedQuery": "foo bar",
+                    "searchedQuery": "foo bar",
+                    "totalOrganicResults": 99999,
+                },
+                "pageNumber": 1,
+                "url": url,
+            },
+            "url": url,
+        },
+    )
+    items = []
+    requests = []
+    for item_or_request in spider.parse_serp(response, page_number=42):
+        if isinstance(item_or_request, Request):
+            requests.append(item_or_request)
+        else:
+            items.append(item_or_request)
+    assert len(items) == 0
+    assert len(requests) == 11
+
+    assert requests[0].url == add_or_replace_parameter(url, "start", "420")
+    assert requests[0].cb_kwargs["page_number"] == 43
+
+    for rank in range(1, 11):
+        assert requests[rank].url == f"https://example.com/{rank}"
+        assert requests[rank].callback == spider.parse_result
+        assert requests[rank].meta == {
+            "crawling_logs": {"page_type": "product"},
+            "inject": [Product],
+        }
 
 
 def test_item_type_mappings():
     # Ensure that all SerpItemType keys and values match.
     for entry in SerpItemType:
diff --git a/tox.ini b/tox.ini
index 55de9d8..3f0b756 100644
--- a/tox.ini
+++ b/tox.ini
@@ -26,7 +26,7 @@ deps =
     scrapy-poet==0.24.0
     scrapy-spider-metadata==0.2.0
     scrapy-zyte-api[provider]==0.23.0
-    zyte-common-items==0.26.0
+    zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@fix-result-inheritance
 
 [testenv:mypy]
 deps =
diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index 5384418..51fd3af 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -203,12 +203,9 @@ def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
             yield serp
             return
 
-        # TODO: Add a test for this
         for result in serp.organicResults:
             yield response.follow(
-                result[
-                    "url"
-                ],  # TODO: Why does result.url not work? Bug in zyte-common-items?
+                result.url,
                 callback=self.parse_result,
                 meta={
                     "crawling_logs": {"page_type": self.args.item_type.value},

From 5cd60724457b0a9ac935236da5545b2d01fe700beec3da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3%A1n=20Chaves?=
Date: Tue, 12 Nov 2024 13:14:48 +0100
Subject: =?UTF-8?q?zyte-common-items=20=E2=89=A5=200.26.2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 setup.py | 2 +-
 tox.ini  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 4e02cb1..5940a73 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
         "scrapy-poet>=0.24.0",
         "scrapy-spider-metadata>=0.2.0",
         "scrapy-zyte-api[provider]>=0.23.0",
-        "zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@fix-result-inheritance",
+        "zyte-common-items>=0.26.2",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/tox.ini b/tox.ini
index 3f0b756..7e77c06 100644
--- a/tox.ini
+++ b/tox.ini
@@ -26,7 +26,7 @@ deps =
     scrapy-poet==0.24.0
     scrapy-spider-metadata==0.2.0
     scrapy-zyte-api[provider]==0.23.0
-    zyte-common-items @ git+https://github.com/Gallaecio/zyte-common-items.git@fix-result-inheritance
+    zyte-common-items==0.26.2
 
 [testenv:mypy]
 deps =

From 93599814be488cd5ce94d3e2e816d90412869d91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3%A1n=20Chaves?=
Date: Wed, 13 Nov 2024 08:02:42 +0100
Subject: =?UTF-8?q?item=5Ftype.title:=20Item=20type=20?=
 =?UTF-8?q?=E2=86=92=20Follow=20and=20Extract?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_serp.py                    | 2 +-
 zyte_spider_templates/spiders/serp.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_serp.py b/tests/test_serp.py
index 0a9fad1..29c9cb2 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -281,7 +281,7 @@ def test_metadata():
                     "product",
                     "productList",
                 ],
-                "title": "Item type",
+                "title": "Follow and Extract",
             },
             "max_requests": {
                 "anyOf": [{"type": "integer"}, {"type": "null"}],
diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index 51fd3af..8d4232f 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -104,7 +104,7 @@ class SerpItemType(str, Enum):
 
 class SerpItemTypeParam(BaseModel):
     item_type: Optional[SerpItemType] = Field(
-        title="Item type",
+        title="Follow and Extract",
         description=(
             "If specified, result URLs are followed to extract the specified "
             "item type. Spider output items will be of the specified item "

From 953091b00615bea012a5545b2d01fe700beec3da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3%A1n=20Chaves?=
Date: Wed, 13 Nov 2024 08:04:58 +0100
Subject: [PATCH 11/11] Improve item_type.description

---
 tests/test_serp.py                    | 8 ++++----
 zyte_spider_templates/spiders/serp.py | 7 ++++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/test_serp.py b/tests/test_serp.py
index 29c9cb2..9c59cd7 100644
--- a/tests/test_serp.py
+++ b/tests/test_serp.py
@@ -268,10 +268,10 @@ def test_metadata():
                 "anyOf": [{"type": "string"}, {"type": "null"}],
                 "default": None,
                 "description": (
-                    "If specified, result URLs are followed to extract "
-                    "the specified item type. Spider output items will be "
-                    "of the specified item type, not search engine "
-                    "results page items."
+                    "If specified, follow organic search result links, "
+                    "and extract the selected data type from the target "
+                    "pages. Spider output items will be of the specified "
+                    "data type, not search engine results page items."
                 ),
                 "enum": [
                     "article",
diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py
index 8d4232f..d69856e 100644
--- a/zyte_spider_templates/spiders/serp.py
+++ b/zyte_spider_templates/spiders/serp.py
@@ -106,9 +106,10 @@ class SerpItemTypeParam(BaseModel):
     item_type: Optional[SerpItemType] = Field(
         title="Follow and Extract",
         description=(
-            "If specified, result URLs are followed to extract the specified "
-            "item type. Spider output items will be of the specified item "
-            "type, not search engine results page items."
+            "If specified, follow organic search result links, and extract "
+            "the selected data type from the target pages. Spider output "
+            "items will be of the specified data type, not search engine "
+            "results page items."
         ),
         default=None,
     )
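
---

Reviewer note, not part of the patch series: a quick sketch of how the final behavior of this series could be exercised. The spider name "google_search" and the use of Scrapy's -a flag to pass spider arguments are assumptions based on how other zyte-spider-templates spiders are run; the patches themselves do not define the invocation.

    # Follow organic result links and extract Product items from them.
    # Omitting item_type keeps the previous behavior of yielding Serp items.
    scrapy crawl google_search -a search_queries="foo bar" -a max_pages=2 -a item_type=product

With item_type unset, parse_serp yields Serp items as before; with it set, result URLs are followed and parse_result yields whichever item type scrapy-poet injects per ITEM_TYPE_CLASSES.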