diff --git a/setup.py b/setup.py
index aabf0d0..a358315 100644
--- a/setup.py
+++ b/setup.py
@@ -12,13 +12,13 @@
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
-        "pydantic>=2",
+        "pydantic>=2.1",
         "requests>=0.10.1",
         "scrapy>=2.11.0",
-        "scrapy-poet>=0.21.0",
-        "scrapy-spider-metadata>=0.1.2",
-        "scrapy-zyte-api[provider]>=0.16.0",
-        "zyte-common-items>=0.22.0",
+        "scrapy-poet>=0.23.0",
+        "scrapy-spider-metadata>=0.2.0",
+        "scrapy-zyte-api[provider]>=0.23.0",
+        "zyte-common-items>=0.23.0",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py
index ae77049..23bb15a 100644
--- a/tests/test_ecommerce.py
+++ b/tests/test_ecommerce.py
@@ -5,7 +5,7 @@
 import requests
 import scrapy
 from pydantic import ValidationError
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import get_spider_metadata
 from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request
 
@@ -243,7 +243,7 @@ def test_parse_product(probability, has_item, item_drop, caplog):
     mock_crawler = MagicMock()
     spider.crawler = mock_crawler
     logging.getLogger().setLevel(logging.INFO)
-    items = list(spider.parse_product(response, product))
+    items = list(spider.parse_product(response, product, DynamicDeps()))
     if item_drop:
         assert mock_crawler.method_calls == [
             call.stats.inc_value("drop_item/product/low_probability")
@@ -463,7 +463,7 @@ def test_metadata():
                         "title": "Pagination Only",
                     },
                 },
-                "title": "Crawl Strategy",
+                "title": "Crawl strategy",
                 "enum": [
                     "automatic",
                     "full",
@@ -528,6 +528,42 @@ def test_metadata():
                "title": "Extraction source",
                "enum": ["httpResponseBody", "browserHtml"],
            },
+            "custom_attrs_input": {
+                "anyOf": [
+                    {
+                        "contentMediaType": "application/json",
+                        "contentSchema": {"type": "object"},
+                        "type": "string",
+                    },
+                    {"type": "null"},
+                ],
+                "default": None,
+                "description": "Custom attributes to extract.",
+                "title": "Custom attributes schema",
+                "widget": "custom-attrs",
+            },
+            "custom_attrs_method": {
+                "default": "generate",
+                "description": "Which model to use for custom attribute extraction.",
+                "enum": ["generate", "extract"],
+                "enumMeta": {
+                    "extract": {
+                        "description": "Use an extractive model (BERT). Supports only a "
+                        "subset of the schema (string, integer and "
+                        "number), suited for extraction of short and clear "
+                        "fields, with a fixed per-request cost.",
+                        "title": "extract",
+                    },
+                    "generate": {
+                        "description": "Use a generative model (LLM). The most powerful "
+                        "and versatile, but more expensive, with variable "
+                        "per-request cost.",
+                        "title": "generate",
+                    },
+                },
+                "title": "Custom attributes extraction method",
+                "type": "string",
+            },
        },
        "title": "EcommerceSpiderParams",
        "type": "object",
diff --git a/tox.ini b/tox.ini
index 4212e3e..19b8859 100644
--- a/tox.ini
+++ b/tox.ini
@@ -20,13 +20,13 @@ commands =
 basepython = python3.9
 deps =
     {[testenv]deps}
-    pydantic==2
+    pydantic==2.1
     requests==0.10.1
     scrapy==2.11.0
-    scrapy-poet==0.21.0
-    scrapy-spider-metadata==0.1.2
-    scrapy-zyte-api[provider]==0.16.0
-    zyte-common-items==0.22.0
+    scrapy-poet==0.23.0
+    scrapy-spider-metadata==0.2.0
+    scrapy-zyte-api[provider]==0.23.0
+    zyte-common-items==0.23.0
 
 [testenv:mypy]
 deps =
diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py
index f3190ab..688d7e8 100644
--- a/zyte_spider_templates/params.py
+++ b/zyte_spider_templates/params.py
@@ -2,10 +2,17 @@
 import re
 from enum import Enum
 from logging import getLogger
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import requests
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    Json,
+    field_validator,
+    model_validator,
+)
 
 try:
     from pydantic.config import JsonDict
@@ -34,6 +41,18 @@ class ExtractFrom(str, Enum):
     """Use browser rendering. Often provides the best quality."""
 
 
+@document_enum
+class CustomAttrsMethod(str, Enum):
+    generate: str = "generate"
+    """Use a generative model (LLM). The most powerful and versatile, but more
+    expensive, with variable per-request cost."""
+
+    extract: str = "extract"
+    """Use an extractive model (BERT). Supports only a subset of the schema (string,
+    integer and number), suited for extraction of short and clear fields, with a fixed
+    per-request cost."""
+
+
 class ExtractFromParam(BaseModel):
     extract_from: Optional[ExtractFrom] = Field(
         title="Extraction source",
@@ -304,3 +323,39 @@ def validate_location(
         return PostalAddress(**value)
 
     raise ValueError(f"{value!r} type {type(value)} is not a supported type")
+
+
+class CustomAttrsInputParam(BaseModel):
+    custom_attrs_input: Optional[Json[Dict[str, Any]]] = Field(
+        title="Custom attributes schema",
+        description="Custom attributes to extract.",
+        default=None,
+        json_schema_extra={
+            "widget": "custom-attrs",
+        },
+    )
+
+
+class CustomAttrsMethodParam(BaseModel):
+    custom_attrs_method: CustomAttrsMethod = Field(
+        title="Custom attributes extraction method",
+        description="Which model to use for custom attribute extraction.",
+        default=CustomAttrsMethod.generate,
+        json_schema_extra={
+            "enumMeta": {
+                CustomAttrsMethod.generate: {
+                    "title": "generate",
+                    "description": "Use a generative model (LLM). The most powerful "
+                    "and versatile, but more expensive, with variable "
+                    "per-request cost.",
+                },
+                CustomAttrsMethod.extract: {
+                    "title": "extract",
+                    "description": "Use an extractive model (BERT). Supports only a "
+                    "subset of the schema (string, integer and "
+                    "number), suited for extraction of short and clear "
+                    "fields, with a fixed per-request cost.",
+                },
+            },
+        },
+    )
diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py
index deb00ee..e6e78f4 100644
--- a/zyte_spider_templates/spiders/base.py
+++ b/zyte_spider_templates/spiders/base.py
@@ -1,10 +1,12 @@
 from importlib.metadata import version
-from typing import Any, Dict
+from typing import Annotated, Any, Dict
 from warnings import warn
 
 import scrapy
 from pydantic import BaseModel, ConfigDict, model_validator
 from scrapy.crawler import Crawler
+from scrapy_zyte_api import custom_attrs
+from zyte_common_items import CustomAttributes
 
 from ..params import (
     INPUT_GROUP,
@@ -63,6 +65,8 @@ class BaseSpider(scrapy.Spider):
 
     _NEXT_PAGE_PRIORITY: int = 100
 
+    _custom_attrs_dep = None
+
     @classmethod
     def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
         spider = super().from_crawler(crawler, *args, **kwargs)
@@ -86,4 +90,21 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
                 spider.args.max_requests,
                 priority=ARG_SETTING_PRIORITY,
             )
+
+        if custom_attrs_input := getattr(spider.args, "custom_attrs_input", None):
+            custom_attrs_options = {
+                "method": spider.args.custom_attrs_method,
+            }
+            if max_input_tokens := crawler.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"):
+                custom_attrs_options["maxInputTokens"] = max_input_tokens
+            if max_output_tokens := crawler.settings.getint(
+                "ZYTE_API_MAX_OUTPUT_TOKENS"
+            ):
+                custom_attrs_options["maxOutputTokens"] = max_output_tokens
+
+            spider._custom_attrs_dep = Annotated[
+                CustomAttributes,
+                custom_attrs(custom_attrs_input, custom_attrs_options),
+            ]
+
         return spider
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index 3868649..7f12148 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -2,12 +2,18 @@
 from typing import Any, Callable, Dict, Iterable, Optional, Union
 
 import scrapy
+from andi.typeutils import strip_annotated
 from pydantic import BaseModel, ConfigDict, Field
 from scrapy import Request
 from scrapy.crawler import Crawler
-from scrapy_poet import DummyResponse
+from scrapy_poet import DummyResponse, DynamicDeps
 from scrapy_spider_metadata import Args
-from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
+from zyte_common_items import (
+    CustomAttributes,
+    ProbabilityRequest,
+    Product,
+    ProductNavigation,
+)
 
 from zyte_spider_templates.heuristics import is_homepage
 from zyte_spider_templates.params import parse_input_params
@@ -20,6 +26,8 @@
 
 from ..documentation import document_enum
 from ..params import (
+    CustomAttrsInputParam,
+    CustomAttrsMethodParam,
     ExtractFromParam,
     GeolocationParam,
     MaxRequestsParam,
@@ -61,7 +69,7 @@ class EcommerceCrawlStrategy(str, Enum):
 
 class EcommerceCrawlStrategyParam(BaseModel):
     crawl_strategy: EcommerceCrawlStrategy = Field(
-        title="Crawl Strategy",
+        title="Crawl strategy",
         description="Determines how the start URL and follow-up URLs are crawled.",
         default=EcommerceCrawlStrategy.automatic,
         json_schema_extra={
@@ -110,6 +118,8 @@ class EcommerceCrawlStrategyParam(BaseModel):
 
 
 class EcommerceSpiderParams(
+    CustomAttrsMethodParam,
+    CustomAttrsInputParam,
     ExtractFromParam,
     MaxRequestsParam,
     GeolocationParam,
@@ -227,13 +237,23 @@ def parse_navigation(
             yield self.get_subcategory_request(request, page_params=page_params)
 
     def parse_product(
-        self, response: DummyResponse, product: Product
-    ) -> Iterable[Product]:
+        self, response: DummyResponse, product: Product, dynamic: DynamicDeps
+    ) -> Iterable[
+        Union[Product, Dict[str, Union[Product, Optional[CustomAttributes]]]]
+    ]:
         probability = product.get_probability()
 
         # TODO: convert to a configurable parameter later on after the launch
         if probability is None or probability >= 0.1:
-            yield product
+            if self.args.custom_attrs_input:
+                custom_attrs = None
+                for cls, value in dynamic.items():
+                    if strip_annotated(cls) is CustomAttributes:
+                        custom_attrs = value
+                        break
+                yield {"product": product, "customAttributes": custom_attrs}
+            else:
+                yield product
         else:
             self.crawler.stats.inc_value("drop_item/product/low_probability")
             self.logger.info(
@@ -319,17 +339,22 @@ def get_parse_product_request(
         priority = self.get_parse_product_request_priority(request)
 
         probability = request.get_probability()
 
+        meta = {
+            "crawling_logs": {
+                "name": request.name,
+                "probability": probability,
+                "page_type": "product",
+            },
+        }
+        if self._custom_attrs_dep:
+            meta["inject"] = [
+                self._custom_attrs_dep,
+            ]
         scrapy_request = request.to_scrapy(
             callback=callback,
             priority=priority,
-            meta={
-                "crawling_logs": {
-                    "name": request.name,
-                    "probability": probability,
-                    "page_type": "product",
-                }
-            },
+            meta=meta,
         )
         scrapy_request.meta["allow_offsite"] = True
         return scrapy_request
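Usage sketch (not part of the patch above): one way the two new spider arguments, custom_attrs_input and custom_attrs_method, might be passed when running the e-commerce template from a project that is already configured for scrapy-poet and scrapy-zyte-api. The start URL and the attribute schema below are made-up examples.

# Hypothetical usage sketch; assumes a Scrapy project already set up with
# zyte-spider-templates, scrapy-poet and scrapy-zyte-api (API key configured).
import json

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

# Example schema for custom_attrs_input. It is passed as a JSON string because
# the parameter is declared as Optional[Json[Dict[str, Any]]] in params.py.
custom_attrs_schema = {
    "type": "object",
    "properties": {
        "brand": {"type": "string"},
        "color": {"type": "string"},
    },
}

process = CrawlerProcess(get_project_settings())
process.crawl(
    EcommerceSpider,
    url="https://example.com",  # placeholder start URL
    custom_attrs_input=json.dumps(custom_attrs_schema),
    custom_attrs_method="extract",  # or "generate" (the default)
)
process.start()

With custom_attrs_input set, parse_product yields {"product": ..., "customAttributes": ...} dictionaries instead of bare Product items, as shown in the ecommerce.py hunk above.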