diff --git a/.bumpversion.cfg b/.bumpversion.cfg index fbcf2ac..8b1c908 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.9.0 +current_version = 0.10.0 commit = True tag = True tag_name = {new_version} diff --git a/CHANGES.rst b/CHANGES.rst index 019e11c..50ff9d1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,47 @@ Changes ======= +0.10.0 (2024-11-22) +------------------- + +* Dropped Python 3.8 support, added Python 3.13 support. + +* Increased the minimum required versions of some dependencies: + + * ``pydantic``: ``2`` → ``2.1`` + + * ``scrapy-poet``: ``0.21.0`` → ``0.24.0`` + + * ``scrapy-spider-metadata``: ``0.1.2`` → ``0.2.0`` + + * ``scrapy-zyte-api[provider]``: ``0.16.0`` → ``0.23.0`` + + * ``zyte-common-items``: ``0.22.0`` → ``0.23.0`` + +* Added :ref:`custom attributes ` support to the + :ref:`e-commerce spider template ` through its new + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_input` + and + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.custom_attrs_method` + parameters. + +* The + :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams.max_pages` + parameter of the :ref:`Google Search spider template ` can no + longer be 0 or lower. + +* The :ref:`Google Search spider template ` now follows + pagination for the results of each query page by page, instead of sending a + request for every page in parallel. It stops once it reaches a page without + organic results. + +* Improved the description of + :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy` + values. + +* Fixed type hint issues related to Scrapy. + + 0.9.0 (2024-09-17) ------------------ diff --git a/docs/conf.py b/docs/conf.py index 406490e..0d0a14e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -4,7 +4,7 @@ project = "zyte-spider-templates" copyright = "2023, Zyte Group Ltd" author = "Zyte Group Ltd" -release = "0.9.0" +release = "0.10.0" sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext extensions = [ @@ -22,6 +22,14 @@ html_theme = "sphinx_rtd_theme" intersphinx_mapping = { + "form2request": ( + "https://form2request.readthedocs.io/en/latest", + None, + ), + "formasaurus": ( + "https://formasaurus.readthedocs.io/en/latest", + None, + ), "python": ( "https://docs.python.org/3", None, @@ -46,6 +54,10 @@ "https://web-poet.readthedocs.io/en/stable", None, ), + "zyte": ( + "https://docs.zyte.com", + None, + ), "zyte-common-items": ( "https://zyte-common-items.readthedocs.io/en/latest", None, @@ -57,6 +69,7 @@ autodoc_pydantic_model_show_json = False autodoc_pydantic_model_show_validator_members = False autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_field_list_validators = False # sphinx-reredirects redirects = { diff --git a/docs/customization/pages.rst b/docs/customization/pages.rst index f373788..46da4c9 100644 --- a/docs/customization/pages.rst +++ b/docs/customization/pages.rst @@ -6,7 +6,8 @@ Customizing page objects All parsing is implemented using :ref:`web-poet page objects ` that use `Zyte API automatic extraction`_ to extract :ref:`standard items -`, both for navigation and for item details. +`: for navigation, for item details, and even for :ref:`search +request generation `. .. 
_Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html @@ -141,3 +142,27 @@ To extract a new field for one or more websites: def parse_product(self, response: DummyResponse, product: CustomProduct): yield from super().parse_product(response, product) + +.. _fix-search: + +Fixing search support +===================== + +If the default implementation to build a request out of :ref:`search queries +` does not work on a given website, you can implement your +own search request page object to fix that. See +:ref:`custom-request-template-page`. + +For example: + +.. code-block:: python + + from web_poet import handle_urls + from zyte_common_items import BaseSearchRequestTemplatePage + + + @handle_urls("example.com") + class ExampleComSearchRequestTemplatePage(BaseSearchRequestTemplatePage): + @field + def url(self): + return "https://example.com/search?q={{ query|quote_plus }}" diff --git a/docs/features/search.rst b/docs/features/search.rst new file mode 100644 index 0000000..8dec02a --- /dev/null +++ b/docs/features/search.rst @@ -0,0 +1,43 @@ +.. _search-queries: + +============== +Search queries +============== + +The :ref:`e-commerce spider template ` supports a spider argument, +:data:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.search_queries`, +that allows you to define a different search query per line, and +turns the input URLs into search requests for those queries. + +For example, given the following input URLs: + +.. code-block:: none + + https://a.example + https://b.example + +And the following list of search queries: + +.. code-block:: none + + foo bar + baz + +By default, the spider would send 2 initial requests to those 2 input URLs, +to try and find out how to build a search request for them, and if it succeeds, +it will then send 4 search requests, 1 per combination of input URL and search +query. For example: + +.. code-block:: none + + https://a.example/search?q=foo+bar + https://a.example/search?q=baz + https://b.example/s/foo%20bar + https://b.example/s/baz + +The default implementation uses a combination of HTML metadata, AI-based HTML +form inspection and heuristics to find the most likely way to build a search +request for a given website. + +If this default implementation does not work as expected on a given website, +you can :ref:`write a page object to fix that `. diff --git a/docs/index.rst b/docs/index.rst index 1083299..dd568ea 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,6 +20,12 @@ zyte-spider-templates documentation E-commerce Google search +.. toctree:: + :caption: Features + :hidden: + + Search queries + .. toctree:: :caption: Customization :hidden: diff --git a/docs/reference/index.rst b/docs/reference/index.rst index dd368dd..a7862f1 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -23,6 +23,14 @@ Pages Parameter mixins ================ +.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsInputParam + :exclude-members: model_computed_fields + +.. autopydantic_model:: zyte_spider_templates.params.CustomAttrsMethodParam + :exclude-members: model_computed_fields + +.. autoenum:: zyte_spider_templates.params.CustomAttrsMethod + .. autopydantic_model:: zyte_spider_templates.params.ExtractFromParam :exclude-members: model_computed_fields @@ -44,5 +52,10 @@ Parameter mixins .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy +.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam + :exclude-members: model_computed_fields + +.. 
autoenum:: zyte_spider_templates.spiders.serp.SerpItemType + .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam :exclude-members: model_computed_fields diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..1152570 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore:deprecated string literal syntax::jmespath.lexer diff --git a/setup.py b/setup.py index 76788ff..869b1f5 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="zyte-spider-templates", - version="0.9.0", + version="0.10.0", description="Spider templates for automatic crawlers.", long_description=open("README.rst").read(), long_description_content_type="text/x-rst", @@ -12,13 +12,18 @@ packages=find_packages(), include_package_data=True, install_requires=[ + "extruct>=0.18.0", + "form2request>=0.2.0", + "formasaurus>=0.10.0", + "jmespath>=0.9.5", "pydantic>=2.1", - "requests>=0.10.1", + "requests>=1.0.0", "scrapy>=2.11.0", "scrapy-poet>=0.24.0", "scrapy-spider-metadata>=0.2.0", "scrapy-zyte-api[provider]>=0.23.0", - "zyte-common-items>=0.23.0", + "web-poet>=0.17.1", + "zyte-common-items>=0.26.2", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 69d9466..3dca339 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -4,40 +4,29 @@ import pytest import requests import scrapy -from pydantic import ValidationError from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import get_spider_metadata -from zyte_common_items import ProbabilityRequest, Product, ProductNavigation, Request +from web_poet.page_inputs.browser import BrowserResponse +from zyte_common_items import ( + ProbabilityRequest, + Product, + ProductNavigation, + SearchRequestTemplate, + SearchRequestTemplateMetadata, +) from zyte_spider_templates._geolocations import ( GEOLOCATION_OPTIONS, GEOLOCATION_OPTIONS_WITH_CODE, Geolocation, ) -from zyte_spider_templates.spiders.ecommerce import ( - EcommerceCrawlStrategy, - EcommerceSpider, -) +from zyte_spider_templates.spiders.ecommerce import EcommerceSpider from . import get_crawler from .test_utils import URL_TO_DOMAIN from .utils import assertEqualSpiderMetadata -def test_parameters(): - with pytest.raises(ValidationError): - EcommerceSpider() - - EcommerceSpider(url="https://example.com") - EcommerceSpider( - url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.automatic - ) - EcommerceSpider(url="https://example.com", crawl_strategy="automatic") - - with pytest.raises(ValidationError): - EcommerceSpider(url="https://example.com", crawl_strategy="unknown") - - def test_start_requests(): url = "https://example.com" crawler = get_crawler() @@ -258,106 +247,31 @@ def test_parse_product(probability, has_item, item_drop, caplog): assert str(product) in caplog.text -def test_arguments(): - # Ensure passing no arguments works. +@pytest.mark.parametrize( + ("probability", "yields_items"), + ( + (None, True), # Default + (-1.0, False), + (0.0, False), # page.no_item_found() + (1.0, True), + ), +) +def test_parse_search_request_template_probability(probability, yields_items): crawler = get_crawler() - - # Needed since it's a required argument. 
- base_kwargs = {"url": "https://example.com"} - - EcommerceSpider.from_crawler(crawler, **base_kwargs) - - for param, arg, setting, old_setting_value, getter_name, new_setting_value in ( - ("max_requests", "123", "ZYTE_API_MAX_REQUESTS", None, "getint", 123), - ( - "geolocation", - "DE", - "ZYTE_API_AUTOMAP_PARAMS", - None, - "getdict", - {"geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_AUTOMAP_PARAMS", - '{"browserHtml": true}', - "getdict", - {"browserHtml": True, "geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_AUTOMAP_PARAMS", - '{"geolocation": "IE"}', - "getdict", - {"geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_PROVIDER_PARAMS", - None, - "getdict", - {"geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_PROVIDER_PARAMS", - '{"browserHtml": true}', - "getdict", - {"browserHtml": True, "geolocation": "DE"}, - ), - ( - "geolocation", - "DE", - "ZYTE_API_PROVIDER_PARAMS", - '{"geolocation": "IE"}', - "getdict", - {"geolocation": "DE"}, - ), - ( - "extract_from", - "browserHtml", - "ZYTE_API_PROVIDER_PARAMS", - None, - "getdict", - { - "productOptions": {"extractFrom": "browserHtml"}, - "productNavigationOptions": {"extractFrom": "browserHtml"}, - }, - ), - ( - "extract_from", - "httpResponseBody", - "ZYTE_API_PROVIDER_PARAMS", - {"geolocation": "US"}, - "getdict", - { - "productOptions": {"extractFrom": "httpResponseBody"}, - "productNavigationOptions": {"extractFrom": "httpResponseBody"}, - "geolocation": "US", - }, - ), - ( - "extract_from", - None, - "ZYTE_API_PROVIDER_PARAMS", - {"geolocation": "US"}, - "getdict", - {"geolocation": "US"}, - ), - ): - kwargs = {param: arg} - settings = {} - if old_setting_value is not None: - settings[setting] = old_setting_value - crawler = get_crawler(settings=settings) - spider = EcommerceSpider.from_crawler(crawler, **kwargs, **base_kwargs) - getter = getattr(crawler.settings, getter_name) - assert getter(setting) == new_setting_value - assert spider.allowed_domains == ["example.com"] + spider = EcommerceSpider.from_crawler( + crawler, url="https://example.com", search_queries="foo" + ) + search_request_template = SearchRequestTemplate(url="https://example.com") + if probability is not None: + search_request_template.metadata = SearchRequestTemplateMetadata( + probability=probability + ) + items = list( + spider.parse_search_request_template( + DummyResponse("https://example.com"), search_request_template, DynamicDeps() + ) + ) + assert items if yields_items else not items def test_metadata(): @@ -420,6 +334,17 @@ def test_metadata(): "title": "URLs file", "type": "string", }, + "search_queries": { + "default": [], + "description": ( + "A list of search queries, one per line, to submit " + "using the search form found on each input URL." + ), + "items": {"type": "string"}, + "title": "Search Queries", + "type": "array", + "widget": "textarea", + }, "crawl_strategy": { "default": "automatic", "description": "Determines how the start URL and follow-up URLs are crawled.", @@ -492,11 +417,7 @@ def test_metadata(): {"type": "null"}, ], "default": None, - "description": ( - "ISO 3166-1 alpha-2 2-character string specified in " - "https://docs.zyte.com/zyte-api/usage/reference.html" - "#operation/extract/request/geolocation." 
- ), + "description": "Country of the IP addresses to use.", "enumMeta": { code: { "title": GEOLOCATION_OPTIONS_WITH_CODE[code], @@ -607,7 +528,7 @@ def test_get_subcategory_request(): url = "https://example.com" # Normal request but with mostly empty values - request = Request(url) + request = ProbabilityRequest(url=url) spider = EcommerceSpider(url="https://example.com") parse_navigation = lambda _: None spider.parse_navigation = parse_navigation # type: ignore @@ -678,7 +599,7 @@ def test_get_nextpage_request(): url = "https://example.com" # Minimal Args - request = Request(url) + request = ProbabilityRequest(url=url) spider = EcommerceSpider(url="https://example.com") parse_navigation = lambda _: None spider.parse_navigation = parse_navigation # type: ignore @@ -697,7 +618,7 @@ def test_get_parse_navigation_request(): url = "https://example.com" # Minimal args - request = Request(url) + request = ProbabilityRequest(url=url) spider = EcommerceSpider(url="https://example.com") parse_navigation = lambda _: None spider.parse_navigation = parse_navigation # type: ignore @@ -722,7 +643,7 @@ def test_set_allowed_domains(url, allowed_domain): kwargs = {"url": url} spider = EcommerceSpider.from_crawler(crawler, **kwargs) - assert spider.allowed_domains == [allowed_domain] + assert spider.allowed_domains == [allowed_domain] # type: ignore[attr-defined] def test_input_none(): @@ -820,6 +741,58 @@ def test_urls_file(): assert start_requests[2].url == "https://c.example" +def test_search_queries(): + crawler = get_crawler() + url = "https://example.com" + + spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo bar") + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_search_request_template + assert spider.args.search_queries == ["foo bar"] + + spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo\nbar") + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_search_request_template + assert spider.args.search_queries == ["foo", "bar"] + + spider = EcommerceSpider.from_crawler( + crawler, url=url, search_queries=["foo", "bar"] + ) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].url == url + assert start_requests[0].callback == spider.parse_search_request_template + assert spider.args.search_queries == ["foo", "bar"] + + +def test_search_queries_extract_from(): + crawler = get_crawler() + url = "https://example.com" + + spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo") + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert "inject" not in start_requests[0].meta + + spider = EcommerceSpider.from_crawler( + crawler, url=url, search_queries="foo", extract_from="httpResponseBody" + ) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert "inject" not in start_requests[0].meta + + spider = EcommerceSpider.from_crawler( + crawler, url=url, search_queries="foo", extract_from="browserHtml" + ) + start_requests = list(spider.start_requests()) + assert len(start_requests) == 1 + assert start_requests[0].meta["inject"] == [BrowserResponse] + + @pytest.mark.parametrize( "url,has_full_domain", ( diff --git a/tests/test_params.py b/tests/test_params.py index df08a19..bc6bd15 
100644 --- a/tests/test_params.py +++ b/tests/test_params.py @@ -1,8 +1,13 @@ import re import pytest +from pydantic import ValidationError +from zyte_spider_templates import EcommerceSpider, GoogleSearchSpider from zyte_spider_templates.params import URL_FIELD_KWARGS +from zyte_spider_templates.spiders.ecommerce import EcommerceCrawlStrategy + +from . import get_crawler @pytest.mark.parametrize( @@ -49,3 +54,218 @@ def test_url_pattern(url, valid): assert isinstance(URL_FIELD_KWARGS["pattern"], str) assert bool(re.match(URL_FIELD_KWARGS["pattern"], url)) == valid + + +REQUIRED_ARGS = { + EcommerceSpider: {"url": "https://example.com"}, + GoogleSearchSpider: {"search_queries": "foo"}, +} + + +@pytest.mark.parametrize( + ("spider_cls",), ((spider_cls,) for spider_cls in REQUIRED_ARGS) +) +def test_required_args(spider_cls): + crawler = get_crawler() + + with pytest.raises(ValidationError): + spider_cls.from_crawler(crawler) + + spider_cls.from_crawler(crawler, **REQUIRED_ARGS[spider_cls]) + + +@pytest.mark.parametrize( + ("spider_cls", "args", "valid"), + ( + ( + EcommerceSpider, + { + "url": "https://example.com", + "crawl_strategy": EcommerceCrawlStrategy.automatic, + }, + True, + ), + ( + EcommerceSpider, + {"url": "https://example.com", "crawl_strategy": "automatic"}, + True, + ), + ( + EcommerceSpider, + {"url": "https://example.com", "crawl_strategy": "unknown"}, + False, + ), + ( + EcommerceSpider, + { + "url": "https://example.com", + "crawl_strategy": "direct_item", + "search_queries": "", + }, + True, + ), + ( + EcommerceSpider, + { + "url": "https://example.com", + "crawl_strategy": "automatic", + "search_queries": "foo", + }, + True, + ), + ( + EcommerceSpider, + { + "url": "https://example.com", + "crawl_strategy": "direct_item", + "search_queries": "foo", + }, + False, + ), + (GoogleSearchSpider, {"domain": "google.com"}, False), + ( + GoogleSearchSpider, + {"domain": "google.cat", "search_queries": "foo bar"}, + True, + ), + ( + GoogleSearchSpider, + {"domain": "google.cat", "search_queries": "foo bar", "max_pages": 10}, + True, + ), + ( + GoogleSearchSpider, + {"domain": "google.foo", "search_queries": "foo bar"}, + False, + ), + (GoogleSearchSpider, {"search_queries": "foo bar", "max_pages": "all"}, False), + (GoogleSearchSpider, {"search_queries": "foo", "results_per_page": 0}, False), + ), +) +def test_arg_combinations(spider_cls, args, valid): + crawler = get_crawler() + if valid: + spider_cls.from_crawler(crawler, **args) + else: + with pytest.raises(ValidationError): + spider_cls.from_crawler(crawler, **args) + + +@pytest.mark.parametrize( + ("spider_cls", "param", "arg", "setting", "old", "getter", "new"), + ( + # extract_from + *( + (EcommerceSpider, *scenario) + for scenario in ( + ( + "extract_from", + "browserHtml", + "ZYTE_API_PROVIDER_PARAMS", + None, + "getdict", + { + "productOptions": {"extractFrom": "browserHtml"}, + "productNavigationOptions": {"extractFrom": "browserHtml"}, + }, + ), + ( + "extract_from", + "httpResponseBody", + "ZYTE_API_PROVIDER_PARAMS", + {"geolocation": "US"}, + "getdict", + { + "productOptions": {"extractFrom": "httpResponseBody"}, + "productNavigationOptions": {"extractFrom": "httpResponseBody"}, + "geolocation": "US", + }, + ), + ( + "extract_from", + None, + "ZYTE_API_PROVIDER_PARAMS", + {"geolocation": "US"}, + "getdict", + {"geolocation": "US"}, + ), + ) + ), + # geolocation + *( + (spider_cls, *scenario) + for spider_cls in (EcommerceSpider, GoogleSearchSpider) + for scenario in ( + ( + "geolocation", + "DE", + 
"ZYTE_API_AUTOMAP_PARAMS", + None, + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + '{"browserHtml": true}', + "getdict", + {"browserHtml": True, "geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + '{"geolocation": "IE"}', + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + None, + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + '{"browserHtml": true}', + "getdict", + {"browserHtml": True, "geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + '{"geolocation": "IE"}', + "getdict", + {"geolocation": "DE"}, + ), + ) + ), + # max_requests + *( + ( + spider_cls, + "max_requests", + "123", + "ZYTE_API_MAX_REQUESTS", + None, + "getint", + 123, + ) + for spider_cls in (EcommerceSpider, GoogleSearchSpider) + ), + ), +) +def test_setting_setter_params(spider_cls, param, arg, setting, old, getter, new): + settings = {} + if old is not None: + settings[setting] = old + crawler = get_crawler(settings=settings) + spider_cls.from_crawler(crawler, **REQUIRED_ARGS[spider_cls], **{param: arg}) + read = getattr(crawler.settings, getter) + assert read(setting) == new diff --git a/tests/test_search.py b/tests/test_search.py new file mode 100644 index 0000000..c4554a8 --- /dev/null +++ b/tests/test_search.py @@ -0,0 +1,666 @@ +import pytest +from pytest_twisted import ensureDeferred +from web_poet import AnyResponse, BrowserResponse, HttpResponse, PageParams + +from zyte_spider_templates.pages.search_request_template import ( + DefaultSearchRequestTemplatePage, +) + + +@pytest.mark.parametrize( + ("html", "page_params", "expected"), + ( + # Extruct #-----------------------------------------------------------# + # JSON-LD example from Google + # https://developers.google.com/search/docs/appearance/structured-data/sitelinks-searchbox#example + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Microdata example from Google + # https://developers.google.com/search/docs/appearance/structured-data/sitelinks-searchbox#example + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Non-compliant JSON-LD that uses a JSON array for potentialAction + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Non-default placeholder, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}&dont_replace={search_term_string}", + }, + ), + # Non-default placeholder, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}&dont_replace={search_term_string}", + }, + ), + # JSON-LD, WebSite isPartOf WebPage + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Relative URL, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/search?q={{ query|quote_plus }}", + }, + ), + # Relative URL, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/search?q={{ query|quote_plus }}", + }, + ), + # Wrong escaping in JSON-LD + ( + rb""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/search?a=b&q={{ query|quote_plus }}", + }, + ), + # Query in path, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/s/{{ query|urlencode }}", + }, + ), + # Relative URL, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://example.com/s/{{ query|urlencode }}", + }, + ), + # No potentialAction, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "error": "Cannot build a search request template", + }, + ), + # No potentialAction, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No SearchAction type, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No SearchAction type, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No target, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No target, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + {"error": "Cannot build a search request template"}, + ), + # No query variable name, JSON-LD + ( + b""" + + + The title of the page + + + + + + """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # No query variable name, Microdata + ( + b""" +
+ +
+ + + +
+
+ """, + {"search_request_builders": ["extruct"]}, + { + "url": "https://query.example.com/search?q={{ query|quote_plus }}", + }, + ), + # Formasaurus and form heuristics #-----------------------------------# + *( + (html, {"search_request_builders": [builder]}, expected) + for builder in ("formasaurus", "form_heuristics") + for html, expected in ( + # Basic form + ( + b""" + + """, + { + "url": "https://example.com?q={{ query|quote_plus }}", + }, + ), + # No form + ( + b"
", + {"error": "Cannot build a search request template"}, + ), + # No named input field + ( + b""" + + """, + {"error": "Cannot build a search request template"}, + ), + # Multi-part form + ( + b""" + + """, + {"error": "Cannot build a search request template"}, + ), + # Non-HTML response (JSON) + ( + b"""{"a": "b"}""", + {"error": "Cannot build a search request template"}, + ), + ) + ), + # Link heuristics #---------------------------------------------------# + # Link with recognized parameters + *( + ( + f"""""", + {"search_request_builders": ["link_heuristics"]}, + {"error": "Cannot build a search request template"}, + ), + # No HTML (JSON) + ( + b"""{"a": "b"}""", + {"search_request_builders": ["link_heuristics"]}, + {"error": "Cannot build a search request template"}, + ), + # Parameter false positive (?q != q) + ( + b"""""", + {"search_request_builders": ["link_heuristics"]}, + {"error": "Cannot build a search request template"}, + ), + # Builder parameters #------------------------------------------------# + *( + ( + b""" +
+ + +
+ """, + page_params, + expected, + ) + for page_params, expected in ( + # By default, the popular builder strategy is used, meaning + # that even though the Extruct builder has the highest + # priority, if both the Formasaurus builder and the form + # heuristics builder output the same URL, that one is used + # instead. + ({}, {"url": "https://example.com/form?q={{ query|quote_plus }}"}), + ( + {"search_request_builder_strategy": "popular"}, + {"url": "https://example.com/form?q={{ query|quote_plus }}"}, + ), + ( + {"search_request_builder_strategy": "first"}, + {"url": "https://example.com/metadata?q={{ query|quote_plus }}"}, + ), + # Strategies only take into account the specified builders, and + # in the supplied order. + ( + { + "search_request_builder_strategy": "first", + "search_request_builders": ["formasaurus", "extruct"], + }, + {"url": "https://example.com/form?q={{ query|quote_plus }}"}, + ), + ( + { + "search_request_builder_strategy": "popular", + "search_request_builders": [ + "extruct", + "formasaurus", + "link_heuristics", + ], + }, + {"url": "https://example.com/metadata?q={{ query|quote_plus }}"}, + ), + # Unsupported strategies trigger a ValueError + ( + {"search_request_builder_strategy": "unsupported"}, + ValueError( + "Unsupported search_request_builder_strategy value: 'unsupported'" + ), + ), + ) + ), + ), +) +@ensureDeferred +async def test_search_request_template(html, page_params, expected, caplog): + caplog.clear() + caplog.at_level("ERROR") + + http_response = HttpResponse(url="https://example.com", status=200, body=html) + response = AnyResponse(response=http_response) + search_request_page = DefaultSearchRequestTemplatePage( + response=response, + page_params=PageParams(**page_params), + ) + try: + search_request = await search_request_page.to_item() + except Exception as exception: + assert isinstance(expected, Exception) + assert exception.__class__ == expected.__class__ + assert str(expected) in str(exception) + else: + if "error" in expected: + probability = search_request.get_probability() + assert probability is not None + assert probability <= 0.0 + assert expected["error"] in caplog.text + else: + assert isinstance(expected, dict) + assert expected["url"] == search_request.url + assert expected.get("body", b"") == (search_request.body or b"") + + +@ensureDeferred +async def test_search_request_template_browser(caplog): + """Do not suggest using a browser request if that is already the case.""" + caplog.clear() + caplog.at_level("ERROR") + + browser_response = BrowserResponse( + url="https://example.com", status=200, html="
" + ) + response = AnyResponse(response=browser_response) + search_request_page = DefaultSearchRequestTemplatePage( + response=response, page_params=PageParams() + ) + item = await search_request_page.to_item() + probability = item.get_probability() + assert probability is not None + assert probability <= 0.0 + assert "A quick workaround would be to use" in caplog.text diff --git a/tests/test_serp.py b/tests/test_serp.py index 78a1407..4de4e29 100644 --- a/tests/test_serp.py +++ b/tests/test_serp.py @@ -1,24 +1,38 @@ from urllib.parse import quote_plus import pytest -from pydantic import ValidationError from scrapy import Request from scrapy_spider_metadata import get_spider_metadata from scrapy_zyte_api.responses import ZyteAPITextResponse from w3lib.url import add_or_replace_parameter +from zyte_common_items import Product +from zyte_spider_templates._geolocations import ( + GEOLOCATION_OPTIONS, + GEOLOCATION_OPTIONS_WITH_CODE, + Geolocation, +) +from zyte_spider_templates.spiders._google_gl import ( + GOOGLE_GL_OPTIONS, + GOOGLE_GL_OPTIONS_WITH_CODE, + GoogleGl, +) from zyte_spider_templates.spiders._google_hl import ( GOOGLE_HL_OPTIONS, GOOGLE_HL_OPTIONS_WITH_CODE, GoogleHl, ) -from zyte_spider_templates.spiders.serp import GoogleSearchSpider +from zyte_spider_templates.spiders.serp import ( + ITEM_TYPE_CLASSES, + GoogleSearchSpider, + SerpItemType, +) from . import get_crawler from .utils import assertEqualSpiderMetadata -def run_parse_serp(spider, total_results=99999, page=1, query="foo"): +def run_parse_serp(spider, total_results=99999, page=1, query="foo", results=10): url = f"https://www.google.com/search?q={quote_plus(query)}" if page > 1: url = add_or_replace_parameter(url, "start", (page - 1) * 10) @@ -32,7 +46,7 @@ def run_parse_serp(spider, total_results=99999, page=1, query="foo"): "url": f"https://example.com/{rank}", "rank": rank, } - for rank in range(1, 11) + for rank in range(1, results + 1) ], "metadata": { "dateDownloaded": "2024-10-25T08:59:45Z", @@ -56,24 +70,6 @@ def run_parse_serp(spider, total_results=99999, page=1, query="foo"): return items, requests -def test_parameters(): - with pytest.raises(ValidationError): - GoogleSearchSpider() - - with pytest.raises(ValidationError): - GoogleSearchSpider(domain="google.com") - - GoogleSearchSpider(search_queries="foo bar") - GoogleSearchSpider(domain="google.cat", search_queries="foo bar") - GoogleSearchSpider(domain="google.cat", search_queries="foo bar", max_pages=10) - - with pytest.raises(ValidationError): - GoogleSearchSpider(domain="google.foo", search_queries="foo bar") - - with pytest.raises(ValidationError): - GoogleSearchSpider(search_queries="foo bar", max_pages="all") - - def test_start_requests(): crawler = get_crawler() spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") @@ -295,6 +291,19 @@ def test_metadata(): "title": "Search Queries", "widget": "textarea", }, + "max_requests": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 100, + "description": ( + "The maximum number of Zyte API requests allowed for the crawl.\n" + "\n" + "Requests with error responses that cannot be retried or exceed " + "their retry limit also count here, but they incur in no costs " + "and do not increase the request count in Scrapy Cloud." 
+ ), + "title": "Max Requests", + "widget": "request-limit", + }, "max_pages": { "default": 1, "description": ( @@ -304,6 +313,91 @@ def test_metadata(): "title": "Max Pages", "type": "integer", }, + "results_per_page": { + "anyOf": [ + { + "minimum": 1, + "type": "integer", + }, + { + "type": "null", + }, + ], + "default": None, + "description": "Maximum number of results per page.", + "title": "Results Per Page", + }, + "item_type": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "default": None, + "description": ( + "If specified, follow organic search result links, " + "and extract the selected data type from the target " + "pages. Spider output items will be of the specified " + "data type, not search engine results page items." + ), + "enum": [ + "article", + "articleList", + "forumThread", + "jobPosting", + "product", + "productList", + ], + "title": "Follow and Extract", + }, + "gl": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "description": ( + "Boosts results relevant to this country. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.gl" + ), + "enumMeta": { + code: { + "title": GOOGLE_GL_OPTIONS_WITH_CODE[code], + } + for code in sorted(GoogleGl) + }, + "title": "User Country", + "enum": list( + sorted(GOOGLE_GL_OPTIONS, key=GOOGLE_GL_OPTIONS.__getitem__) + ), + }, + "cr": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "description": ( + "Restricts search results to documents originating in " + "particular countries. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.cr" + ), + "title": "Content Countries", + }, + "geolocation": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "description": "Country of the IP addresses to use.", + "enumMeta": { + code: { + "title": GEOLOCATION_OPTIONS_WITH_CODE[code], + } + for code in sorted(Geolocation) + }, + "title": "IP Country", + "enum": list( + sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__) + ), + }, "hl": { "anyOf": [ {"type": "string"}, @@ -321,7 +415,7 @@ def test_metadata(): } for code in sorted(GoogleHl) }, - "title": "UI Language", + "title": "User Language", "enum": list( sorted(GOOGLE_HL_OPTIONS, key=GOOGLE_HL_OPTIONS.__getitem__) ), @@ -339,19 +433,6 @@ def test_metadata(): ), "title": "Content Languages", }, - "max_requests": { - "anyOf": [{"type": "integer"}, {"type": "null"}], - "default": 100, - "description": ( - "The maximum number of Zyte API requests allowed for the crawl.\n" - "\n" - "Requests with error responses that cannot be retried or exceed " - "their retry limit also count here, but they incur in no costs " - "and do not increase the request count in Scrapy Cloud." 
- ), - "title": "Max Requests", - "widget": "request-limit", - }, }, "required": ["search_queries"], "title": "GoogleSearchSpiderParams", @@ -360,6 +441,11 @@ def test_metadata(): } assertEqualSpiderMetadata(actual_metadata, expected_metadata) + geolocation = actual_metadata["param_schema"]["properties"]["geolocation"] + assert geolocation["enum"][0] == "AF" + assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"} + assert set(geolocation["enum"]) == set(geolocation["enumMeta"]) + def test_input_none(): crawler = get_crawler() @@ -399,7 +485,9 @@ def test_search_queries(): def test_pagination(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo bar", max_pages=3 + ) items, requests = run_parse_serp( spider, @@ -438,6 +526,35 @@ def test_pagination(): assert requests[0].url == "https://www.google.com/search?q=foo+bar&start=20" assert requests[0].cb_kwargs["page_number"] == 3 + items, requests = run_parse_serp( + spider, + total_results=None, + ) + assert len(items) == 1 + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo&start=10" + assert requests[0].cb_kwargs["page_number"] == 2 + + # Ensure a lack of results stops pagination even if total_results reports + # additional results. + # https://github.com/zytedata/zyte-spider-templates/pull/80/files/359c342008e2e4d5a913d450ddd2dda6c887747c#r1840897802 + items, requests = run_parse_serp( + spider, + total_results=None, + results=0, + ) + assert len(items) == 1 + assert len(requests) == 0 + + # Do not go over max_pages + items, requests = run_parse_serp( + spider, + total_results=31, + page=3, + ) + assert len(items) == 1 + assert len(requests) == 0 + def test_get_serp_request(): crawler = get_crawler() @@ -449,12 +566,14 @@ def test_get_serp_request(): # The page_number parameter is required. with pytest.raises(TypeError): - spider.get_serp_request(url) + spider.get_serp_request(url) # type: ignore[call-arg] def test_parse_serp(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo bar") + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo bar", max_pages=43 + ) url = "https://www.google.com/search?q=foo+bar" response = ZyteAPITextResponse.from_api_response( api_response={ @@ -494,12 +613,14 @@ def test_parse_serp(): # The page_number parameter is required. 
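+    # Omitting it raises a TypeError at run time; the "type: ignore[call-arg]"
+    # added below only silences the static type check for this intentional
+    # misuse.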
with pytest.raises(TypeError): - spider.parse_serp(response) + spider.parse_serp(response) # type: ignore[call-arg] def test_hl(): crawler = get_crawler() - spider = GoogleSearchSpider.from_crawler(crawler, search_queries="foo", hl="gl") + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo", hl="gl", max_pages=2 + ) requests = list(spider.start_requests()) assert len(requests) == 1 assert requests[0].url == "https://www.google.com/search?q=foo&hl=gl" @@ -513,7 +634,7 @@ def test_hl(): def test_lr(): crawler = get_crawler() spider = GoogleSearchSpider.from_crawler( - crawler, search_queries="foo", lr="lang_ja" + crawler, search_queries="foo", lr="lang_ja", max_pages=2 ) requests = list(spider.start_requests()) assert len(requests) == 1 @@ -523,3 +644,121 @@ def test_lr(): assert len(items) == 1 assert len(requests) == 1 assert requests[0].url == "https://www.google.com/search?q=foo&start=10&lr=lang_ja" + + +def test_cr(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo", cr="(-countryFR).(-countryIT)", max_pages=2 + ) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert ( + requests[0].url + == "https://www.google.com/search?q=foo&cr=%28-countryFR%29.%28-countryIT%29" + ) + + items, requests = run_parse_serp(spider) + assert len(items) == 1 + assert len(requests) == 1 + assert ( + requests[0].url + == "https://www.google.com/search?q=foo&start=10&cr=%28-countryFR%29.%28-countryIT%29" + ) + + +def test_gl(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo", gl="af", max_pages=2 + ) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo&gl=af" + + items, requests = run_parse_serp(spider) + assert len(items) == 1 + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo&start=10&gl=af" + + +def test_results_per_page(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo", results_per_page=1, max_pages=2 + ) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo&num=1" + + items, requests = run_parse_serp(spider) + assert len(items) == 1 + assert len(requests) == 1 + assert requests[0].url == "https://www.google.com/search?q=foo&start=1&num=1" + + +def test_item_type(): + crawler = get_crawler() + spider = GoogleSearchSpider.from_crawler( + crawler, search_queries="foo bar", max_pages=43, item_type="product" + ) + url = "https://www.google.com/search?q=foo+bar" + response = ZyteAPITextResponse.from_api_response( + api_response={ + "serp": { + "organicResults": [ + { + "description": "…", + "name": "…", + "url": f"https://example.com/{rank}", + "rank": rank, + } + for rank in range(1, 11) + ], + "metadata": { + "dateDownloaded": "2024-10-25T08:59:45Z", + "displayedQuery": "foo bar", + "searchedQuery": "foo bar", + "totalOrganicResults": 99999, + }, + "pageNumber": 1, + "url": url, + }, + "url": url, + }, + ) + items = [] + requests = [] + for item_or_request in spider.parse_serp(response, page_number=42): + if isinstance(item_or_request, Request): + requests.append(item_or_request) + else: + items.append(item_or_request) + assert len(items) == 0 + assert len(requests) == 11 + + assert requests[0].url == add_or_replace_parameter(url, "start", "420") + assert requests[0].cb_kwargs["page_number"] == 43 + 
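+    # The remaining requests follow the organic results; each one is expected
+    # to extract the selected item type (product) from the target page, as
+    # the assertions below verify.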
+ for rank in range(1, 11): + assert requests[rank].url == f"https://example.com/{rank}" + assert requests[rank].callback == spider.parse_result + assert requests[rank].meta == { + "crawling_logs": {"page_type": "product"}, + "inject": [Product], + } + + +def test_item_type_mappings(): + # Ensure that all SerpItemType keys and values match. + for entry in SerpItemType: + assert entry.name == entry.value + + # Ensure that the ITEM_TYPE_CLASSES dict maps all values from the + # corresponding enum except for serp. + actual_keys = set(ITEM_TYPE_CLASSES) + expected_keys = set(entry.value for entry in SerpItemType) + assert actual_keys == expected_keys + + # Also ensure that no dict value is repeated. + assert len(actual_keys) == len(set(ITEM_TYPE_CLASSES.values())) diff --git a/tox.ini b/tox.ini index 3fa9108..576d792 100644 --- a/tox.ini +++ b/tox.ini @@ -20,13 +20,18 @@ commands = basepython = python3.9 deps = {[testenv]deps} + extruct==0.18.0 + form2request==0.2.0 + formasaurus==0.10.0 + jmespath==0.9.5 pydantic==2.1 - requests==0.10.1 + requests==1.0.0 scrapy==2.11.0 scrapy-poet==0.24.0 scrapy-spider-metadata==0.2.0 scrapy-zyte-api[provider]==0.23.0 - zyte-common-items==0.23.0 + web-poet==0.17.1 + zyte-common-items==0.26.2 [testenv:mypy] deps = diff --git a/utils/google-gl-updater/requirements.in b/utils/google-gl-updater/requirements.in new file mode 100644 index 0000000..25d38c0 --- /dev/null +++ b/utils/google-gl-updater/requirements.in @@ -0,0 +1,3 @@ +jinja2 +parsel +requests diff --git a/utils/google-gl-updater/requirements.txt b/utils/google-gl-updater/requirements.txt new file mode 100644 index 0000000..93b80f5 --- /dev/null +++ b/utils/google-gl-updater/requirements.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile +# +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +cssselect==1.2.0 + # via parsel +idna==3.10 + # via requests +jinja2==3.1.4 + # via -r requirements.in +jmespath==1.0.1 + # via parsel +lxml==5.3.0 + # via parsel +markupsafe==3.0.2 + # via jinja2 +packaging==24.2 + # via parsel +parsel==1.9.1 + # via -r requirements.in +requests==2.32.3 + # via -r requirements.in +urllib3==2.2.3 + # via requests +w3lib==2.2.1 + # via parsel diff --git a/utils/google-gl-updater/template.py b/utils/google-gl-updater/template.py new file mode 100644 index 0000000..9112d9a --- /dev/null +++ b/utils/google-gl-updater/template.py @@ -0,0 +1,18 @@ +{% raw %}# ../_geolocations.py counterpart for +# https://developers.google.com/custom-search/docs/json_api_reference#countryCodes +# +# Built automatically with ../../utils/google-gl-updater + +from enum import Enum + +GOOGLE_GL_OPTIONS = {{% endraw %}{% for country in countries %} + "{{ country.code }}": "{{ country.name }}",{% endfor %}{% raw %} +} +GOOGLE_GL_OPTIONS_WITH_CODE = { + code: f"{name} ({code})" for code, name in GOOGLE_GL_OPTIONS.items() +} + + +class GoogleGl(str, Enum):{% endraw %}{% for country in countries %} + {{ country.keyword }}: str = "{{ country.code }}"{% endfor %} + diff --git a/utils/google-gl-updater/update.py b/utils/google-gl-updater/update.py new file mode 100644 index 0000000..28f7d63 --- /dev/null +++ b/utils/google-gl-updater/update.py @@ -0,0 +1,35 @@ +from keyword import iskeyword +from pathlib import Path + +import jinja2 +import requests +from parsel import Selector + +countries = [] + +response = requests.get( + "https://developers.google.com/custom-search/docs/json_api_reference" +) +selector 
= Selector(text=response.text) +table = selector.xpath('//*[@id="country-codes"]/following-sibling::table[1]') +for tr in table.css("tr"): + name = tr.xpath("td/text()").get() + if not name: # header + continue + code = tr.xpath("td/span/text()").get() + keyword = f"{code}_" if iskeyword(code) else code + countries.append({"code": code, "keyword": keyword, "name": name}) + +template_path = Path(__file__).parent / "template.py" +template_environment = jinja2.Environment() +with template_path.open() as f: + template = template_environment.from_string(f.read()) +output = template.render(countries=countries) +output_path = ( + Path(__file__).parent.parent.parent + / "zyte_spider_templates" + / "spiders" + / "_google_gl.py" +) +with output_path.open("w") as f: + f.write(output) diff --git a/zyte_spider_templates/pages/product_navigation_heuristics.py b/zyte_spider_templates/pages/product_navigation_heuristics.py index bd012ff..fd2a8ae 100644 --- a/zyte_spider_templates/pages/product_navigation_heuristics.py +++ b/zyte_spider_templates/pages/product_navigation_heuristics.py @@ -45,7 +45,7 @@ def _probably_category_links(self) -> List[ProbabilityRequest]: default_probability = 0.1 link_extractor = LinkExtractor( - allow_domains=self.page_params.get("full_domain") + allow_domains=self.page_params.get("full_domain", []) ) ignore_urls = set(self._urls_for_category()) diff --git a/zyte_spider_templates/pages/search_request_template.py b/zyte_spider_templates/pages/search_request_template.py new file mode 100644 index 0000000..f7a3653 --- /dev/null +++ b/zyte_spider_templates/pages/search_request_template.py @@ -0,0 +1,310 @@ +import html +import re +from collections import defaultdict +from logging import getLogger +from random import choice +from string import ascii_letters, digits +from urllib.parse import parse_qs, urlparse + +import attrs +import extruct +import formasaurus +import jmespath +from form2request import form2request +from lxml import etree +from scrapy.http.response.html import HtmlResponse +from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor +from w3lib.url import add_or_replace_parameters +from web_poet import AnyResponse, PageParams, handle_urls +from web_poet.pages import validates_input +from zyte_common_items import SearchRequestTemplate, SearchRequestTemplatePage + +logger = getLogger(__name__) + +# Because Jinja2 syntax gets percent-encoded in a URL, we instead use a +# placeholder made of URL-safe characters, and replace it with Jinja2 code +# after URL encoding. +# +# We use a random placeholder instead of a readable one to minimize risk of +# accidental conflict, and we generate it at run time to minimize risk of +# purposeful conflict. 
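+#
+# For example, assuming the generated placeholder were "PLACEHOLDERxyz", a
+# hypothetical search form submitting to https://example.com/?q=PLACEHOLDERxyz
+# would yield, after URL encoding, the template
+# https://example.com/?q={{ query|quote_plus }}.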
+_url_safe_chars = ascii_letters + digits +_PLACEHOLDER = "".join(choice(_url_safe_chars) for _ in range(32)) + + +def _any_http_response_to_scrapy_response(response: AnyResponse) -> HtmlResponse: + kwargs = {} + encoding = getattr(response, "_encoding", None) or "utf-8" + kwargs["encoding"] = encoding + kwargs["headers"] = getattr(response, "headers", {}) + return HtmlResponse( + url=str(response.url), body=response.text, status=response.status, **kwargs + ) + + +@handle_urls("", priority=250) +@attrs.define +class DefaultSearchRequestTemplatePage(SearchRequestTemplatePage): + response: AnyResponse # type: ignore[assignment] + page_params: PageParams + + def _item_from_form_heuristics(self): + form_xpath = """ + //form[ + descendant-or-self::*[ + contains(@action, "search") + or contains(@aria-label, "search") + or contains(@aria-labelledby, "search") + or contains(@class, "search") + or contains(@data-set, "search") + or contains(@formaction, "search") + or contains(@id, "search") + or contains(@role, "search") + or contains(@title, "search") + ] + ] + """ + forms = self.response.xpath(form_xpath) + if not forms: + raise ValueError("No search forms found.") + + field_xpath = """ + descendant::textarea + /@name + | descendant::input[ + not(@type) + or @type[ + not( + re:test( + ., + "^(?:checkbox|image|radio|reset|submit)$", + "i" + ) + ) + ] + ] + /@name + """ + search_query_field = None + for form in forms: + search_query_field = form.xpath(field_xpath).get() + if search_query_field: + break + if not search_query_field: + raise ValueError( + "No search query field found in any potential search form." + ) + data = {search_query_field: _PLACEHOLDER} + try: + request_data = form2request(form, data) + except NotImplementedError: + raise ValueError("form2request does not support the target search form") + return SearchRequestTemplate( + url=request_data.url.replace(_PLACEHOLDER, "{{ query|quote_plus }}"), + method=request_data.method, + headers=request_data.headers, + body=request_data.body.decode().replace( + _PLACEHOLDER, "{{ query|quote_plus }}" + ), + ) + + def _item_from_extruct(self): + metadata = extruct.extract( + self.response.text, + base_url=str(self.response.url), + syntaxes=["json-ld", "microdata"], + ) + query_field = None + for entry in metadata["microdata"]: + if not (actions := entry.get("properties", {}).get("potentialAction", {})): + continue + if not isinstance(actions, list): + actions = [actions] + for action in actions: + if action.get("type") != "https://schema.org/SearchAction": + continue + url_template = jmespath.search( + "properties.target.urlTemplate || properties.target", action + ) + if not url_template: + continue + query_input = action.get("properties", {}).get("query-input", {}) + query_field = query_input.get("valueName", "search_term_string") + break + if query_field: + break + if not query_field: + for entry in metadata["json-ld"]: + action = jmespath.search( + '"@graph"[].potentialAction || isPartOf.potentialAction || potentialAction', + entry, + ) + if not action: + continue + if isinstance(action, list): + action = jmespath.search( + '([?"@type"==`SearchAction`] | [0]) || @', action + ) + if not action or action.get("@type") != "SearchAction": + continue + url_template = jmespath.search("target.urlTemplate || target", action) + if not url_template: + continue + query_input = action.get( + "query-input", "required name=search_term_string" + ) + query_field_match = re.search(r"\bname=(\S+)", query_input) + if query_field_match: + query_field = 
query_field_match[1] + else: + query_field = "search_term_string" + break + if query_field: + break + if not query_field: + raise ValueError( + "Could not find HTML metadata to compose a search request template." + ) + parts = url_template.split("?", maxsplit=1) + parts[0] = parts[0].replace(f"{{{query_field}}}", "{{ query|urlencode }}") + if len(parts) > 1: + parts[1] = parts[1].replace(f"{{{query_field}}}", "{{ query|quote_plus }}") + url = "?".join(parts) + url = str(self.response.urljoin(url)) + url = html.unescape(url) + return SearchRequestTemplate( + url=url, + method="GET", + headers=[], + body="", + ) + + def _item_from_link_heuristics(self): + query_parameters = "|".join( + ( + r"[a-z]?(?:(?:field|search)[_-]?)?key(?:word)?s?", + r"[a-z]?(?:(?:field|search)[_-]?)?query", + r"[a-z]?(?:(?:field|search)[_-]?)?params?", + r"[a-z]?(?:(?:field|search)[_-]?)?terms?", + r"[a-z]?(?:(?:field|search)[_-]?)?text", + r"[a-z]?search", + r"qs?", + r"s", + ) + ) + param_regexp = f"(?i)^(?:{query_parameters})$" + url_regexp = f"(?i)[?&](?:{query_parameters})=(?!$)[^&]" + netloc = urlparse(str(self.response.url)).netloc + scrapy_response = _any_http_response_to_scrapy_response(self.response) + try: + search_links = LxmlLinkExtractor( + allow=url_regexp, allow_domains=netloc + ).extract_links(scrapy_response) + except AttributeError as exception: + raise ValueError(str(exception)) + if not search_links: + raise ValueError(f"No valid search links found on {self.response.url}") + for search_link in search_links: + query_string = urlparse(search_link.url).query + query = parse_qs(query_string) + search_params = set() + for k in query: + if re.search(param_regexp, k): + search_params.add(k) + if not search_params: + continue + url = add_or_replace_parameters( + search_link.url, {k: _PLACEHOLDER for k in search_params} + ) + url = url.replace(_PLACEHOLDER, "{{ query|quote_plus }}") + return SearchRequestTemplate( + url=url, + method="GET", + headers=[], + body="", + ) + raise ValueError(f"No valid search links found on {self.response.url}") + + def _item_from_formasaurus(self): + try: + form, data, submit_button = formasaurus.build_submission( + self.response.selector, + "search", + {"search query": _PLACEHOLDER}, + ) + except AttributeError as exception: + raise ValueError(str(exception)) + if not data: + form_excerpt = etree.tostring(form).decode()[:64] + if len(form_excerpt) >= 64: + form_excerpt = form_excerpt[:-1] + "…" + raise ValueError( + f"Did not find an input field for the search query in " + f"the most likely search form at {self.response.url} " + f"(form_excerpt)." 
+ ) + try: + request_data = form2request(form, data, click=submit_button) + except NotImplementedError: + raise ValueError("form2request does not support the target search form") + return SearchRequestTemplate( + url=request_data.url.replace(_PLACEHOLDER, "{{ query|quote_plus }}"), + method=request_data.method, + headers=request_data.headers, + body=request_data.body.decode().replace( + _PLACEHOLDER, "{{ query|quote_plus }}" + ), + ) + + @validates_input + async def to_item(self) -> SearchRequestTemplate: + builders = { + "extruct": self._item_from_extruct, + "formasaurus": self._item_from_formasaurus, + "link_heuristics": self._item_from_link_heuristics, + "form_heuristics": self._item_from_form_heuristics, + } + builder_ids = self.page_params.get("search_request_builders", list(builders)) + builder_strategy = self.page_params.get( + "search_request_builder_strategy", "popular" + ) + if builder_strategy not in {"first", "popular"}: + raise ValueError( + f"Unsupported search_request_builder_strategy value: {builder_strategy!r}" + ) + results = defaultdict(list) + for builder_id in builder_ids: + builder = builders[builder_id] + try: + result = builder() + except ValueError: + continue + if result: + if builder_strategy == "first": + return result + results[(result.url, result.body)].append((builder_id, result)) + if results: + assert builder_strategy == "popular" + top_count = max(len(v) for v in results.values()) + top_results = { + builder_id: result + for result_list in results.values() + for builder_id, result in result_list + if len(result_list) == top_count + } + for builder_id in builder_ids: + if builder_id not in top_results: + continue + return top_results[builder_id] + + logger.error( + f"Cannot build a search request template for " + f"{self.response.url}. A quick workaround would be to use a " + f"search URL as input URL instead of using the search " + f"queries input field. You can also manually implement " + f"search support for a given website " + f"(https://zyte-common-items.readthedocs.io/en/latest/usage/re" + f"quest-templates.html#writing-a-request-template-page-object)" + f"." + ) + return self.no_item_found() diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index e74f3f8..d844fc2 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -78,8 +78,7 @@ class ExtractFromParam(BaseModel): class GeolocationParam(BaseModel): geolocation: Optional[Geolocation] = Field( title="Geolocation", - description="ISO 3166-1 alpha-2 2-character string specified in " - "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.", + description="Country of the IP addresses to use.", default=None, json_schema_extra={ "enumMeta": { @@ -108,6 +107,40 @@ class MaxRequestsParam(BaseModel): ) +class SearchQueriesParam(BaseModel): + search_queries: List[str] = Field( + title="Search Queries", + description=( + "A list of search queries, one per line, to submit using the " + "search form found on each input URL." + ), + default_factory=list, + json_schema_extra={ + "default": [], + "widget": "textarea", + }, + ) + + @field_validator("search_queries", mode="before") + @classmethod + def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: + """Validate a list of search queries. + + If a string is received as input, it is split into multiple strings + on new lines. 
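+
+        For example, "foo bar" and "baz" on two separate lines become
+        ["foo bar", "baz"]; blank lines and surrounding whitespace are
+        discarded.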
+ """ + if isinstance(value, str): + value = value.split("\n") + if not value: + return value + result = [] + for v in value: + if not (v := v.strip()): + continue + result.append(v) + return result + + INPUT_GROUP_FIELDS = ("url", "urls", "urls_file") INPUT_GROUP: JsonDict = { "id": "inputs", @@ -155,7 +188,7 @@ def validate_input_group(model): class UrlsFileParam(BaseModel): - urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[misc, arg-type] + urls_file: str = Field(**URLS_FILE_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type] @model_validator(mode="after") def input_group(self): @@ -193,7 +226,7 @@ def parse_input_params(spider): class UrlParam(BaseModel): - url: str = Field(**URL_FIELD_KWARGS) # type: ignore[misc, arg-type] + url: str = Field(**URL_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type] URLS_FIELD_KWARGS = { @@ -247,7 +280,7 @@ def input_group(self): class UrlsParam(BaseModel): - urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[misc, arg-type] + urls: Optional[List[str]] = Field(**URLS_FIELD_KWARGS) # type: ignore[call-overload, misc, arg-type] @model_validator(mode="after") def input_group(self): diff --git a/zyte_spider_templates/spiders/_google_gl.py b/zyte_spider_templates/spiders/_google_gl.py new file mode 100644 index 0000000..6e01d38 --- /dev/null +++ b/zyte_spider_templates/spiders/_google_gl.py @@ -0,0 +1,493 @@ +# ../_geolocations.py counterpart for +# https://developers.google.com/custom-search/docs/json_api_reference#countryCodes +# +# Built automatically with ../../utils/google-gl-updater + +from enum import Enum + +GOOGLE_GL_OPTIONS = { + "af": "Afghanistan", + "al": "Albania", + "dz": "Algeria", + "as": "American Samoa", + "ad": "Andorra", + "ao": "Angola", + "ai": "Anguilla", + "aq": "Antarctica", + "ag": "Antigua and Barbuda", + "ar": "Argentina", + "am": "Armenia", + "aw": "Aruba", + "au": "Australia", + "at": "Austria", + "az": "Azerbaijan", + "bs": "Bahamas", + "bh": "Bahrain", + "bd": "Bangladesh", + "bb": "Barbados", + "by": "Belarus", + "be": "Belgium", + "bz": "Belize", + "bj": "Benin", + "bm": "Bermuda", + "bt": "Bhutan", + "bo": "Bolivia", + "ba": "Bosnia and Herzegovina", + "bw": "Botswana", + "bv": "Bouvet Island", + "br": "Brazil", + "io": "British Indian Ocean Territory", + "bn": "Brunei Darussalam", + "bg": "Bulgaria", + "bf": "Burkina Faso", + "bi": "Burundi", + "kh": "Cambodia", + "cm": "Cameroon", + "ca": "Canada", + "cv": "Cape Verde", + "ky": "Cayman Islands", + "cf": "Central African Republic", + "td": "Chad", + "cl": "Chile", + "cn": "China", + "cx": "Christmas Island", + "cc": "Cocos (Keeling) Islands", + "co": "Colombia", + "km": "Comoros", + "cg": "Congo", + "cd": "Congo, the Democratic Republic of the", + "ck": "Cook Islands", + "cr": "Costa Rica", + "ci": "Cote D'ivoire", + "hr": "Croatia", + "cu": "Cuba", + "cy": "Cyprus", + "cz": "Czech Republic", + "dk": "Denmark", + "dj": "Djibouti", + "dm": "Dominica", + "do": "Dominican Republic", + "ec": "Ecuador", + "eg": "Egypt", + "sv": "El Salvador", + "gq": "Equatorial Guinea", + "er": "Eritrea", + "ee": "Estonia", + "et": "Ethiopia", + "fk": "Falkland Islands (Malvinas)", + "fo": "Faroe Islands", + "fj": "Fiji", + "fi": "Finland", + "fr": "France", + "gf": "French Guiana", + "pf": "French Polynesia", + "tf": "French Southern Territories", + "ga": "Gabon", + "gm": "Gambia", + "ge": "Georgia", + "de": "Germany", + "gh": "Ghana", + "gi": "Gibraltar", + "gr": "Greece", + "gl": "Greenland", + "gd": "Grenada", + "gp": "Guadeloupe", + 
"gu": "Guam", + "gt": "Guatemala", + "gn": "Guinea", + "gw": "Guinea-Bissau", + "gy": "Guyana", + "ht": "Haiti", + "hm": "Heard Island and Mcdonald Islands", + "va": "Holy See (Vatican City State)", + "hn": "Honduras", + "hk": "Hong Kong", + "hu": "Hungary", + "is": "Iceland", + "in": "India", + "id": "Indonesia", + "ir": "Iran, Islamic Republic of", + "iq": "Iraq", + "ie": "Ireland", + "il": "Israel", + "it": "Italy", + "jm": "Jamaica", + "jp": "Japan", + "jo": "Jordan", + "kz": "Kazakhstan", + "ke": "Kenya", + "ki": "Kiribati", + "kp": "Korea, Democratic People's Republic of", + "kr": "Korea, Republic of", + "kw": "Kuwait", + "kg": "Kyrgyzstan", + "la": "Lao People's Democratic Republic", + "lv": "Latvia", + "lb": "Lebanon", + "ls": "Lesotho", + "lr": "Liberia", + "ly": "Libyan Arab Jamahiriya", + "li": "Liechtenstein", + "lt": "Lithuania", + "lu": "Luxembourg", + "mo": "Macao", + "mk": "Macedonia, the Former Yugosalv Republic of", + "mg": "Madagascar", + "mw": "Malawi", + "my": "Malaysia", + "mv": "Maldives", + "ml": "Mali", + "mt": "Malta", + "mh": "Marshall Islands", + "mq": "Martinique", + "mr": "Mauritania", + "mu": "Mauritius", + "yt": "Mayotte", + "mx": "Mexico", + "fm": "Micronesia, Federated States of", + "md": "Moldova, Republic of", + "mc": "Monaco", + "mn": "Mongolia", + "ms": "Montserrat", + "ma": "Morocco", + "mz": "Mozambique", + "mm": "Myanmar", + "na": "Namibia", + "nr": "Nauru", + "np": "Nepal", + "nl": "Netherlands", + "an": "Netherlands Antilles", + "nc": "New Caledonia", + "nz": "New Zealand", + "ni": "Nicaragua", + "ne": "Niger", + "ng": "Nigeria", + "nu": "Niue", + "nf": "Norfolk Island", + "mp": "Northern Mariana Islands", + "no": "Norway", + "om": "Oman", + "pk": "Pakistan", + "pw": "Palau", + "ps": "Palestinian Territory, Occupied", + "pa": "Panama", + "pg": "Papua New Guinea", + "py": "Paraguay", + "pe": "Peru", + "ph": "Philippines", + "pn": "Pitcairn", + "pl": "Poland", + "pt": "Portugal", + "pr": "Puerto Rico", + "qa": "Qatar", + "re": "Reunion", + "ro": "Romania", + "ru": "Russian Federation", + "rw": "Rwanda", + "sh": "Saint Helena", + "kn": "Saint Kitts and Nevis", + "lc": "Saint Lucia", + "pm": "Saint Pierre and Miquelon", + "vc": "Saint Vincent and the Grenadines", + "ws": "Samoa", + "sm": "San Marino", + "st": "Sao Tome and Principe", + "sa": "Saudi Arabia", + "sn": "Senegal", + "cs": "Serbia and Montenegro", + "sc": "Seychelles", + "sl": "Sierra Leone", + "sg": "Singapore", + "sk": "Slovakia", + "si": "Slovenia", + "sb": "Solomon Islands", + "so": "Somalia", + "za": "South Africa", + "gs": "South Georgia and the South Sandwich Islands", + "es": "Spain", + "lk": "Sri Lanka", + "sd": "Sudan", + "sr": "Suriname", + "sj": "Svalbard and Jan Mayen", + "sz": "Swaziland", + "se": "Sweden", + "ch": "Switzerland", + "sy": "Syrian Arab Republic", + "tw": "Taiwan, Province of China", + "tj": "Tajikistan", + "tz": "Tanzania, United Republic of", + "th": "Thailand", + "tl": "Timor-Leste", + "tg": "Togo", + "tk": "Tokelau", + "to": "Tonga", + "tt": "Trinidad and Tobago", + "tn": "Tunisia", + "tr": "Turkey", + "tm": "Turkmenistan", + "tc": "Turks and Caicos Islands", + "tv": "Tuvalu", + "ug": "Uganda", + "ua": "Ukraine", + "ae": "United Arab Emirates", + "uk": "United Kingdom", + "us": "United States", + "um": "United States Minor Outlying Islands", + "uy": "Uruguay", + "uz": "Uzbekistan", + "vu": "Vanuatu", + "ve": "Venezuela", + "vn": "Viet Nam", + "vg": "Virgin Islands, British", + "vi": "Virgin Islands, U.S.", + "wf": "Wallis and Futuna", + "eh": "Western 
Sahara", + "ye": "Yemen", + "zm": "Zambia", + "zw": "Zimbabwe", +} +GOOGLE_GL_OPTIONS_WITH_CODE = { + code: f"{name} ({code})" for code, name in GOOGLE_GL_OPTIONS.items() +} + + +class GoogleGl(str, Enum): + af: str = "af" + al: str = "al" + dz: str = "dz" + as_: str = "as" + ad: str = "ad" + ao: str = "ao" + ai: str = "ai" + aq: str = "aq" + ag: str = "ag" + ar: str = "ar" + am: str = "am" + aw: str = "aw" + au: str = "au" + at: str = "at" + az: str = "az" + bs: str = "bs" + bh: str = "bh" + bd: str = "bd" + bb: str = "bb" + by: str = "by" + be: str = "be" + bz: str = "bz" + bj: str = "bj" + bm: str = "bm" + bt: str = "bt" + bo: str = "bo" + ba: str = "ba" + bw: str = "bw" + bv: str = "bv" + br: str = "br" + io: str = "io" + bn: str = "bn" + bg: str = "bg" + bf: str = "bf" + bi: str = "bi" + kh: str = "kh" + cm: str = "cm" + ca: str = "ca" + cv: str = "cv" + ky: str = "ky" + cf: str = "cf" + td: str = "td" + cl: str = "cl" + cn: str = "cn" + cx: str = "cx" + cc: str = "cc" + co: str = "co" + km: str = "km" + cg: str = "cg" + cd: str = "cd" + ck: str = "ck" + cr: str = "cr" + ci: str = "ci" + hr: str = "hr" + cu: str = "cu" + cy: str = "cy" + cz: str = "cz" + dk: str = "dk" + dj: str = "dj" + dm: str = "dm" + do: str = "do" + ec: str = "ec" + eg: str = "eg" + sv: str = "sv" + gq: str = "gq" + er: str = "er" + ee: str = "ee" + et: str = "et" + fk: str = "fk" + fo: str = "fo" + fj: str = "fj" + fi: str = "fi" + fr: str = "fr" + gf: str = "gf" + pf: str = "pf" + tf: str = "tf" + ga: str = "ga" + gm: str = "gm" + ge: str = "ge" + de: str = "de" + gh: str = "gh" + gi: str = "gi" + gr: str = "gr" + gl: str = "gl" + gd: str = "gd" + gp: str = "gp" + gu: str = "gu" + gt: str = "gt" + gn: str = "gn" + gw: str = "gw" + gy: str = "gy" + ht: str = "ht" + hm: str = "hm" + va: str = "va" + hn: str = "hn" + hk: str = "hk" + hu: str = "hu" + is_: str = "is" + in_: str = "in" + id: str = "id" + ir: str = "ir" + iq: str = "iq" + ie: str = "ie" + il: str = "il" + it: str = "it" + jm: str = "jm" + jp: str = "jp" + jo: str = "jo" + kz: str = "kz" + ke: str = "ke" + ki: str = "ki" + kp: str = "kp" + kr: str = "kr" + kw: str = "kw" + kg: str = "kg" + la: str = "la" + lv: str = "lv" + lb: str = "lb" + ls: str = "ls" + lr: str = "lr" + ly: str = "ly" + li: str = "li" + lt: str = "lt" + lu: str = "lu" + mo: str = "mo" + mk: str = "mk" + mg: str = "mg" + mw: str = "mw" + my: str = "my" + mv: str = "mv" + ml: str = "ml" + mt: str = "mt" + mh: str = "mh" + mq: str = "mq" + mr: str = "mr" + mu: str = "mu" + yt: str = "yt" + mx: str = "mx" + fm: str = "fm" + md: str = "md" + mc: str = "mc" + mn: str = "mn" + ms: str = "ms" + ma: str = "ma" + mz: str = "mz" + mm: str = "mm" + na: str = "na" + nr: str = "nr" + np: str = "np" + nl: str = "nl" + an: str = "an" + nc: str = "nc" + nz: str = "nz" + ni: str = "ni" + ne: str = "ne" + ng: str = "ng" + nu: str = "nu" + nf: str = "nf" + mp: str = "mp" + no: str = "no" + om: str = "om" + pk: str = "pk" + pw: str = "pw" + ps: str = "ps" + pa: str = "pa" + pg: str = "pg" + py: str = "py" + pe: str = "pe" + ph: str = "ph" + pn: str = "pn" + pl: str = "pl" + pt: str = "pt" + pr: str = "pr" + qa: str = "qa" + re: str = "re" + ro: str = "ro" + ru: str = "ru" + rw: str = "rw" + sh: str = "sh" + kn: str = "kn" + lc: str = "lc" + pm: str = "pm" + vc: str = "vc" + ws: str = "ws" + sm: str = "sm" + st: str = "st" + sa: str = "sa" + sn: str = "sn" + cs: str = "cs" + sc: str = "sc" + sl: str = "sl" + sg: str = "sg" + sk: str = "sk" + si: str = "si" + sb: str = "sb" + so: str = "so" + za: str = 
"za" + gs: str = "gs" + es: str = "es" + lk: str = "lk" + sd: str = "sd" + sr: str = "sr" + sj: str = "sj" + sz: str = "sz" + se: str = "se" + ch: str = "ch" + sy: str = "sy" + tw: str = "tw" + tj: str = "tj" + tz: str = "tz" + th: str = "th" + tl: str = "tl" + tg: str = "tg" + tk: str = "tk" + to: str = "to" + tt: str = "tt" + tn: str = "tn" + tr: str = "tr" + tm: str = "tm" + tc: str = "tc" + tv: str = "tv" + ug: str = "ug" + ua: str = "ua" + ae: str = "ae" + uk: str = "uk" + us: str = "us" + um: str = "um" + uy: str = "uy" + uz: str = "uz" + vu: str = "vu" + ve: str = "ve" + vn: str = "vn" + vg: str = "vg" + vi: str = "vi" + wf: str = "wf" + eh: str = "eh" + ye: str = "ye" + zm: str = "zm" + zw: str = "zw" diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py index e6e78f4..b4de089 100644 --- a/zyte_spider_templates/spiders/base.py +++ b/zyte_spider_templates/spiders/base.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from importlib.metadata import version -from typing import Annotated, Any, Dict +from typing import TYPE_CHECKING, Annotated, Any, Dict from warnings import warn import scrapy @@ -13,11 +15,17 @@ ExtractFromParam, GeolocationParam, MaxRequestsParam, + SearchQueriesParam, UrlParam, UrlsFileParam, UrlsParam, ) +if TYPE_CHECKING: + # typing.Self requires Python 3.11 + from typing_extensions import Self + + # Higher priority than command-line-defined settings (40). ARG_SETTING_PRIORITY: int = 50 @@ -26,6 +34,7 @@ class BaseSpiderParams( ExtractFromParam, MaxRequestsParam, GeolocationParam, + SearchQueriesParam, UrlsFileParam, UrlsParam, UrlParam, @@ -49,10 +58,11 @@ def deprecated(self): ), DeprecationWarning, ) + return self class BaseSpider(scrapy.Spider): - custom_settings: Dict[str, Any] = { + custom_settings: Dict[str, Any] = { # type: ignore[assignment] "ZYTE_API_TRANSPARENT_MODE": True, "_ZYTE_API_USER_AGENT": f"zyte-spider-templates/{version('zyte-spider-templates')}", } @@ -68,9 +78,13 @@ class BaseSpider(scrapy.Spider): _custom_attrs_dep = None @classmethod - def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: + def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: spider = super().from_crawler(crawler, *args, **kwargs) + # all subclasses of this need to also have Args as a subclass + # this may be possible to express in type hints instead + assert hasattr(spider, "args") + if geolocation := getattr(spider.args, "geolocation", None): # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index 5e87266..586c364 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -1,21 +1,24 @@ +from __future__ import annotations + from enum import Enum -from typing import Any, Callable, Dict, Iterable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Union, cast import scrapy -from pydantic import BaseModel, ConfigDict, Field -from scrapy import Request +from pydantic import BaseModel, ConfigDict, Field, model_validator from scrapy.crawler import Crawler from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import Args +from web_poet.page_inputs.browser import BrowserResponse from zyte_common_items import ( CustomAttributes, ProbabilityRequest, Product, ProductNavigation, + SearchRequestTemplate, ) from 
zyte_spider_templates.heuristics import is_homepage -from zyte_spider_templates.params import parse_input_params +from zyte_spider_templates.params import ExtractFrom, parse_input_params from zyte_spider_templates.spiders.base import ( ARG_SETTING_PRIORITY, INPUT_GROUP, @@ -30,11 +33,16 @@ ExtractFromParam, GeolocationParam, MaxRequestsParam, + SearchQueriesParam, UrlParam, UrlsFileParam, UrlsParam, ) +if TYPE_CHECKING: + # typing.Self requires Python 3.11 + from typing_extensions import Self + @document_enum class EcommerceCrawlStrategy(str, Enum): @@ -148,6 +156,7 @@ class EcommerceSpiderParams( MaxRequestsParam, GeolocationParam, EcommerceCrawlStrategyParam, + SearchQueriesParam, UrlsFileParam, UrlsParam, UrlParam, @@ -161,6 +170,20 @@ class EcommerceSpiderParams( }, ) + @model_validator(mode="after") + def validate_direct_item_and_search_queries(self): + if self.search_queries and self.crawl_strategy in { + EcommerceCrawlStrategy.direct_item, + EcommerceCrawlStrategy.full, + EcommerceCrawlStrategy.navigation, + }: + raise ValueError( + f"Cannot combine the {self.crawl_strategy.value!r} value of " + f"the crawl_strategy spider parameter with the search_queries " + f"spider parameter." + ) + return self + class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider): """Yield products from an e-commerce website. @@ -180,7 +203,7 @@ class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider): } @classmethod - def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: + def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: spider = super(EcommerceSpider, cls).from_crawler(crawler, *args, **kwargs) parse_input_params(spider) spider._init_extract_from() @@ -204,7 +227,7 @@ def get_start_request(self, url): if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item else self.parse_navigation ) - meta = { + meta: Dict[str, Any] = { "crawling_logs": { "page_type": "product" if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item @@ -234,19 +257,49 @@ def get_start_request(self, url): f"Heuristics won't be used to crawl other pages which might have products." 
) - return Request( + return scrapy.Request( url=url, callback=callback, meta=meta, ) - def start_requests(self) -> Iterable[Request]: - for url in self.start_urls: - yield self.get_start_request(url) + def start_requests(self) -> Iterable[scrapy.Request]: + if self.args.search_queries: + for url in self.start_urls: + meta: Dict[str, Any] = { + "crawling_logs": {"page_type": "searchRequestTemplate"}, + } + if self.args.extract_from == ExtractFrom.browserHtml: + meta["inject"] = [BrowserResponse] + yield scrapy.Request( + url=url, + callback=self.parse_search_request_template, + meta=meta, + ) + else: + for url in self.start_urls: + yield self.get_start_request(url) + + def parse_search_request_template( + self, + response: DummyResponse, + search_request_template: SearchRequestTemplate, + dynamic: DynamicDeps, + ) -> Iterable[scrapy.Request]: + probability = search_request_template.get_probability() + if probability is not None and probability <= 0: + return + for query in self.args.search_queries: + yield search_request_template.request(query=query).to_scrapy( + callback=self.parse_navigation, + meta={ + "crawling_logs": {"page_type": "productNavigation"}, + }, + ) def parse_navigation( self, response: DummyResponse, navigation: ProductNavigation - ) -> Iterable[Request]: + ) -> Iterable[scrapy.Request]: page_params = self._modify_page_params_for_heuristics( response.meta.get("page_params") ) @@ -262,9 +315,14 @@ def parse_navigation( f"are no product links found in {navigation.url}" ) else: - yield self.get_nextpage_request(navigation.nextPage) + yield self.get_nextpage_request( + cast(ProbabilityRequest, navigation.nextPage) + ) - if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only: + if ( + self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only + and not self.args.search_queries + ): for request in navigation.subCategories or []: yield self.get_subcategory_request(request, page_params=page_params) @@ -285,6 +343,7 @@ def parse_product( else: yield product else: + assert self.crawler.stats self.crawler.stats.inc_value("drop_item/product/low_probability") self.logger.info( f"Ignoring item from {response.url} since its probability is " @@ -292,9 +351,7 @@ def parse_product( ) @staticmethod - def get_parse_navigation_request_priority( - request: Union[ProbabilityRequest, Request] - ) -> int: + def get_parse_navigation_request_priority(request: ProbabilityRequest) -> int: if ( not hasattr(request, "metadata") or not request.metadata @@ -305,7 +362,7 @@ def get_parse_navigation_request_priority( def get_parse_navigation_request( self, - request: Union[ProbabilityRequest, Request], + request: ProbabilityRequest, callback: Optional[Callable] = None, page_params: Optional[Dict[str, Any]] = None, priority: Optional[int] = None, @@ -328,7 +385,7 @@ def get_parse_navigation_request( def get_subcategory_request( self, - request: Union[ProbabilityRequest, Request], + request: ProbabilityRequest, callback: Optional[Callable] = None, page_params: Optional[Dict[str, Any]] = None, priority: Optional[int] = None, @@ -350,7 +407,7 @@ def get_subcategory_request( def get_nextpage_request( self, - request: Union[ProbabilityRequest, Request], + request: ProbabilityRequest, callback: Optional[Callable] = None, page_params: Optional[Dict[str, Any]] = None, ): @@ -369,7 +426,7 @@ def get_parse_product_request( priority = self.get_parse_product_request_priority(request) probability = request.get_probability() - meta = { + meta: Dict[str, Any] = { "crawling_logs": { "name": 
request.name, "probability": probability, diff --git a/zyte_spider_templates/spiders/serp.py b/zyte_spider_templates/spiders/serp.py index 5f3a0c6..3106682 100644 --- a/zyte_spider_templates/spiders/serp.py +++ b/zyte_spider_templates/spiders/serp.py @@ -1,21 +1,65 @@ +from enum import Enum from typing import Any, Dict, Iterable, List, Optional, Union from pydantic import BaseModel, Field, field_validator from scrapy import Request from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings +from scrapy_poet import DummyResponse, DynamicDeps from scrapy_spider_metadata import Args from w3lib.url import add_or_replace_parameter -from zyte_common_items import Serp +from zyte_common_items import ( + Article, + ArticleList, + ForumThread, + JobPosting, + Product, + ProductList, + Serp, +) +from .._geolocations import GEOLOCATION_OPTIONS_WITH_CODE, Geolocation +from ..documentation import document_enum from ..params import MaxRequestsParam from ._google_domains import GoogleDomain +from ._google_gl import GOOGLE_GL_OPTIONS_WITH_CODE, GoogleGl from ._google_hl import GOOGLE_HL_OPTIONS_WITH_CODE, GoogleHl from .base import BaseSpider +class GoogleCrParam(BaseModel): + cr: Optional[str] = Field( + title="Content Countries", + description=( + "Restricts search results to documents originating in " + "particular countries. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.cr" + ), + default=None, + ) + + +class GoogleGlParam(BaseModel): + gl: Optional[GoogleGl] = Field( + title="User Country", + description=( + "Boosts results relevant to this country. See " + "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.gl" + ), + default=None, + json_schema_extra={ + "enumMeta": { + code: { + "title": GOOGLE_GL_OPTIONS_WITH_CODE[code], + } + for code in GoogleGl + } + }, + ) + + class GoogleHlParam(BaseModel): hl: Optional[GoogleHl] = Field( - title="UI Language", + title="User Language", description=( "User interface language, which can affect search results. See " "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.hl" @@ -71,6 +115,26 @@ def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]: return result +class SerpGeolocationParam(BaseModel): + # We use “geolocation” as parameter name (instead of e.g. “ip_geolocation”) + # to reuse the implementation in BaseSpider. + geolocation: Optional[Geolocation] = Field( + # The title, worded like this for contrast with gl, is the reason why + # ..params.GeolocationParam is not used. + title="IP Country", + description="Country of the IP addresses to use.", + default=None, + json_schema_extra={ + "enumMeta": { + code: { + "title": GEOLOCATION_OPTIONS_WITH_CODE[code], + } + for code in Geolocation + } + }, + ) + + class SerpMaxPagesParam(BaseModel): max_pages: int = Field( title="Max Pages", @@ -80,6 +144,71 @@ class SerpMaxPagesParam(BaseModel): ) +class SerpResultsPerPageParam(BaseModel): + results_per_page: Optional[int] = Field( + title="Results Per Page", + description="Maximum number of results per page.", + ge=1, + default=None, + ) + + +@document_enum +class SerpItemType(str, Enum): + article: str = "article" + """ + Article data. + """ + + articleList: str = "articleList" + """ + Article list data. + """ + + forumThread: str = "forumThread" + """ + Forum thread data. + """ + + jobPosting: str = "jobPosting" + """ + Job posting data. + """ + + product: str = "product" + """ + Product data. 
+ """ + + productList: str = "productList" + """ + Product list data. + """ + + +ITEM_TYPE_CLASSES = { + SerpItemType.article: Article, + SerpItemType.articleList: ArticleList, + SerpItemType.forumThread: ForumThread, + SerpItemType.jobPosting: JobPosting, + SerpItemType.product: Product, + SerpItemType.productList: ProductList, +} + + +class SerpItemTypeParam(BaseModel): + item_type: Optional[SerpItemType] = Field( + title="Follow and Extract", + description=( + "If specified, follow organic search result links, and extract " + "the selected data type from the target pages. Spider output " + "items will be of the specified data type, not search engine " + "results page items." + ), + default=None, + ) + + class GoogleDomainParam(BaseModel): domain: GoogleDomain = Field( title="Domain", @@ -89,10 +218,15 @@ class GoogleDomainParam(BaseModel): class GoogleSearchSpiderParams( - MaxRequestsParam, GoogleLrParam, GoogleHlParam, + SerpGeolocationParam, + GoogleCrParam, + GoogleGlParam, + SerpItemTypeParam, + SerpResultsPerPageParam, SerpMaxPagesParam, + MaxRequestsParam, SearchQueriesParam, GoogleDomainParam, BaseModel, @@ -110,7 +244,7 @@ class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider): """ name = "google_search" - _results_per_page = 10 + _default_results_per_page = 10 metadata: Dict[str, Any] = { **BaseSpider.metadata, @@ -133,10 +267,20 @@ def update_settings(cls, settings: BaseSettings) -> None: ) def get_serp_request(self, url: str, *, page_number: int): - if self.args.hl: - url = add_or_replace_parameter(url, "hl", self.args.hl.value) - if self.args.lr: - url = add_or_replace_parameter(url, "lr", self.args.lr) + for argument, parameter in ( + (self.args.cr, "cr"), + (self.args.gl, "gl"), + (self.args.hl, "hl"), + (self.args.lr, "lr"), + (self.args.results_per_page, "num"), + ): + if not argument: + continue + if isinstance(argument, Enum): + argument = argument.value + if not isinstance(argument, str): + argument = str(argument) + url = add_or_replace_parameter(url, parameter, argument) return Request( url=url, callback=self.parse_serp, @@ -164,9 +308,32 @@ def start_requests(self) -> Iterable[Request]: def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]: serp = Serp.from_dict(response.raw_api_response["serp"]) - next_start = page_number * self._results_per_page - if serp.organicResults and serp.metadata.totalOrganicResults > next_start: - next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) - yield self.get_serp_request(next_url, page_number=page_number + 1) + if page_number < self.args.max_pages: + next_start = page_number * ( + self.args.results_per_page or self._default_results_per_page + ) + if serp.organicResults and ( + serp.metadata.totalOrganicResults is None + or serp.metadata.totalOrganicResults > next_start + ): + next_url = add_or_replace_parameter(serp.url, "start", str(next_start)) + yield self.get_serp_request(next_url, page_number=page_number + 1) + + if self.args.item_type is None: + yield serp + return + + for result in serp.organicResults: + yield response.follow( + result.url, + callback=self.parse_result, + meta={ + "crawling_logs": {"page_type": self.args.item_type.value}, + "inject": [ITEM_TYPE_CLASSES[self.args.item_type]], + }, + ) - yield serp + def parse_result( + self, response: DummyResponse, dynamic: DynamicDeps + ) -> Iterable[Any]: + yield next(iter(dynamic.values()))
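+        # ``dynamic`` maps the single class injected through the ``inject``
+        # request meta key in ``parse_serp`` (one of ITEM_TYPE_CLASSES) to
+        # its extracted instance, which is the item yielded above.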