E-commerce search support (#77)
Gallaecio authored Nov 22, 2024
1 parent 71a5f71 commit e9bf031
Showing 13 changed files with 1,280 additions and 11 deletions.
8 changes: 8 additions & 0 deletions docs/conf.py
@@ -22,6 +22,14 @@
html_theme = "sphinx_rtd_theme"

intersphinx_mapping = {
"form2request": (
"https://form2request.readthedocs.io/en/latest",
None,
),
"formasaurus": (
"https://formasaurus.readthedocs.io/en/latest",
None,
),
"python": (
"https://docs.python.org/3",
None,
27 changes: 26 additions & 1 deletion docs/customization/pages.rst
@@ -6,7 +6,8 @@ Customizing page objects

All parsing is implemented using :ref:`web-poet page objects <page-objects>`
that use `Zyte API automatic extraction`_ to extract :ref:`standard items
<item-api>`, both for navigation and for item details.
<item-api>`: for navigation, for item details, and even for :ref:`search
request generation <search-queries>`.

.. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html

@@ -141,3 +142,27 @@ To extract a new field for one or more websites:
    def parse_product(self, response: DummyResponse, product: CustomProduct):
        yield from super().parse_product(response, product)

.. _fix-search:

Fixing search support
=====================

If the default implementation to build a request out of :ref:`search queries
<search-queries>` does not work on a given website, you can implement your
own search request page object to fix that. See
:ref:`custom-request-template-page`.

For example:

.. code-block:: python

    from web_poet import field, handle_urls
    from zyte_common_items import BaseSearchRequestTemplatePage


    @handle_urls("example.com")
    class ExampleComSearchRequestTemplatePage(BaseSearchRequestTemplatePage):
        @field
        def url(self):
            return "https://example.com/search?q={{ query|quote_plus }}"
43 changes: 43 additions & 0 deletions docs/features/search.rst
@@ -0,0 +1,43 @@
.. _search-queries:

==============
Search queries
==============

The :ref:`e-commerce spider template <e-commerce>` supports a spider argument,
:data:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.search_queries`,
that lets you define one search query per line and turns the input URLs into
search requests for those queries.

For example, given the following input URLs:

.. code-block:: none

    https://a.example
    https://b.example

And the following list of search queries:

.. code-block:: none

    foo bar
    baz

By default, the spider first sends 2 requests, one per input URL, to find out
how to build a search request for each website. If that succeeds, it then
sends 4 search requests, 1 per combination of input URL and search query. For
example:

.. code-block:: none

    https://a.example/search?q=foo+bar
    https://a.example/search?q=baz
    https://b.example/s/foo%20bar
    https://b.example/s/baz

The default implementation uses a combination of HTML metadata, AI-based HTML
form inspection and heuristics to find the most likely way to build a search
request for a given website.

If this default implementation does not work as expected on a given website,
you can :ref:`write a page object to fix that <fix-search>`.
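
For illustration, here is a minimal sketch of launching the spider with search
queries from Python, assuming a Scrapy project where scrapy-poet and
scrapy-zyte-api are already configured; in practice the spider is usually run
with ``scrapy crawl ecommerce`` or through Scrapy Cloud:

.. code-block:: python

    from scrapy.crawler import CrawlerProcess

    from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

    # Sketch only: the project settings must enable scrapy-poet and
    # scrapy-zyte-api for the template spiders to work.
    process = CrawlerProcess()
    process.crawl(
        EcommerceSpider,
        url="https://a.example",
        search_queries="foo bar\nbaz",  # one query per line
    )
    process.start()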
6 changes: 6 additions & 0 deletions docs/index.rst
@@ -20,6 +20,12 @@ zyte-spider-templates documentation
    E-commerce <templates/e-commerce>
    Google search <templates/google-search>

.. toctree::
    :caption: Features
    :hidden:

    Search queries <features/search>

.. toctree::
    :caption: Customization
    :hidden:
3 changes: 3 additions & 0 deletions pytest.ini
@@ -0,0 +1,3 @@
[pytest]
filterwarnings =
    ignore:deprecated string literal syntax::jmespath.lexer
9 changes: 7 additions & 2 deletions setup.py
@@ -12,13 +12,18 @@
    packages=find_packages(),
    include_package_data=True,
    install_requires=[
        "extruct>=0.18.0",
        "form2request>=0.2.0",
        "formasaurus>=0.10.0",
        "jmespath>=0.9.5",
        "pydantic>=2.1",
        "requests>=0.10.1",
        "requests>=1.0.0",
        "scrapy>=2.11.0",
        "scrapy-poet>=0.24.0",
        "scrapy-spider-metadata>=0.2.0",
        "scrapy-zyte-api[provider]>=0.23.0",
        "zyte-common-items>=0.23.0",
        "web-poet>=0.17.1",
        "zyte-common-items>=0.25.0",
    ],
    classifiers=[
        "Development Status :: 3 - Alpha",
112 changes: 111 additions & 1 deletion tests/test_ecommerce.py
@@ -7,7 +7,14 @@
from pydantic import ValidationError
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import get_spider_metadata
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
from web_poet.page_inputs.browser import BrowserResponse
from zyte_common_items import (
    ProbabilityRequest,
    Product,
    ProductNavigation,
    SearchRequestTemplate,
    SearchRequestTemplateMetadata,
)

from zyte_spider_templates._geolocations import (
    GEOLOCATION_OPTIONS,
@@ -37,6 +44,19 @@ def test_parameters():
    with pytest.raises(ValidationError):
        EcommerceSpider(url="https://example.com", crawl_strategy="unknown")

    EcommerceSpider(
        url="https://example.com", crawl_strategy="direct_item", search_queries=""
    )
    EcommerceSpider(
        url="https://example.com", crawl_strategy="automatic", search_queries="foo"
    )
    with pytest.raises(ValidationError):
        EcommerceSpider(
            url="https://example.com",
            crawl_strategy="direct_item",
            search_queries="foo",
        )


def test_start_requests():
    url = "https://example.com"
@@ -258,6 +278,33 @@ def test_parse_product(probability, has_item, item_drop, caplog):
    assert str(product) in caplog.text


@pytest.mark.parametrize(
    ("probability", "yields_items"),
    (
        (None, True),  # Default
        (-1.0, False),
        (0.0, False),  # page.no_item_found()
        (1.0, True),
    ),
)
def test_parse_search_request_template_probability(probability, yields_items):
    crawler = get_crawler()
    spider = EcommerceSpider.from_crawler(
        crawler, url="https://example.com", search_queries="foo"
    )
    search_request_template = SearchRequestTemplate(url="https://example.com")
    if probability is not None:
        search_request_template.metadata = SearchRequestTemplateMetadata(
            probability=probability
        )
    items = list(
        spider.parse_search_request_template(
            DummyResponse("https://example.com"), search_request_template, DynamicDeps()
        )
    )
    assert items if yields_items else not items


def test_arguments():
    # Ensure passing no arguments works.
    crawler = get_crawler()
@@ -420,6 +467,17 @@ def test_metadata():
"title": "URLs file",
"type": "string",
},
"search_queries": {
"default": [],
"description": (
"A list of search queries, one per line, to submit "
"using the search form found on each input URL."
),
"items": {"type": "string"},
"title": "Search Queries",
"type": "array",
"widget": "textarea",
},
"crawl_strategy": {
"default": "automatic",
"description": "Determines how the start URL and follow-up URLs are crawled.",
@@ -820,6 +878,58 @@ def test_urls_file():
    assert start_requests[2].url == "https://c.example"


def test_search_queries():
    crawler = get_crawler()
    url = "https://example.com"

    spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo bar")
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].url == url
    assert start_requests[0].callback == spider.parse_search_request_template
    assert spider.args.search_queries == ["foo bar"]

    spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo\nbar")
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].url == url
    assert start_requests[0].callback == spider.parse_search_request_template
    assert spider.args.search_queries == ["foo", "bar"]

    spider = EcommerceSpider.from_crawler(
        crawler, url=url, search_queries=["foo", "bar"]
    )
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].url == url
    assert start_requests[0].callback == spider.parse_search_request_template
    assert spider.args.search_queries == ["foo", "bar"]


def test_search_queries_extract_from():
    crawler = get_crawler()
    url = "https://example.com"

    spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo")
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert "inject" not in start_requests[0].meta

    spider = EcommerceSpider.from_crawler(
        crawler, url=url, search_queries="foo", extract_from="httpResponseBody"
    )
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert "inject" not in start_requests[0].meta

    spider = EcommerceSpider.from_crawler(
        crawler, url=url, search_queries="foo", extract_from="browserHtml"
    )
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].meta["inject"] == [BrowserResponse]


@pytest.mark.parametrize(
"url,has_full_domain",
(
