Merge remote-tracking branch 'zytedata/main' into google-search-max-requests-int
zytedata · Nov 22, 2024 · f121311 · f121311
2 parents 9773bca + 8e0eefb
commit f121311
Show file tree

Hide file tree

Showing 29 changed files with 2,928 additions and 206 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -22,6 +22,14 @@
 html_theme = "sphinx_rtd_theme"
 
 intersphinx_mapping = {
+    "form2request": (
+        "https://form2request.readthedocs.io/en/latest",
+        None,
+    ),
+    "formasaurus": (
+        "https://formasaurus.readthedocs.io/en/latest",
+        None,
+    ),
     "python": (
         "https://docs.python.org/3",
         None,

diff --git a/docs/customization/pages.rst b/docs/customization/pages.rst
@@ -6,7 +6,8 @@ Customizing page objects
 
 All parsing is implemented using :ref:`web-poet page objects <page-objects>`
 that use `Zyte API automatic extraction`_ to extract :ref:`standard items
-<item-api>`, both for navigation and for item details.
+<item-api>`: for navigation, for item details, and even for :ref:`search
+request generation <search-queries>`.
 
 .. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html
 
@@ -141,3 +142,27 @@ To extract a new field for one or more websites:
 
             def parse_product(self, response: DummyResponse, product: CustomProduct):
                 yield from super().parse_product(response, product)
+
+.. _fix-search:
+
+Fixing search support
+=====================
+
+If the default implementation to build a request out of :ref:`search queries
+<search-queries>` does not work on a given website, you can implement your
+own search request page object to fix that. See
+:ref:`custom-request-template-page`.
+
+For example:
+
+.. code-block:: python
+
+    from web_poet import field, handle_urls
+    from zyte_common_items import BaseSearchRequestTemplatePage
+
+
+    @handle_urls("example.com")
+    class ExampleComSearchRequestTemplatePage(BaseSearchRequestTemplatePage):
+        @field
+        def url(self):
+            return "https://example.com/search?q={{ query|quote_plus }}"
diff --git a/docs/features/search.rst b/docs/features/search.rst
@@ -0,0 +1,43 @@
+.. _search-queries:
+
+==============
+Search queries
+==============
+
+The :ref:`e-commerce spider template <e-commerce>` supports a spider argument,
+:data:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.search_queries`,
+that allows you to define a different search query per line, and
+turns the input URLs into search requests for those queries.
+
+For example, given the following input URLs:
+
+.. code-block:: none
+
+    https://a.example
+    https://b.example
+
+And the following list of search queries:
+
+.. code-block:: none
+
+    foo bar
+    baz
+
+By default, the spider sends 2 initial requests to those 2 input URLs,
+to try to find out how to build a search request for them, and if it succeeds,
+it then sends 4 search requests, 1 per combination of input URL and search
+query. For example:
+
+.. code-block:: none
+
+    https://a.example/search?q=foo+bar
+    https://a.example/search?q=baz
+    https://b.example/s/foo%20bar
+    https://b.example/s/baz
+
+The default implementation uses a combination of HTML metadata, AI-based HTML
+form inspection and heuristics to find the most likely way to build a search
+request for a given website.
+
+If this default implementation does not work as expected on a given website,
+you can :ref:`write a page object to fix that <fix-search>`.
diff --git a/docs/index.rst b/docs/index.rst
@@ -20,6 +20,12 @@ zyte-spider-templates documentation
    E-commerce <templates/e-commerce>
    Google search <templates/google-search>
 
+.. toctree::
+   :caption: Features
+   :hidden:
+
+   Search queries <features/search>
+
 .. toctree::
    :caption: Customization
    :hidden:

diff --git a/docs/reference/index.rst b/docs/reference/index.rst
@@ -52,5 +52,10 @@ Parameter mixins
 
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
 
+.. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpItemTypeParam
+    :exclude-members: model_computed_fields
+
+.. autoenum:: zyte_spider_templates.spiders.serp.SerpItemType
+
 .. autopydantic_model:: zyte_spider_templates.spiders.serp.SerpMaxPagesParam
     :exclude-members: model_computed_fields
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,3 +8,4 @@ ignore_missing_imports = true
 
 [tool.black]
 target-version = ["py38", "py39", "py310", "py311", "py312"]
+force-exclude = "template.py"
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+filterwarnings =
+    ignore:deprecated string literal syntax::jmespath.lexer
diff --git a/setup.cfg b/setup.cfg
@@ -26,6 +26,9 @@ ignore =
     # First line should not be the function's "signature"
     D402
 
+exclude =
+    template.py
+
 per-file-ignores =
     # F401: Ignore "imported but unused" errors in __init__ files, as those
     # imports are there to expose submodule functions so they can be imported

diff --git a/setup.py b/setup.py
@@ -12,13 +12,18 @@
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
+        "extruct>=0.18.0",
+        "form2request>=0.2.0",
+        "formasaurus>=0.10.0",
+        "jmespath>=0.9.5",
         "pydantic>=2.1",
-        "requests>=0.10.1",
+        "requests>=1.0.0",
         "scrapy>=2.11.0",
         "scrapy-poet>=0.24.0",
         "scrapy-spider-metadata>=0.2.0",
         "scrapy-zyte-api[provider]>=0.23.0",
-        "zyte-common-items>=0.23.0",
+        "web-poet>=0.17.1",
+        "zyte-common-items>=0.26.2",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",