E-commerce search support (#77)
Gallaecio authored Nov 22, 2024
1 parent 71a5f71 commit e9bf031
Showing 13 changed files with 1,280 additions and 11 deletions.
8 changes: 8 additions & 0 deletions docs/conf.py
@@ -22,6 +22,14 @@
html_theme = "sphinx_rtd_theme"

intersphinx_mapping = {
"form2request": (
"https://form2request.readthedocs.io/en/latest",
None,
),
"formasaurus": (
"https://formasaurus.readthedocs.io/en/latest",
None,
),
"python": (
"https://docs.python.org/3",
None,
27 changes: 26 additions & 1 deletion docs/customization/pages.rst
@@ -6,7 +6,8 @@ Customizing page objects

All parsing is implemented using :ref:`web-poet page objects <page-objects>`
that use `Zyte API automatic extraction`_ to extract :ref:`standard items
<item-api>`, both for navigation and for item details.
<item-api>`: for navigation, for item details, and even for :ref:`search
request generation <search-queries>`.

.. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html

@@ -141,3 +142,27 @@ To extract a new field for one or more websites:
    def parse_product(self, response: DummyResponse, product: CustomProduct):
        yield from super().parse_product(response, product)

.. _fix-search:

Fixing search support
=====================

If the default implementation to build a request out of :ref:`search queries
<search-queries>` does not work on a given website, you can implement your
own search request page object to fix that. See
:ref:`custom-request-template-page`.

For example:

.. code-block:: python

    from web_poet import field, handle_urls
    from zyte_common_items import BaseSearchRequestTemplatePage


    @handle_urls("example.com")
    class ExampleComSearchRequestTemplatePage(BaseSearchRequestTemplatePage):
        @field
        def url(self):
            return "https://example.com/search?q={{ query|quote_plus }}"
43 changes: 43 additions & 0 deletions docs/features/search.rst
@@ -0,0 +1,43 @@
.. _search-queries:

==============
Search queries
==============

The :ref:`e-commerce spider template <e-commerce>` supports a spider argument,
:data:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.search_queries`,
that lets you define one search query per line and turns the input URLs into
search requests for those queries.

For example, given the following input URLs:

.. code-block:: none

    https://a.example
    https://b.example

And the following list of search queries:

.. code-block:: none

    foo bar
    baz

By default, the spider first sends 2 requests, one per input URL, to find out
how to build a search request for each website. If that succeeds, it then
sends 4 search requests, 1 per combination of input URL and search query. For
example:

.. code-block:: none

    https://a.example/search?q=foo+bar
    https://a.example/search?q=baz
    https://b.example/s/foo%20bar
    https://b.example/s/baz

The default implementation uses a combination of HTML metadata, AI-based HTML
form inspection and heuristics to find the most likely way to build a search
request for a given website.

If this default implementation does not work as expected on a given website,
you can :ref:`write a page object to fix that <fix-search>`.
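
For illustration, here is a minimal sketch of launching the spider with search
queries from Python, assuming a Scrapy project where scrapy-poet and
scrapy-zyte-api are already configured; in practice the spider is usually run
with ``scrapy crawl ecommerce`` or through Scrapy Cloud:

.. code-block:: python

    from scrapy.crawler import CrawlerProcess

    from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

    # Sketch only: the project settings must enable scrapy-poet and
    # scrapy-zyte-api for the template spiders to work.
    process = CrawlerProcess()
    process.crawl(
        EcommerceSpider,
        url="https://a.example",
        search_queries="foo bar\nbaz",  # one query per line
    )
    process.start()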
6 changes: 6 additions & 0 deletions docs/index.rst
@@ -20,6 +20,12 @@ zyte-spider-templates documentation
    E-commerce <templates/e-commerce>
    Google search <templates/google-search>

.. toctree::
    :caption: Features
    :hidden:

    Search queries <features/search>

.. toctree::
    :caption: Customization
    :hidden:
3 changes: 3 additions & 0 deletions pytest.ini
@@ -0,0 +1,3 @@
[pytest]
filterwarnings =
    ignore:deprecated string literal syntax::jmespath.lexer
9 changes: 7 additions & 2 deletions setup.py
@@ -12,13 +12,18 @@
    packages=find_packages(),
    include_package_data=True,
    install_requires=[
        "extruct>=0.18.0",
        "form2request>=0.2.0",
        "formasaurus>=0.10.0",
        "jmespath>=0.9.5",
        "pydantic>=2.1",
        "requests>=0.10.1",
        "requests>=1.0.0",
        "scrapy>=2.11.0",
        "scrapy-poet>=0.24.0",
        "scrapy-spider-metadata>=0.2.0",
        "scrapy-zyte-api[provider]>=0.23.0",
        "zyte-common-items>=0.23.0",
        "web-poet>=0.17.1",
        "zyte-common-items>=0.25.0",
    ],
    classifiers=[
        "Development Status :: 3 - Alpha",
112 changes: 111 additions & 1 deletion tests/test_ecommerce.py
@@ -7,7 +7,14 @@
from pydantic import ValidationError
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import get_spider_metadata
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation
from web_poet.page_inputs.browser import BrowserResponse
from zyte_common_items import (
    ProbabilityRequest,
    Product,
    ProductNavigation,
    SearchRequestTemplate,
    SearchRequestTemplateMetadata,
)

from zyte_spider_templates._geolocations import (
    GEOLOCATION_OPTIONS,
@@ -37,6 +44,19 @@ def test_parameters():
    with pytest.raises(ValidationError):
        EcommerceSpider(url="https://example.com", crawl_strategy="unknown")

    EcommerceSpider(
        url="https://example.com", crawl_strategy="direct_item", search_queries=""
    )
    EcommerceSpider(
        url="https://example.com", crawl_strategy="automatic", search_queries="foo"
    )
    with pytest.raises(ValidationError):
        EcommerceSpider(
            url="https://example.com",
            crawl_strategy="direct_item",
            search_queries="foo",
        )


def test_start_requests():
    url = "https://example.com"
@@ -258,6 +278,33 @@ def test_parse_product(probability, has_item, item_drop, caplog):
    assert str(product) in caplog.text


@pytest.mark.parametrize(
    ("probability", "yields_items"),
    (
        (None, True),  # Default
        (-1.0, False),
        (0.0, False),  # page.no_item_found()
        (1.0, True),
    ),
)
def test_parse_search_request_template_probability(probability, yields_items):
    crawler = get_crawler()
    spider = EcommerceSpider.from_crawler(
        crawler, url="https://example.com", search_queries="foo"
    )
    search_request_template = SearchRequestTemplate(url="https://example.com")
    if probability is not None:
        search_request_template.metadata = SearchRequestTemplateMetadata(
            probability=probability
        )
    items = list(
        spider.parse_search_request_template(
            DummyResponse("https://example.com"), search_request_template, DynamicDeps()
        )
    )
    assert items if yields_items else not items


def test_arguments():
    # Ensure passing no arguments works.
    crawler = get_crawler()
@@ -420,6 +467,17 @@ def test_metadata():
"title": "URLs file",
"type": "string",
},
"search_queries": {
"default": [],
"description": (
"A list of search queries, one per line, to submit "
"using the search form found on each input URL."
),
"items": {"type": "string"},
"title": "Search Queries",
"type": "array",
"widget": "textarea",
},
"crawl_strategy": {
"default": "automatic",
"description": "Determines how the start URL and follow-up URLs are crawled.",
@@ -820,6 +878,58 @@ def test_urls_file():
    assert start_requests[2].url == "https://c.example"


def test_search_queries():
    crawler = get_crawler()
    url = "https://example.com"

    spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo bar")
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].url == url
    assert start_requests[0].callback == spider.parse_search_request_template
    assert spider.args.search_queries == ["foo bar"]

    spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo\nbar")
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].url == url
    assert start_requests[0].callback == spider.parse_search_request_template
    assert spider.args.search_queries == ["foo", "bar"]

    spider = EcommerceSpider.from_crawler(
        crawler, url=url, search_queries=["foo", "bar"]
    )
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].url == url
    assert start_requests[0].callback == spider.parse_search_request_template
    assert spider.args.search_queries == ["foo", "bar"]


def test_search_queries_extract_from():
    crawler = get_crawler()
    url = "https://example.com"

    spider = EcommerceSpider.from_crawler(crawler, url=url, search_queries="foo")
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert "inject" not in start_requests[0].meta

    spider = EcommerceSpider.from_crawler(
        crawler, url=url, search_queries="foo", extract_from="httpResponseBody"
    )
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert "inject" not in start_requests[0].meta

    spider = EcommerceSpider.from_crawler(
        crawler, url=url, search_queries="foo", extract_from="browserHtml"
    )
    start_requests = list(spider.start_requests())
    assert len(start_requests) == 1
    assert start_requests[0].meta["inject"] == [BrowserResponse]


@pytest.mark.parametrize(
"url,has_full_domain",
(
