From 13f45c719c1fd714fbd69a25e98cc5863bbf4da7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Nov 2023 09:43:25 +0100 Subject: [PATCH] Initial docs (#9) --- .bumpversion.cfg | 2 + .github/workflows/test.yml | 2 +- .pre-commit-config.yaml | 6 + .readthedocs.yml | 12 ++ README.rst | 25 +--- docs/Makefile | 20 +++ docs/_ext/__init__.py | 25 ++++ docs/changes.rst | 1 + docs/conf.py | 47 +++++++ docs/customization/index.rst | 13 ++ docs/customization/page-objects.rst | 143 +++++++++++++++++++++ docs/customization/spiders.rst | 105 +++++++++++++++ docs/index.rst | 34 +++++ docs/make.bat | 35 +++++ docs/requirements.txt | 5 + docs/setup.rst | 76 +++++++++++ docs/templates/e-commerce.rst | 24 ++++ docs/templates/index.rst | 31 +++++ tests/test_ecommerce.py | 4 + tox.ini | 7 + zyte_spider_templates/documentation.py | 6 + zyte_spider_templates/middlewares.py | 24 ++-- zyte_spider_templates/spiders/ecommerce.py | 19 +++ 23 files changed, 637 insertions(+), 29 deletions(-) create mode 100644 .readthedocs.yml create mode 100644 docs/Makefile create mode 100644 docs/_ext/__init__.py create mode 100644 docs/changes.rst create mode 100644 docs/conf.py create mode 100644 docs/customization/index.rst create mode 100644 docs/customization/page-objects.rst create mode 100644 docs/customization/spiders.rst create mode 100644 docs/index.rst create mode 100644 docs/make.bat create mode 100644 docs/requirements.txt create mode 100644 docs/setup.rst create mode 100644 docs/templates/e-commerce.rst create mode 100644 docs/templates/index.rst create mode 100644 zyte_spider_templates/documentation.py diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 9c83bf8..7955d88 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -5,3 +5,5 @@ tag = True tag_name = {new_version} [bumpversion:file:setup.py] + +[bumpversion:file:docs/conf.py] diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1a5be53..2d18612 100644 --- 
a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: python-version: ["3.11"] - tox-job: ["mypy", "linters", "twine"] + tox-job: ["mypy", "linters", "twine", "docs"] steps: - uses: actions/checkout@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 03f2874..42aa3e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,3 +11,9 @@ repos: rev: 6.1.0 hooks: - id: flake8 +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==23.10.1 diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..1519565 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,12 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py +build: + os: ubuntu-22.04 + tools: + python: "3.11" # Keep in sync with .github/workflows/test.yml +python: + install: + - requirements: docs/requirements.txt + - path: . diff --git a/README.rst b/README.rst index cf06844..b4ecfbf 100644 --- a/README.rst +++ b/README.rst @@ -19,6 +19,8 @@ zyte-spider-templates :alt: Coverage report +.. description starts + Spider templates for automatic crawlers. This library contains Scrapy_ spider templates. They can be used out of the box @@ -30,24 +32,7 @@ starting point for your own projects. .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html .. _sample Scrapy project: https://github.com/zytedata/zyte-spider-templates-project +.. description ends -Requirements -============ - -* Python 3.8+ -* Scrapy 2.11+ - - -Installation -============ - -.. code-block:: - - pip install zyte-spider-templates - - -Spiders -======= - -* ``EcommerceSpider``: a spider that can scrape products from any e-commerce - website. 
+* Documentation: https://zyte-spider-templates.readthedocs.io/en/latest/ +* License: BSD 3-clause diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py new file mode 100644 index 0000000..5a3839e --- /dev/null +++ b/docs/_ext/__init__.py @@ -0,0 +1,25 @@ +def setup(app): + # https://stackoverflow.com/a/13663325 + # + # Scrapy’s + # https://github.com/scrapy/scrapy/blob/dba37674e6eaa6c2030c8eb35ebf8127cd488062/docs/_ext/scrapydocs.py#L90C16-L110C6 + app.add_crossref_type( + directivename="setting", + rolename="setting", + indextemplate="pair: %s; setting", + ) + app.add_crossref_type( + directivename="signal", + rolename="signal", + indextemplate="pair: %s; signal", + ) + app.add_crossref_type( + directivename="command", + rolename="command", + indextemplate="pair: %s; command", + ) + app.add_crossref_type( + directivename="reqmeta", + rolename="reqmeta", + indextemplate="pair: %s; reqmeta", + ) diff --git a/docs/changes.rst b/docs/changes.rst new file mode 100644 index 0000000..d9e113e --- /dev/null +++ b/docs/changes.rst @@ -0,0 +1 @@ +.. 
include:: ../CHANGES.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..a4b045f --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,47 @@ +import sys +from pathlib import Path + +project = "zyte-spider-templates" +copyright = "2023, Zyte Group Ltd" +author = "Zyte Group Ltd" +release = "0.2.0" + +sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext +extensions = [ + "_ext", + "enum_tools.autoenum", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", + "sphinxcontrib.autodoc_pydantic", +] + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +html_theme = "sphinx_rtd_theme" + +intersphinx_mapping = { + "python": ( + "https://docs.python.org/3", + None, + ), + "scrapy": ( + "https://docs.scrapy.org/en/latest", + None, + ), + "scrapy-poet": ( + "https://scrapy-poet.readthedocs.io/en/stable", + None, + ), + "web-poet": ( + "https://web-poet.readthedocs.io/en/stable", + None, + ), + "zyte-common-items": ( + "https://zyte-common-items.readthedocs.io/en/latest", + None, + ), +} + +autodoc_pydantic_model_show_field_summary = False +autodoc_pydantic_model_show_json = False diff --git a/docs/customization/index.rst b/docs/customization/index.rst new file mode 100644 index 0000000..ad29712 --- /dev/null +++ b/docs/customization/index.rst @@ -0,0 +1,13 @@ +.. _customization: + +============= +Customization +============= + +:ref:`Built-in spider templates <spider-templates>` can be highly customized: + +- :ref:`Subclass spider templates <custom-spiders>` to customize metadata, + parameters, and crawling logic. + +- :ref:`Implement page objects <custom-page-objects>` to override parsing + logic for all or some websites, both for navigation and item detail data. diff --git a/docs/customization/page-objects.rst b/docs/customization/page-objects.rst new file mode 100644 index 0000000..ed763a7 --- /dev/null +++ b/docs/customization/page-objects.rst @@ -0,0 +1,143 @@ +.. 
_custom-page-objects: + +======================== +Customizing page objects +======================== + +All parsing is implemented using :ref:`web-poet page objects ` +that use `Zyte API automatic extraction`_ to extract :ref:`standard items +`, both for navigation and for item details. + +.. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html + +You can implement your own page object classes to override how extraction works +for any given combination of URL and item type. + +.. tip:: Make sure the import path of your page objects module is in the + :ref:`SCRAPY_POET_DISCOVER ` setting, otherwise your + page objects might be ignored. + +.. _configured scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project + +.. _override-parsing: + +Overriding parsing +================== + +To change or fix how a given field is extracted, overriding the value from +`Zyte API automatic extraction`_, create a page object class, configured to run +on some given URLs (:func:`web_poet.handle_urls`), that defines the logic to +extract that field. For example: + +.. 
code-block:: python + :caption: page_objects/books_toscrape_com.py + + import attrs + from number_parser import parse_number + from web_poet import HttpResponse, field, handle_urls + from zyte_common_items import AggregateRating, AutoProductPage + + + @handle_urls("books.toscrape.com") + @attrs.define + class BooksToScrapeComProductPage(AutoProductPage): + response: HttpResponse + + @field + async def aggregateRating(self): + element_class = self.response.css(".star-rating::attr(class)").get() + if not element_class: + return None + rating_str = element_class.split(" ")[-1] + rating = parse_number(rating_str) + if not rating: + return None + return AggregateRating(ratingValue=rating, bestRating=5) + +``AutoProductPage`` and other page objects from `zyte-common-items`_ +prefixed with ``Auto`` define fields for all standard items that return +the value from `Zyte API automatic extraction`_, so that you only need +to define your new field. + +.. _zyte-common-items: https://zyte-common-items.readthedocs.io/en/latest/ + +The page object above is decorated with ``@attrs.define`` so that it can +declare a dependency on :class:`~web_poet.page_inputs.http.HttpResponse` and +use that to implement custom parsing logic. You could alternatively use +:class:`~web_poet.page_inputs.browser.BrowserHtml` if needed. + + +.. _add-field: + +Parsing a new field +=================== + +To extract a new field for one or more websites: + +#. Declare a new item type that extends a :ref:`standard item ` with + your new field. For example: + + .. code-block:: python + :caption: items.py + + from typing import Optional + + import attrs + from zyte_common_items import Product + + + @attrs.define + class CustomProduct(Product): + stock: Optional[int] + +#. Create a page object class, configured to run for your new item type + (:class:`web_poet.pages.Returns`) on some given URLs + (:func:`web_poet.handle_urls`), that defines the logic to extract your new + field. For example: + + .. 
code-block:: python + :caption: page_objects/books_toscrape_com.py + + import re + + from web_poet import Returns, field, handle_urls + from zyte_common_items import AutoProductPage + + from ..items import CustomProduct + + + @handle_urls("books.toscrape.com") + class BookPage(AutoProductPage, Returns[CustomProduct]): + @field + async def stock(self): + for entry in await self.additionalProperties: + if entry.name == "availability": + match = re.search(r"\d([.,\s]*\d+)*(?=\s+available\b)", entry.value) + if not match: + return None + stock_str = re.sub(r"[.,\s]", "", match[0]) + return int(stock_str) + return None + +#. Create a spider template subclass that requests your new item type instead + of the standard one. For example: + + .. code-block:: python + :caption: spiders/books_toscrape_com.py + + from scrapy_poet import DummyResponse + from zyte_spider_templates import EcommerceSpider + + from ..items import CustomProduct + + + class BooksToScrapeComSpider(EcommerceSpider): + name = "books_toscrape_com" + metadata = { + **EcommerceSpider.metadata, + "title": "Books to Scrape", + "description": "Spider template for books.toscrape.com", + } + + def parse_product(self, response: DummyResponse, product: CustomProduct): + yield from super().parse_product(response, product) diff --git a/docs/customization/spiders.rst b/docs/customization/spiders.rst new file mode 100644 index 0000000..a4bc8a1 --- /dev/null +++ b/docs/customization/spiders.rst @@ -0,0 +1,105 @@ +.. _custom-spiders: + +============================ +Customizing spider templates +============================ + +Subclass a :ref:`built-in spider template <spider-templates>` to customize its +:ref:`metadata <custom-metadata>`, :ref:`parameters <custom-params>`, and +:ref:`crawling logic <custom-crawl>`. + +.. _custom-metadata: + +Customizing metadata +==================== + +Spider template metadata is defined using `scrapy-spider-metadata`_, and can be +`redefined or customized in a subclass`_. 
+ +For example, to keep the upstream ``title`` but change the ``description``: + +.. _scrapy-spider-metadata: https://scrapy-spider-metadata.readthedocs.io/en/latest +.. _redefined or customized in a subclass: https://scrapy-spider-metadata.readthedocs.io/en/latest/metadata.html#defining-spider-metadata + +.. code-block:: python + + from zyte_spider_templates import EcommerceSpider + + + class MySpider(EcommerceSpider): + name = "my_spider" + metadata = { + **EcommerceSpider.metadata, + "description": "Custom e-commerce spider template.", + } + + +.. _custom-params: + +Customizing parameters +====================== + +Spider template parameters are also defined using `scrapy-spider-metadata`_, +and can be `redefined or customized in a subclass as well`_. + +For example, to add a ``min_price`` parameter and filter out products with a +lower price: + +.. _scrapy-spider-metadata: https://scrapy-spider-metadata.readthedocs.io/en/latest +.. _redefined or customized in a subclass as well: https://scrapy-spider-metadata.readthedocs.io/en/latest/params.html + +.. code-block:: python + + from decimal import Decimal + from typing import Iterable + + from scrapy_poet import DummyResponse + from scrapy_spider_metadata import Args + from zyte_common_items import Product + from zyte_spider_templates import EcommerceSpider, EcommerceSpiderParams + + + class MyParams(EcommerceSpiderParams): + min_price: str = "0.00" + + + class MySpider(Args[MyParams], EcommerceSpider): + name = "my_spider" + + def parse_product( + self, response: DummyResponse, product: Product + ) -> Iterable[Product]: + for product in super().parse_product(response, product): + if Decimal(product.price) >= Decimal(self.args.min_price): + yield product + + +.. _custom-crawl: + +Customizing the crawling logic +============================== + +The crawling logic of spider templates can be customized as any other +:ref:`Scrapy spider `. 
+ +For example, you can make a spider that expects a product details URL and does +not follow navigation at all: + +.. code-block:: python + + from typing import Iterable + + from scrapy import Request + from zyte_spider_templates import EcommerceSpider + + + class MySpider(EcommerceSpider): + name = "my_spider" + + def start_requests(self) -> Iterable[Request]: + for request in super().start_requests(): + yield request.replace(callback=self.parse_product) + +All parsing logic is implemented separately in :ref:`page objects +<custom-page-objects>`, making it easier to read the code of :ref:`built-in +spider templates <spider-templates>` to modify them as desired. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..af82f6d --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,34 @@ +=================================== +zyte-spider-templates documentation +=================================== + +.. include:: ../README.rst + :start-after: .. description starts + :end-before: .. description ends + +.. toctree:: + :caption: First steps + :hidden: + + setup + +.. toctree:: + :caption: Templates + :hidden: + + templates/index + E-commerce <templates/e-commerce> + +.. toctree:: + :caption: Customization + :hidden: + + customization/index + customization/spiders + customization/page-objects + +.. toctree:: + :caption: All the rest + :hidden: + + changes diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..954237b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..7a1fba9 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,5 @@ +autodoc_pydantic==2.0.1 +enum-tools==0.11.0 +Sphinx==7.2.6 +sphinx-rtd-theme==1.3.0 +sphinx-toolbox==3.5.0 # optional dependency of enum-tools diff --git a/docs/setup.rst b/docs/setup.rst new file mode 100644 index 0000000..96304a4 --- /dev/null +++ b/docs/setup.rst @@ -0,0 +1,76 @@ +============= +Initial setup +============= + +Learn how to get :ref:`spider templates ` installed and +configured on an existing Scrapy_ project. + +.. _Scrapy: https://docs.scrapy.org/en/latest/ + +.. tip:: If you do not have a Scrapy project yet, use + `zyte-spider-templates-project`_ as a starting template to get started + quickly. + +.. _zyte-spider-templates-project: https://github.com/zytedata/zyte-spider-templates-project + +Requirements +============ + +- Python 3.8+ + +- Scrapy 2.11+ + +For Zyte API features, including AI-powered parsing, you need a `Zyte API`_ +subscription. + +.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html + +Installation +============ + +.. code-block:: shell + + pip install zyte-spider-templates + + +.. _config: + +Configuration +============= + +In your Scrapy project settings (usually in ``settings.py``): + +- Update :setting:`SPIDER_MODULES ` to include + ``"zyte_spider_templates.spiders"``. + +- `Configure scrapy-poet`_, and update :ref:`SCRAPY_POET_DISCOVER + ` to include + ``"zyte_spider_templates.page_objects"``. + + .. 
_Configure scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project + +For Zyte API features, including AI-powered parsing, `configure +scrapy-zyte-api`_ with `scrapy-poet integration`_. + +.. _configure scrapy-zyte-api: https://github.com/scrapy-plugins/scrapy-zyte-api#quick-start +.. _scrapy-poet integration: https://github.com/scrapy-plugins/scrapy-zyte-api#scrapy-poet-integration + +The following additional settings are recommended: + +- Set :setting:`CLOSESPIDER_TIMEOUT_NO_ITEM + <scrapy:CLOSESPIDER_TIMEOUT_NO_ITEM>` to 600, to force the spider to stop + if no item has been found for 10 minutes. + +- Set :setting:`SCHEDULER_DISK_QUEUE <scrapy:SCHEDULER_DISK_QUEUE>` to + ``"scrapy.squeues.PickleFifoDiskQueue"`` and + :setting:`SCHEDULER_MEMORY_QUEUE <scrapy:SCHEDULER_MEMORY_QUEUE>` to + ``"scrapy.squeues.FifoMemoryQueue"``, for better request priority handling. + +- Update :setting:`SPIDER_MIDDLEWARES <scrapy:SPIDER_MIDDLEWARES>` to include + ``"zyte_spider_templates.middlewares.CrawlingLogsMiddleware": 1000``, to log crawl + data in JSON format for debugging purposes. + +For an example of a properly configured ``settings.py`` file, see `the one +in zyte-spider-templates-project`_. + +.. _the one in zyte-spider-templates-project: https://github.com/zytedata/zyte-spider-templates-project/blob/main/zyte_spider_templates_project/settings.py diff --git a/docs/templates/e-commerce.rst b/docs/templates/e-commerce.rst new file mode 100644 index 0000000..e2a8684 --- /dev/null +++ b/docs/templates/e-commerce.rst @@ -0,0 +1,24 @@ +.. _e-commerce: + +========================================== +E-commerce spider template (``ecommerce``) +========================================== + +Basic use +========= + +.. code-block:: shell + + scrapy crawl ecommerce -a url="https://books.toscrape.com" + +Parameters +========== + +.. autopydantic_model:: zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams + :inherited-members: BaseModel + +.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy + +.. 
autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom + +.. autoenum:: zyte_spider_templates.spiders.base.Geolocation diff --git a/docs/templates/index.rst b/docs/templates/index.rst new file mode 100644 index 0000000..c70a7de --- /dev/null +++ b/docs/templates/index.rst @@ -0,0 +1,31 @@ +.. _spider-templates: + +================ +Spider templates +================ + +Built-in `spider templates`_ use `Zyte API automatic extraction`_ to provide +automatic crawling and parsing, i.e. you can run these spiders on any website +of the right type to automatically extract the desired structured data. + +.. _spider templates: https://docs.zyte.com/scrapy-cloud/usage/spiders.html#spider-templates-and-virtual-spiders +.. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html + +For example, to extract all products from an e-commerce website, you can run +the :ref:`e-commerce spider <e-commerce>` as follows: + +.. code-block:: shell + + scrapy crawl ecommerce -a url="https://books.toscrape.com" + +Spider templates support additional parameters beyond ``url``. See the +documentation of each specific spider for details. + +You can also :ref:`customize spider templates <customization>` to meet your +needs. + +Spider template list +==================== + +:ref:`E-commerce <e-commerce>` + Get products from an e-commerce website. diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 8c6d244..6d27c92 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -346,6 +346,10 @@ def test_metadata(): "anyOf": [{"type": "string"}, {"type": "null"}], "default": None, "title": "Extraction source", + "description": ( + "Whether to perform extraction using a browser request " + "(browserHtml) or an HTTP request (httpResponseBody)." 
+ ), "enum": ["httpResponseBody", "browserHtml"], "enumMeta": { "httpResponseBody": { diff --git a/tox.ini b/tox.ini index a22c785..b7c20d4 100644 --- a/tox.ini +++ b/tox.ini @@ -34,3 +34,10 @@ deps = commands = python setup.py sdist twine check dist/* + +[testenv:docs] +changedir = docs +deps = + -rdocs/requirements.txt +commands = + sphinx-build -W -b html . {envtmpdir}/html \ No newline at end of file diff --git a/zyte_spider_templates/documentation.py b/zyte_spider_templates/documentation.py new file mode 100644 index 0000000..a134b4d --- /dev/null +++ b/zyte_spider_templates/documentation.py @@ -0,0 +1,6 @@ +try: + from enum_tools.documentation import document_enum +except ImportError: + + def document_enum(func): + return func diff --git a/zyte_spider_templates/middlewares.py b/zyte_spider_templates/middlewares.py index 5aef138..526f2e6 100644 --- a/zyte_spider_templates/middlewares.py +++ b/zyte_spider_templates/middlewares.py @@ -39,18 +39,19 @@ class CrawlingLogsMiddleware: def process_spider_output(self, response, result, spider): result = list(result) + crawl_logs = self.crawl_logs(response, result) + logger.info(crawl_logs) + return result + + def crawl_logs(self, response, result): + current_page_type = response.meta.get("crawling_logs", {}).get("page_type") with warnings.catch_warnings(): warnings.filterwarnings( "ignore", category=ScrapyDeprecationWarning, message="Call to deprecated function scrapy.utils.request.request_fingerprint()*", ) - crawl_logs = self.crawl_logs(response, result) - logger.info(crawl_logs) - return result - - def crawl_logs(self, response, result): - current_page_type = response.meta.get("crawling_logs", {}).get("page_type") + fingerprint = request_fingerprint(response.request) data: Dict[str, Any] = { "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "current": { @@ -58,7 +59,7 @@ def crawl_logs(self, response, result): "request_url": response.request.url, # TODO: update this when the following is updated to use the same 
fingerprinter # with Scrapy: https://github.com/scrapinghub/scrapinghub-entrypoint-scrapy/ - "request_fingerprint": request_fingerprint(response.request), + "request_fingerprint": fingerprint, "page_type": current_page_type, "probability": response.meta.get("crawling_logs", {}).get( "probability" @@ -76,11 +77,18 @@ def crawl_logs(self, response, result): continue crawling_logs = entry.meta.get("crawling_logs", {}) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=ScrapyDeprecationWarning, + message="Call to deprecated function scrapy.utils.request.request_fingerprint()*", + ) + entry_fingerprint = request_fingerprint(entry) crawling_logs.update( { "request_url": entry.url, "request_priority": entry.priority, - "request_fingerprint": request_fingerprint(entry), + "request_fingerprint": entry_fingerprint, } ) diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index d0e26e3..a920f5e 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -9,18 +9,33 @@ from scrapy_spider_metadata import Args from zyte_common_items import Product, ProductNavigation +from zyte_spider_templates.documentation import document_enum from zyte_spider_templates.spiders.base import BaseSpider, BaseSpiderParams +@document_enum class EcommerceCrawlStrategy(str, Enum): full: str = "full" + """Follow most links within the domain of URL in an attempt to discover and + extract as many products as possible.""" + navigation: str = "navigation" + """Follow pagination, subcategories, and product detail pages.""" + pagination_only: str = "pagination_only" + """Follow pagination and product detail pages. SubCategory links are + ignored. Use this when some subCategory links are misidentified by + ML-extraction.""" +@document_enum class ExtractFrom(str, Enum): httpResponseBody: str = "httpResponseBody" + """Use HTTP responses. 
Cost-efficient and fast extraction method, which + works well on many websites.""" + browserHtml: str = "browserHtml" + """Use browser rendering. Often provides the best quality.""" class EcommerceSpiderParams(BaseSpiderParams): @@ -50,6 +65,10 @@ class EcommerceSpiderParams(BaseSpiderParams): ) extract_from: Optional[ExtractFrom] = Field( title="Extraction source", + description=( + "Whether to perform extraction using a browser request " + "(browserHtml) or an HTTP request (httpResponseBody)." + ), default=None, json_schema_extra={ "enumMeta": {