Initial docs (#9)

zytedata · Nov 3, 2023 · 13f45c7 · 13f45c7
1 parent d1a5bbd
commit 13f45c7
Show file tree

Hide file tree

Showing 23 changed files with 637 additions and 29 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -5,3 +5,5 @@ tag = True
 tag_name = {new_version}
 
 [bumpversion:file:setup.py]
+
+[bumpversion:file:docs/conf.py]
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -45,7 +45,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ["3.11"]
-        tox-job: ["mypy", "linters", "twine"]
+        tox-job: ["mypy", "linters", "twine", "docs"]
 
     steps:
     - uses: actions/checkout@v4

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,3 +11,9 @@ repos:
     rev: 6.1.0
     hooks:
     - id: flake8
+- repo: https://github.com/adamchainz/blacken-docs
+  rev: 1.16.0
+  hooks:
+  - id: blacken-docs
+    additional_dependencies:
+    - black==23.10.1
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -0,0 +1,12 @@
+version: 2
+formats: all
+sphinx:
+  configuration: docs/conf.py
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"  # Keep in sync with .github/workflows/test.yml
+python:
+  install:
+    - requirements: docs/requirements.txt
+    - path: .
diff --git a/README.rst b/README.rst
@@ -19,6 +19,8 @@ zyte-spider-templates
    :alt: Coverage report
 
 
+.. description starts
+
 Spider templates for automatic crawlers.
 
 This library contains Scrapy_ spider templates. They can be used out of the box
@@ -30,24 +32,7 @@ starting point for your own projects.
 .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html
 .. _sample Scrapy project: https://github.com/zytedata/zyte-spider-templates-project
 
+.. description ends
 
-Requirements
-============
-
-* Python 3.8+
-* Scrapy 2.11+
-
-
-Installation
-============
-
-.. code-block::
-
-    pip install zyte-spider-templates
-
-
-Spiders
-=======
-
-* ``EcommerceSpider``: a spider that can scrape products from any e-commerce
-  website.
+* Documentation: https://zyte-spider-templates.readthedocs.io/en/latest/
+* License: BSD 3-clause
diff --git a/docs/Makefile b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py
@@ -0,0 +1,25 @@
+def setup(app):
+    # https://stackoverflow.com/a/13663325
+    #
+    # These are the same cross-reference types that Scrapy’s own docs register:
+    # https://github.com/scrapy/scrapy/blob/dba37674e6eaa6c2030c8eb35ebf8127cd488062/docs/_ext/scrapydocs.py#L90C16-L110C6
+    app.add_crossref_type(
+        directivename="setting",
+        rolename="setting",
+        indextemplate="pair: %s; setting",
+    )
+    app.add_crossref_type(
+        directivename="signal",
+        rolename="signal",
+        indextemplate="pair: %s; signal",
+    )
+    app.add_crossref_type(
+        directivename="command",
+        rolename="command",
+        indextemplate="pair: %s; command",
+    )
+    app.add_crossref_type(
+        directivename="reqmeta",
+        rolename="reqmeta",
+        indextemplate="pair: %s; reqmeta",
+    )
diff --git a/docs/changes.rst b/docs/changes.rst
@@ -0,0 +1 @@
+.. include:: ../CHANGES.rst
diff --git a/docs/conf.py b/docs/conf.py
@@ -0,0 +1,47 @@
+import sys
+from pathlib import Path
+
+project = "zyte-spider-templates"
+copyright = "2023, Zyte Group Ltd"
+author = "Zyte Group Ltd"
+release = "0.2.0"
+
+sys.path.insert(0, str(Path(__file__).parent.absolute()))  # _ext
+extensions = [
+    "_ext",
+    "enum_tools.autoenum",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.viewcode",
+    "sphinxcontrib.autodoc_pydantic",
+]
+
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+html_theme = "sphinx_rtd_theme"
+
+intersphinx_mapping = {
+    "python": (
+        "https://docs.python.org/3",
+        None,
+    ),
+    "scrapy": (
+        "https://docs.scrapy.org/en/latest",
+        None,
+    ),
+    "scrapy-poet": (
+        "https://scrapy-poet.readthedocs.io/en/stable",
+        None,
+    ),
+    "web-poet": (
+        "https://web-poet.readthedocs.io/en/stable",
+        None,
+    ),
+    "zyte-common-items": (
+        "https://zyte-common-items.readthedocs.io/en/latest",
+        None,
+    ),
+}
+
+autodoc_pydantic_model_show_field_summary = False
+autodoc_pydantic_model_show_json = False
diff --git a/docs/customization/index.rst b/docs/customization/index.rst
@@ -0,0 +1,13 @@
+.. _customization:
+
+=============
+Customization
+=============
+
+:ref:`Built-in spider templates <spider-templates>` can be highly customized:
+
+-   :ref:`Subclass spider templates <custom-spiders>` to customize metadata,
+    parameters, and crawling logic.
+
+-   :ref:`Implement page objects <custom-page-objects>` to override parsing
+    logic for all or some websites, both for navigation and item detail data.
diff --git a/docs/customization/page-objects.rst b/docs/customization/page-objects.rst
@@ -0,0 +1,143 @@
+.. _custom-page-objects:
+
+========================
+Customizing page objects
+========================
+
+All parsing is implemented using :ref:`web-poet page objects <page-objects>`
+that use `Zyte API automatic extraction`_ to extract :ref:`standard items
+<item-api>`, both for navigation and for item details.
+
+.. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html
+
+You can implement your own page object classes to override how extraction works
+for any given combination of URL and item type.
+
+.. tip:: Make sure the import path of your page objects module is in the
+    :ref:`SCRAPY_POET_DISCOVER <scrapy-poet:settings>` setting, otherwise your
+    page objects might be ignored.
+
+.. _configured scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project
+
+.. _override-parsing:
+
+Overriding parsing
+==================
+
+To change or fix how a given field is extracted, overriding the value from
+`Zyte API automatic extraction`_, create a page object class, configured to run
+on some given URLs (:func:`web_poet.handle_urls`), that defines the logic to
+extract that field. For example:
+
+.. code-block:: python
+    :caption: page_objects/books_toscrape_com.py
+
+    import attrs
+    from number_parser import parse_number
+    from web_poet import HttpResponse, field, handle_urls
+    from zyte_common_items import AggregateRating, AutoProductPage
+
+
+    @handle_urls("books.toscrape.com")
+    @attrs.define
+    class BooksToScrapeComProductPage(AutoProductPage):
+        response: HttpResponse
+
+        @field
+        async def aggregateRating(self):
+            element_class = self.response.css(".star-rating::attr(class)").get()
+            if not element_class:
+                return None
+            rating_str = element_class.split(" ")[-1]
+            rating = parse_number(rating_str)
+            if not rating:
+                return None
+            return AggregateRating(ratingValue=rating, bestRating=5)
+
+``AutoProductPage`` and the other ``Auto``-prefixed page objects from
+`zyte-common-items`_ define every field of their standard item so that it
+returns the value from `Zyte API automatic extraction`_; you only need to
+define the fields that you want to override.
+
+.. _zyte-common-items: https://zyte-common-items.readthedocs.io/en/latest/
+
+The page object above is decorated with ``@attrs.define`` so that it can
+declare a dependency on :class:`~web_poet.page_inputs.http.HttpResponse` and
+use that to implement custom parsing logic. You could alternatively use
+:class:`~web_poet.page_inputs.browser.BrowserHtml` if needed.
+
+
+.. _add-field:
+
+Parsing a new field
+===================
+
+To extract a new field for one or more websites:
+
+#.  Declare a new item type that extends a :ref:`standard item <item-api>` with
+    your new field. For example:
+
+    .. code-block:: python
+        :caption: items.py
+
+        from typing import Optional
+
+        import attrs
+        from zyte_common_items import Product
+
+
+        @attrs.define
+        class CustomProduct(Product):
+            stock: Optional[int]
+
+#.  Create a page object class, configured to run for your new item type
+    (:class:`web_poet.pages.Returns`) on some given URLs
+    (:func:`web_poet.handle_urls`), that defines the logic to extract your new
+    field. For example:
+
+    .. code-block:: python
+        :caption: page_objects/books_toscrape_com.py
+
+        import re
+
+        from web_poet import Returns, field, handle_urls
+        from zyte_common_items import AutoProductPage
+
+        from ..items import CustomProduct
+
+
+        @handle_urls("books.toscrape.com")
+        class BookPage(AutoProductPage, Returns[CustomProduct]):
+            @field
+            async def stock(self):
+                for entry in await self.additionalProperties:
+                    if entry.name == "availability":
+                        match = re.search(r"\d([.,\s]*\d+)*(?=\s+available\b)", entry.value)
+                        if not match:
+                            return None
+                        stock_str = re.sub(r"[.,\s]", "", match[0])
+                        return int(stock_str)
+                return None
+
+#.  Create a spider template subclass that requests your new item type instead
+    of the standard one. For example:
+
+    .. code-block:: python
+        :caption: spiders/books_toscrape_com.py
+
+        from scrapy_poet import DummyResponse
+        from zyte_spider_templates.spiders.ecommerce import EcommerceSpider
+
+        from ..items import CustomProduct
+
+
+        class BooksToScrapeComSpider(EcommerceSpider):
+            name = "books_toscrape_com"
+            metadata = {
+                **EcommerceSpider.metadata,
+                "title": "Books to Scrape",
+                "description": "Spider template for books.toscrape.com",
+            }
+
+            def parse_product(self, response: DummyResponse, product: CustomProduct):
+                yield from super().parse_product(response, product)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,3 +5,5 @@ tag = True
		tag_name = {new_version}

		[bumpversion:file:setup.py]

		[bumpversion:file:docs/conf.py]