-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
23 changed files
with
637 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,3 +5,5 @@ tag = True | |
tag_name = {new_version} | ||
|
||
[bumpversion:file:setup.py] | ||
|
||
[bumpversion:file:docs/conf.py] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
version: 2 | ||
formats: all | ||
sphinx: | ||
configuration: docs/conf.py | ||
build: | ||
os: ubuntu-22.04 | ||
tools: | ||
python: "3.11" # Keep in sync with .github/workflows/test.yml | ||
python: | ||
install: | ||
- requirements: docs/requirements.txt | ||
- path: . |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Minimal makefile for Sphinx documentation | ||
# | ||
|
||
# You can set these variables from the command line, and also | ||
# from the environment for the first two. | ||
SPHINXOPTS ?= | ||
SPHINXBUILD ?= sphinx-build | ||
SOURCEDIR = . | ||
BUILDDIR = _build | ||
|
||
# Put it first so that "make" without argument is like "make help". | ||
help: | ||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) | ||
|
||
.PHONY: help Makefile | ||
|
||
# Catch-all target: route all unknown targets to Sphinx using the new | ||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). | ||
%: Makefile | ||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
def setup(app): | ||
# https://stackoverflow.com/a/13663325 | ||
# | ||
# Scrapy’s | ||
# https://github.com/scrapy/scrapy/blob/dba37674e6eaa6c2030c8eb35ebf8127cd488062/docs/_ext/scrapydocs.py#L90C16-L110C6 | ||
app.add_crossref_type( | ||
directivename="setting", | ||
rolename="setting", | ||
indextemplate="pair: %s; setting", | ||
) | ||
app.add_crossref_type( | ||
directivename="signal", | ||
rolename="signal", | ||
indextemplate="pair: %s; signal", | ||
) | ||
app.add_crossref_type( | ||
directivename="command", | ||
rolename="command", | ||
indextemplate="pair: %s; command", | ||
) | ||
app.add_crossref_type( | ||
directivename="reqmeta", | ||
rolename="reqmeta", | ||
indextemplate="pair: %s; reqmeta", | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.. include:: ../CHANGES.rst |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# Sphinx configuration for the zyte-spider-templates documentation.

import sys
from pathlib import Path

# -- Project information ------------------------------------------------------

project = "zyte-spider-templates"
copyright = "2023, Zyte Group Ltd"
author = "Zyte Group Ltd"
# Kept as a plain literal on its own line: the bumpversion configuration
# lists docs/conf.py as a file to rewrite on version bumps.
release = "0.2.0"

# -- General configuration ----------------------------------------------------

# Put the directory holding this file first on sys.path so that the local
# "_ext" extension listed below can be imported.
sys.path.insert(0, str(Path(__file__).parent.absolute()))  # _ext

extensions = [
    "_ext",
    "enum_tools.autoenum",
    "sphinx.ext.autodoc",
    "sphinx.ext.intersphinx",
    "sphinx.ext.viewcode",
    "sphinxcontrib.autodoc_pydantic",
]

exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# -- HTML output --------------------------------------------------------------

html_theme = "sphinx_rtd_theme"

# -- Intersphinx --------------------------------------------------------------

# Each project maps to (base URL, None); None is the inventory slot, left
# unset for every entry.
intersphinx_mapping = {
    name: (base_url, None)
    for name, base_url in (
        ("python", "https://docs.python.org/3"),
        ("scrapy", "https://docs.scrapy.org/en/latest"),
        ("scrapy-poet", "https://scrapy-poet.readthedocs.io/en/stable"),
        ("web-poet", "https://web-poet.readthedocs.io/en/stable"),
        ("zyte-common-items", "https://zyte-common-items.readthedocs.io/en/latest"),
    )
}

# -- autodoc_pydantic ---------------------------------------------------------

autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_model_show_json = False
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
.. _customization: | ||
|
||
============= | ||
Customization | ||
============= | ||
|
||
:ref:`Built-in spider templates <spider-templates>` can be highly customized: | ||
|
||
- :ref:`Subclass spider templates <custom-spiders>` to customize metadata, | ||
parameters, and crawling logic. | ||
|
||
- :ref:`Implement page objects <custom-page-objects>` to override parsing | ||
logic for all or some websites, both for navigation and item detail data. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
.. _custom-page-objects: | ||
|
||
======================== | ||
Customizing page objects | ||
======================== | ||
|
||
All parsing is implemented using :ref:`web-poet page objects <page-objects>` | ||
that use `Zyte API automatic extraction`_ to extract :ref:`standard items | ||
<item-api>`, both for navigation and for item details. | ||
|
||
.. _Zyte API automatic extraction: https://docs.zyte.com/zyte-api/usage/extract.html | ||
|
||
You can implement your own page object classes to override how extraction works | ||
for any given combination of URL and item type. | ||
|
||
.. tip:: Make sure the import path of your page objects module is in the | ||
:ref:`SCRAPY_POET_DISCOVER <scrapy-poet:settings>` setting, otherwise your | ||
page objects might be ignored. | ||
|
||
.. _configured scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/intro/install.html#configuring-the-project | ||
|
||
.. _override-parsing: | ||
|
||
Overriding parsing | ||
================== | ||
|
||
To change or fix how a given field is extracted, overriding the value from | ||
`Zyte API automatic extraction`_, create a page object class, configured to run | ||
on some given URLs (:func:`web_poet.handle_urls`), that defines the logic to | ||
extract that field. For example: | ||
|
||
.. code-block:: python | ||
:caption: page_objects/books_toscrape_com.py | ||
import attrs | ||
from number_parser import parse_number | ||
from web_poet import HttpResponse, field, handle_urls | ||
from zyte_common_items import AggregateRating, AutoProductPage | ||
@handle_urls("books.toscrape.com") | ||
@attrs.define | ||
class BooksToScrapeComProductPage(AutoProductPage): | ||
response: HttpResponse | ||
@field | ||
async def aggregateRating(self): | ||
element_class = self.response.css(".star-rating::attr(class)").get() | ||
if not element_class: | ||
return None | ||
rating_str = element_class.split(" ")[-1] | ||
rating = parse_number(rating_str) | ||
if not rating: | ||
return None | ||
return AggregateRating(ratingValue=rating, bestRating=5) | ||
``AutoProductPage`` and other page objects from `zyte-common-items`_ | ||
prefixed with ``Auto`` define every field of their standard item so that it | ||
returns the value from `Zyte API automatic extraction`_. You therefore only | ||
need to define the fields whose extraction you want to change. | ||
|
||
.. _zyte-common-items: https://zyte-common-items.readthedocs.io/en/latest/ | ||
|
||
The page object above is decorated with ``@attrs.define`` so that it can | ||
declare a dependency on :class:`~web_poet.page_inputs.http.HttpResponse` and | ||
use that to implement custom parsing logic. You could alternatively use | ||
:class:`~web_poet.page_inputs.browser.BrowserHtml` if needed. | ||
|
||
|
||
.. _add-field: | ||
|
||
Parsing a new field | ||
=================== | ||
|
||
To extract a new field for one or more websites: | ||
|
||
#. Declare a new item type that extends a :ref:`standard item <item-api>` with | ||
your new field. For example: | ||
|
||
.. code-block:: python | ||
:caption: items.py | ||
from typing import Optional | ||
import attrs | ||
from zyte_common_items import Product | ||
@attrs.define | ||
class CustomProduct(Product): | ||
stock: Optional[int] | ||
#. Create a page object class, configured to run for your new item type | ||
(:class:`web_poet.pages.Returns`) on some given URLs | ||
(:func:`web_poet.handle_urls`), that defines the logic to extract your new | ||
field. For example: | ||
|
||
.. code-block:: python | ||
:caption: page_objects/books_toscrape_com.py | ||
import re | ||
from web_poet import Returns, field, handle_urls | ||
from zyte_common_items import AutoProductPage | ||
from ..items import CustomProduct | ||
@handle_urls("books.toscrape.com") | ||
class BookPage(AutoProductPage, Returns[CustomProduct]): | ||
@field | ||
async def stock(self): | ||
for entry in await self.additionalProperties: | ||
if entry.name == "availability": | ||
match = re.search(r"\d([.,\s]*\d+)*(?=\s+available\b)", entry.value) | ||
if not match: | ||
return None | ||
stock_str = re.sub(r"[.,\s]", "", match[0]) | ||
return int(stock_str) | ||
return None | ||
#. Create a spider template subclass that requests your new item type instead | ||
of the standard one. For example: | ||
|
||
.. code-block:: python | ||
:caption: spiders/books_toscrape_com.py | ||
from scrapy_poet import DummyResponse | ||
from ..items import CustomProduct | ||
from zyte_spider_templates.spiders.ecommerce import EcommerceSpider | ||
class BooksToScrapeComSpider(EcommerceSpider): | ||
name = "books_toscrape_com" | ||
metadata = { | ||
**EcommerceSpider.metadata, | ||
"title": "Books to Scrape", | ||
"description": "Spider template for books.toscrape.com", | ||
} | ||
def parse_product(self, response: DummyResponse, product: CustomProduct): | ||
yield from super().parse_product(response, product) |
Oops, something went wrong.