Add an input URL list parameter #38

Merged (16 commits) on Aug 20, 2024
91 changes: 90 additions & 1 deletion tests/test_ecommerce.py
@@ -355,6 +355,16 @@ def test_metadata():
"title": "E-commerce",
"description": "Template for spiders that extract product data from e-commerce websites.",
"param_schema": {
"groups": [
{
"id": "inputs",
"title": "Inputs",
"description": (
"Input data that determines the start URLs of the crawl."
),
"widget": "exclusive",
},
],
"properties": {
"crawl_strategy": {
"default": "full",
@@ -445,9 +455,25 @@ def test_metadata():
"you can copy and paste it from your browser. Example: https://toscrape.com/"
),
"pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
"default": "",
"group": "inputs",
},
"urls": {
"anyOf": [
{"type": "array", "items": {"type": "string"}},
{"type": "null"},
],
"title": "URLs",
"description": (
"Initial URLs for the crawl, separated by new lines. Enter the "
"full URL including http(s), you can copy and paste it from your "
"browser. Example: https://toscrape.com/"
),
"default": None,
"group": "inputs",
"widget": "textarea",
},
},
"required": ["url"],
"title": "EcommerceSpiderParams",
"type": "object",
},
@@ -637,3 +663,66 @@ def test_set_allowed_domains(url, allowed_domain):
kwargs = {"url": url}
spider = EcommerceSpider.from_crawler(crawler, **kwargs)
assert spider.allowed_domains == [allowed_domain]


def test_input_none():
crawler = get_crawler()
with pytest.raises(ValueError):
EcommerceSpider.from_crawler(crawler)


def test_input_multiple():
crawler = get_crawler()
with pytest.raises(ValueError):
EcommerceSpider.from_crawler(
crawler,
url="https://a.example",
urls=["https://b.example"],
)


def test_url_invalid():
crawler = get_crawler()
with pytest.raises(ValueError):
EcommerceSpider.from_crawler(crawler, url="foo")


def test_urls(caplog):
crawler = get_crawler()
url = "https://example.com"

spider = EcommerceSpider.from_crawler(crawler, urls=[url])
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].url == url
assert start_requests[0].callback == spider.parse_navigation

spider = EcommerceSpider.from_crawler(crawler, urls=url)
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].url == url
assert start_requests[0].callback == spider.parse_navigation

caplog.clear()
spider = EcommerceSpider.from_crawler(
crawler,
urls="https://a.example\n \nhttps://b.example\nhttps://c.example\nfoo\n\n",
)
assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text
start_requests = list(spider.start_requests())
assert len(start_requests) == 3
assert all(
request.callback == spider.parse_navigation for request in start_requests
)
assert start_requests[0].url == "https://a.example"
assert start_requests[1].url == "https://b.example"
assert start_requests[2].url == "https://c.example"

caplog.clear()
with pytest.raises(ValueError):
spider = EcommerceSpider.from_crawler(
crawler,
urls="foo\nbar",
)
assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text
assert "'bar', from the 'urls' spider argument, is not a valid URL" in caplog.text
105 changes: 102 additions & 3 deletions zyte_spider_templates/spiders/base.py
@@ -1,9 +1,11 @@
import re
from enum import Enum
from importlib.metadata import version
from typing import Any, Dict, Optional
from logging import getLogger
from typing import Any, Dict, List, Optional, Union

import scrapy
from pydantic import BaseModel, Field
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from scrapy.crawler import Crawler

from zyte_spider_templates._geolocations import (
@@ -15,6 +17,8 @@
# Higher priority than command-line-defined settings (40).
ARG_SETTING_PRIORITY: int = 50

logger = getLogger(__name__)


@document_enum
class ExtractFrom(str, Enum):
@@ -26,12 +30,48 @@ class ExtractFrom(str, Enum):
"""Use browser rendering. Often provides the best quality."""


_INPUT_FIELDS = ("url", "urls")
_URL_PATTERN = r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$"


class BaseSpiderParams(BaseModel):
model_config = ConfigDict(
json_schema_extra={
"groups": [
{
"id": "inputs",
"title": "Inputs",
"description": (
"Input data that determines the start URLs of the crawl."
),
"widget": "exclusive",
},
],
},
)

url: str = Field(
title="URL",
description="Initial URL for the crawl. Enter the full URL including http(s), "
"you can copy and paste it from your browser. Example: https://toscrape.com/",
pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
pattern=_URL_PATTERN,
default="",
json_schema_extra={
"group": "inputs",
},
)
urls: Optional[List[str]] = Field(
title="URLs",
description=(
"Initial URLs for the crawl, separated by new lines. Enter the "
"full URL including http(s), you can copy and paste it from your "
"browser. Example: https://toscrape.com/"
),
default=None,
json_schema_extra={
"group": "inputs",
"widget": "textarea",
},
)
geolocation: Optional[Geolocation] = Field(
title="Geolocation",
@@ -81,6 +121,65 @@ class BaseSpiderParams(BaseModel):
},
)

@field_validator("urls", mode="before")
@classmethod
def validate_url_list(cls, value: Union[List[str], str]) -> List[str]:
"""Validate a list of URLs.

If a string is received as input, it is split into multiple strings
on new lines.

List items that do not match a URL pattern trigger a warning and are
removed from the list. If all URLs are invalid, validation fails.
"""
if isinstance(value, str):
value = value.split("\n")
if value:

Reviewer: What do we expect the value to be when it is not a str instance?

Author: Any of the actually supported types: None or a list of str.

Reviewer: @Gallaecio, will you add something to make this work? Mind that "URL\nURL" will not work (per your comment in the description), nor will running it like scrapy crawl spider -a urls="https://www.nytimes.com/\nhttps://www.theguardian.com". I've added this workaround https://github.com/zytedata/zyte-spider-templates-private/blob/article/zyte_spider_templates/spiders/article.py#L135, but maybe we can handle it in validate_url_list too.

Author: I think supporting that syntax is a bit dangerous, as \n are valid URL characters. Since it will not be an issue in the UI or from code, I am not sure if it is worth supporting.

Maybe we could support spaces as a separator, but we risk scenarios where users expect a space → %20 conversion to happen.

Reviewer: @Gallaecio I've checked https://www.rfc-editor.org/rfc/rfc3986 and haven't found anything about \n as an allowed character, but I might have missed something.

Author: I meant valid characters, i.e. \ is a valid character and n is a valid character. https://example.com/\n is a valid URL.

Reviewer: Got it, makes sense, thanks!

new_value = []
for v in value:
v = v.strip()
if not v:
continue
if not re.search(_URL_PATTERN, v):
logger.warning(
f"{v!r}, from the 'urls' spider argument, is not a "
f"valid URL and will be ignored."
)
continue
new_value.append(v)
if new_value:
value = new_value
else:
raise ValueError(f"No valid URL found in {value!r}")
return value

@model_validator(mode="after")
def single_input(self):
"""Fields
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.url`

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's base class and we are going to use the same approach in Articles - in this case, might be no need to mention ecommerce.EcommerceSpiderParams explicitly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once we merge article support, we can refactor the docs to cover shared params elsewhere in the docs, and point there. But for the current state of the repository, this is the best thing we can link to.

and
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls`
form a mandatory, mutually-exclusive field group: one of them must be
defined, the rest must not be defined."""
input_fields = set(
field for field in _INPUT_FIELDS if getattr(self, field, None)
)
if not input_fields:
input_field_list = ", ".join(_INPUT_FIELDS)
raise ValueError(
f"No input parameter defined. Please, define one of: "
f"{input_field_list}."
)
elif len(input_fields) > 1:
input_field_list = ", ".join(
f"{field} ({getattr(self, field)!r})" for field in input_fields
)
raise ValueError(
f"Expected a single input parameter, got {len(input_fields)}: "
f"{input_field_list}."
)
return self


class BaseSpider(scrapy.Spider):
custom_settings: Dict[str, Any] = {
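As a side note on the review thread above about newline handling, here is a minimal sketch (not part of the diff) of how validate_url_list treats a real newline versus the literal backslash-n that a shell passes with -a urls="...\n...". It assumes BaseSpiderParams is importable as defined in this file.

from zyte_spider_templates.spiders.base import BaseSpiderParams

# A real newline character splits the string into two start URLs.
params = BaseSpiderParams(urls="https://a.example\nhttps://b.example")
print(params.urls)  # ['https://a.example', 'https://b.example']

# A literal backslash followed by "n" is not a newline, so the whole value is
# treated as a single URL, fails the URL pattern, and validation raises.
try:
    BaseSpiderParams(urls=r"https://a.example\nhttps://b.example")
except ValueError as error:  # pydantic's ValidationError is a ValueError
    print(error)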
27 changes: 18 additions & 9 deletions zyte_spider_templates/spiders/ecommerce.py
@@ -1,4 +1,5 @@
from enum import Enum
from logging import getLogger
from typing import Any, Callable, Dict, Iterable, Optional, Union

import scrapy
@@ -17,6 +18,8 @@
)
from zyte_spider_templates.utils import get_domain

logger = getLogger(__name__)


@document_enum
class EcommerceCrawlStrategy(str, Enum):
@@ -87,7 +90,12 @@ class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
@classmethod
def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
spider = super(EcommerceSpider, cls).from_crawler(crawler, *args, **kwargs)
spider.allowed_domains = [get_domain(spider.args.url)]
url = getattr(spider.args, "url", None)
if url:
spider.start_urls = [url]
else:
spider.start_urls = spider.args.urls
spider.allowed_domains = [get_domain(url) for url in spider.start_urls]

if spider.args.extract_from is not None:
spider.settings.set(
@@ -109,14 +117,15 @@ def start_requests(self) -> Iterable[Request]:
if self.args.crawl_strategy == EcommerceCrawlStrategy.full:
page_params = {"full_domain": self.allowed_domains[0]}

yield Request(
url=self.args.url,
callback=self.parse_navigation,
meta={
"page_params": page_params,
"crawling_logs": {"page_type": "productNavigation"},
},
)
for url in self.start_urls:
yield Request(
url=url,
callback=self.parse_navigation,
meta={
"page_params": page_params,
"crawling_logs": {"page_type": "productNavigation"},
},
)

def parse_navigation(
self, response: DummyResponse, navigation: ProductNavigation
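To show how the pieces above fit together, a minimal sketch of running the spider with the new urls argument. It assumes a plain Scrapy crawler is sufficient here (the test module above uses its own get_crawler helper), and the toscrape.com URLs are only examples.

from scrapy.utils.test import get_crawler

from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

crawler = get_crawler()
spider = EcommerceSpider.from_crawler(
    crawler,
    urls="https://books.toscrape.com\nhttps://quotes.toscrape.com",
)
print(spider.start_urls)
# ['https://books.toscrape.com', 'https://quotes.toscrape.com']
print(spider.allowed_domains)
# e.g. ['books.toscrape.com', 'quotes.toscrape.com']

# start_requests() now yields one request per start URL, each using
# parse_navigation as its callback.
print([request.url for request in spider.start_requests()])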