Add an input URL list parameter #38

Merged (16 commits) on Aug 20, 2024
91 changes: 90 additions & 1 deletion tests/test_ecommerce.py
@@ -355,6 +355,16 @@ def test_metadata():
"title": "E-commerce",
"description": "Template for spiders that extract product data from e-commerce websites.",
"param_schema": {
"groups": [
{
"id": "inputs",
"title": "Inputs",
"description": (
"Input data that determines the start URLs of the crawl."
),
"widget": "exclusive",
},
],
"properties": {
"crawl_strategy": {
"default": "full",
@@ -445,9 +455,25 @@ def test_metadata():
"you can copy and paste it from your browser. Example: https://toscrape.com/"
),
"pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
"default": "",
"group": "inputs",
},
"urls": {
"anyOf": [
{"type": "array", "items": {"type": "string"}},
{"type": "null"},
],
"title": "URLs",
"description": (
"Initial URLs for the crawl, separated by new lines. Enter the "
"full URL including http(s), you can copy and paste it from your "
"browser. Example: https://toscrape.com/"
),
"default": None,
"group": "inputs",
"widget": "textarea",
},
},
"required": ["url"],
"title": "EcommerceSpiderParams",
"type": "object",
},
@@ -637,3 +663,66 @@ def test_set_allowed_domains(url, allowed_domain):
kwargs = {"url": url}
spider = EcommerceSpider.from_crawler(crawler, **kwargs)
assert spider.allowed_domains == [allowed_domain]


def test_input_none():
crawler = get_crawler()
with pytest.raises(ValueError):
EcommerceSpider.from_crawler(crawler)


def test_input_multiple():
crawler = get_crawler()
with pytest.raises(ValueError):
EcommerceSpider.from_crawler(
crawler,
url="https://a.example",
urls=["https://b.example"],
)


def test_url_invalid():
crawler = get_crawler()
with pytest.raises(ValueError):
EcommerceSpider.from_crawler(crawler, url="foo")


def test_urls(caplog):
crawler = get_crawler()
url = "https://example.com"

spider = EcommerceSpider.from_crawler(crawler, urls=[url])
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].url == url
assert start_requests[0].callback == spider.parse_navigation

spider = EcommerceSpider.from_crawler(crawler, urls=url)
start_requests = list(spider.start_requests())
assert len(start_requests) == 1
assert start_requests[0].url == url
assert start_requests[0].callback == spider.parse_navigation

caplog.clear()
spider = EcommerceSpider.from_crawler(
crawler,
urls="https://a.example\n \nhttps://b.example\nhttps://c.example\nfoo\n\n",
)
assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text
start_requests = list(spider.start_requests())
assert len(start_requests) == 3
assert all(
request.callback == spider.parse_navigation for request in start_requests
)
assert start_requests[0].url == "https://a.example"
assert start_requests[1].url == "https://b.example"
assert start_requests[2].url == "https://c.example"

caplog.clear()
with pytest.raises(ValueError):
spider = EcommerceSpider.from_crawler(
crawler,
urls="foo\nbar",
)
assert "'foo', from the 'urls' spider argument, is not a valid URL" in caplog.text
assert "'bar', from the 'urls' spider argument, is not a valid URL" in caplog.text
105 changes: 102 additions & 3 deletions zyte_spider_templates/spiders/base.py
@@ -1,9 +1,11 @@
import re
from enum import Enum
from importlib.metadata import version
from typing import Any, Dict, Optional
from logging import getLogger
from typing import Any, Dict, List, Optional, Union

import scrapy
from pydantic import BaseModel, Field
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
from scrapy.crawler import Crawler

from zyte_spider_templates._geolocations import (
@@ -15,6 +17,8 @@
# Higher priority than command-line-defined settings (40).
ARG_SETTING_PRIORITY: int = 50

logger = getLogger(__name__)


@document_enum
class ExtractFrom(str, Enum):
@@ -26,12 +30,48 @@ class ExtractFrom(str, Enum):
"""Use browser rendering. Often provides the best quality."""


_INPUT_FIELDS = ("url", "urls")
_URL_PATTERN = r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$"


class BaseSpiderParams(BaseModel):
model_config = ConfigDict(
json_schema_extra={
"groups": [
{
"id": "inputs",
"title": "Inputs",
"description": (
"Input data that determines the start URLs of the crawl."
),
"widget": "exclusive",
},
],
},
)

url: str = Field(
title="URL",
description="Initial URL for the crawl. Enter the full URL including http(s), "
"you can copy and paste it from your browser. Example: https://toscrape.com/",
pattern=r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$",
pattern=_URL_PATTERN,
default="",
json_schema_extra={
"group": "inputs",
},
)
urls: Optional[List[str]] = Field(
title="URLs",
description=(
"Initial URLs for the crawl, separated by new lines. Enter the "
"full URL including http(s), you can copy and paste it from your "
"browser. Example: https://toscrape.com/"
),
default=None,
json_schema_extra={
"group": "inputs",
"widget": "textarea",
},
)
geolocation: Optional[Geolocation] = Field(
title="Geolocation",
@@ -81,6 +121,65 @@ class BaseSpiderParams(BaseModel):
},
)

@field_validator("urls", mode="before")
@classmethod
def validate_url_list(cls, value: Union[List[str], str]) -> List[str]:
"""Validate a list of URLs.

If a string is received as input, it is split into multiple strings
on new lines.

List items that do not match a URL pattern trigger a warning and are
removed from the list. If all URLs are invalid, validation fails.
"""
if isinstance(value, str):
value = value.split("\n")
if value:

Reviewer: What do we expect the value to be when it is not a str instance?

Author: Any of the actually supported types: None or a list of str.

Reviewer: @Gallaecio, will you add something to make this work? Mind that "URL\nURL" will not work (per your comment in the description), nor will running it like scrapy crawl spider -a urls="https://www.nytimes.com/\nhttps://www.theguardian.com". I've added this workaround https://github.com/zytedata/zyte-spider-templates-private/blob/article/zyte_spider_templates/spiders/article.py#L135, but maybe we can handle it in validate_url_list too.

Author: I think supporting that syntax is a bit dangerous, as \n are valid URL characters. Since it will not be an issue in the UI or from code, I am not sure if it is worth supporting.

Maybe we could support spaces as a separator, but we risk scenarios where users expect a space → %20 conversion to happen.

Reviewer: @Gallaecio I've checked https://www.rfc-editor.org/rfc/rfc3986 and haven't found anything about \n as an allowed character, but I might have missed something.

Author: I meant valid characters, i.e. \ is a valid character and n is a valid character. https://example.com/\n is a valid URL.

Reviewer: Got it, makes sense, thanks!

new_value = []
for v in value:
v = v.strip()
if not v:
continue
if not re.search(_URL_PATTERN, v):
logger.warning(
f"{v!r}, from the 'urls' spider argument, is not a "
f"valid URL and will be ignored."
)
continue
new_value.append(v)
if new_value:
value = new_value
else:
raise ValueError(f"No valid URL found in {value!r}")
return value

@model_validator(mode="after")
def single_input(self):
"""Fields
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.url`

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's base class and we are going to use the same approach in Articles - in this case, might be no need to mention ecommerce.EcommerceSpiderParams explicitly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once we merge article support, we can refactor the docs to cover shared params elsewhere in the docs, and point there. But for the current state of the repository, this is the best thing we can link to.

and
:class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams.urls`
form a mandatory, mutually-exclusive field group: one of them must be
defined, the rest must not be defined."""
input_fields = set(
field for field in _INPUT_FIELDS if getattr(self, field, None)
)
if not input_fields:
input_field_list = ", ".join(_INPUT_FIELDS)
raise ValueError(
f"No input parameter defined. Please, define one of: "
f"{input_field_list}."
)
elif len(input_fields) > 1:
input_field_list = ", ".join(
f"{field} ({getattr(self, field)!r})" for field in input_fields
)
raise ValueError(
f"Expected a single input parameter, got {len(input_fields)}: "
f"{input_field_list}."
)
return self


class BaseSpider(scrapy.Spider):
custom_settings: Dict[str, Any] = {
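As a side note on the review thread above about newline handling, here is a minimal sketch (not part of the diff) of how validate_url_list treats a real newline versus the literal backslash-n that a shell passes with -a urls="...\n...". It assumes BaseSpiderParams is importable as defined in this file.

from zyte_spider_templates.spiders.base import BaseSpiderParams

# A real newline character splits the string into two start URLs.
params = BaseSpiderParams(urls="https://a.example\nhttps://b.example")
print(params.urls)  # ['https://a.example', 'https://b.example']

# A literal backslash followed by "n" is not a newline, so the whole value is
# treated as a single URL, fails the URL pattern, and validation raises.
try:
    BaseSpiderParams(urls=r"https://a.example\nhttps://b.example")
except ValueError as error:  # pydantic's ValidationError is a ValueError
    print(error)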
27 changes: 18 additions & 9 deletions zyte_spider_templates/spiders/ecommerce.py
@@ -1,4 +1,5 @@
from enum import Enum
from logging import getLogger
from typing import Any, Callable, Dict, Iterable, Optional, Union

import scrapy
@@ -17,6 +18,8 @@
)
from zyte_spider_templates.utils import get_domain

logger = getLogger(__name__)


@document_enum
class EcommerceCrawlStrategy(str, Enum):
@@ -87,7 +90,12 @@ class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
@classmethod
def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
spider = super(EcommerceSpider, cls).from_crawler(crawler, *args, **kwargs)
spider.allowed_domains = [get_domain(spider.args.url)]
url = getattr(spider.args, "url", None)
if url:
spider.start_urls = [url]
else:
spider.start_urls = spider.args.urls
spider.allowed_domains = [get_domain(url) for url in spider.start_urls]

if spider.args.extract_from is not None:
spider.settings.set(
@@ -109,14 +117,15 @@ def start_requests(self) -> Iterable[Request]:
if self.args.crawl_strategy == EcommerceCrawlStrategy.full:
page_params = {"full_domain": self.allowed_domains[0]}

yield Request(
url=self.args.url,
callback=self.parse_navigation,
meta={
"page_params": page_params,
"crawling_logs": {"page_type": "productNavigation"},
},
)
for url in self.start_urls:
yield Request(
url=url,
callback=self.parse_navigation,
meta={
"page_params": page_params,
"crawling_logs": {"page_type": "productNavigation"},
},
)

def parse_navigation(
self, response: DummyResponse, navigation: ProductNavigation
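To show how the pieces above fit together, a minimal sketch of running the spider with the new urls argument. It assumes a plain Scrapy crawler is sufficient here (the test module above uses its own get_crawler helper), and the toscrape.com URLs are only examples.

from scrapy.utils.test import get_crawler

from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

crawler = get_crawler()
spider = EcommerceSpider.from_crawler(
    crawler,
    urls="https://books.toscrape.com\nhttps://quotes.toscrape.com",
)
print(spider.start_urls)
# ['https://books.toscrape.com', 'https://quotes.toscrape.com']
print(spider.allowed_domains)
# e.g. ['books.toscrape.com', 'quotes.toscrape.com']

# start_requests() now yields one request per start URL, each using
# parse_navigation as its callback.
print([request.url for request in spider.start_requests()])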