Skip to content

Commit

Permalink
combine full and navigation strategies
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Apr 30, 2024
1 parent 186eba3 commit c3dbcec
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 18 deletions.
82 changes: 74 additions & 8 deletions tests/test_ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def test_parameters():

EcommerceSpider(url="https://example.com")
EcommerceSpider(
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.default
)
EcommerceSpider(url="https://example.com", crawl_strategy="full")
EcommerceSpider(url="https://example.com", crawl_strategy="default")

with pytest.raises(ValidationError):
EcommerceSpider(url="https://example.com", crawl_strategy="unknown")
Expand Down Expand Up @@ -465,18 +465,30 @@ def test_metadata():
"enum": ["httpResponseBody", "browserHtml"],
},
"crawl_strategy": {
"default": "full",
"default": "default",
"description": "Determines how the start URL and follow-up URLs are crawled.",
"enumMeta": {
"default": {
"description": (
"Follow pagination, subcategories, and product detail pages. "
"If starting on a homepage, it would attempt to discover other "
"URLs in the page using heuristics."
),
"title": "Default",
},
"full": {
"description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
"description": (
"(Deprecated. Use Default instead) Follow most links "
"within the domain of URL in an attempt to discover and "
"extract as many products as possible."
),
"title": "Full",
},
"navigation": {
"description": (
"Follow pagination, subcategories, and "
"product detail pages. Pagination Only is a "
"better choice if the target URL does not "
"(Deprecated. Use Default instead) Follow pagination, "
"subcategories, and product detail pages. Pagination "
"Only is a better choice if the target URL does not "
"have subcategories, or if Zyte API is "
"misidentifying some URLs as subcategories."
),
Expand All @@ -490,7 +502,7 @@ def test_metadata():
},
},
"title": "Crawl strategy",
"enum": ["full", "navigation", "pagination_only"],
"enum": ["default", "full", "navigation", "pagination_only"],
"type": "string",
},
},
Expand Down Expand Up @@ -727,3 +739,57 @@ def test_urls_file():
assert start_requests[0].url == "https://a.example"
assert start_requests[1].url == "https://b.example"
assert start_requests[2].url == "https://c.example"


@pytest.mark.parametrize(
    "url,has_full_domain",
    (
        ("https://example.com", (True, True, False, False)),
        ("https://example.com/", (True, True, False, False)),
        ("https://example.com/index.htm", (True, True, False, False)),
        ("https://example.com/index.html", (True, True, False, False)),
        ("https://example.com/index.php", (True, True, False, False)),
        ("https://example.com/home", (True, True, False, False)),
        ("https://example.com/some/category", (False, True, False, False)),
        ("https://example.com/some/category?pid=123", (False, True, False, False)),
    ),
)
def test_get_start_request_default_strategy(url, has_full_domain):
    """The start request carries ``page_params`` with ``full_domain`` only for
    the "full" strategy and for the "default" strategy on homepage-like URLs."""
    strategies = ["default", "full", "navigation", "pagination_only"]
    for strategy, expects_full_domain in zip(strategies, has_full_domain):
        spider = EcommerceSpider.from_crawler(
            get_crawler(), url=url, crawl_strategy=strategy
        )
        request = spider.get_start_request(url)
        assert request.url == url
        assert request.callback == spider.parse_navigation
        expected_meta = {"crawling_logs": {"page_type": "productNavigation"}}
        if expects_full_domain:
            expected_meta["page_params"] = {"full_domain": "example.com"}
        assert request.meta == expected_meta


@pytest.mark.parametrize(
    "crawl_strategy,expected_page_params",
    (
        ("default", {}),
        ("full", {"full_domain": "example.com"}),
        ("navigation", {}),
        ("pagination_only", {}),
    ),
)
def test_page_params_for_heuristics(crawl_strategy, expected_page_params):
    """Only the "full" strategy keeps ``full_domain`` in the page params;
    every other strategy strips it."""
    url = "https://example.com"
    request = scrapy.Request(
        url, meta={"page_params": {"full_domain": "example.com"}}
    )
    response = DummyResponse(url=url, request=request)

    spider = EcommerceSpider.from_crawler(
        get_crawler(), url=url, crawl_strategy=crawl_strategy
    )
    assert spider.page_params_for_heuristics(response) == expected_page_params
20 changes: 19 additions & 1 deletion tests/test_heuristics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from zyte_spider_templates.heuristics import might_be_category
from zyte_spider_templates.heuristics import is_homepage, might_be_category


@pytest.mark.parametrize(
Expand Down Expand Up @@ -50,3 +50,21 @@
)
def test_might_be_category(test_input, expected):
assert might_be_category(test_input) == expected


@pytest.mark.parametrize(
    "url,expected",
    (
        ("https://example.com", True),
        ("https://example.com/", True),
        ("https://example.com/index.htm", True),
        ("https://example.com/index.html", True),
        ("https://example.com/index.php", True),
        ("https://example.com/home", True),
        ("https://example.com/?ref=abc", False),
        ("https://example.com/some/category", False),
        ("https://example.com/some/category?query=2123", False),
    ),
)
def test_is_homepage(url, expected):
    """Homepage detection: index-like paths without a query string match."""
    result = is_homepage(url)
    assert result == expected
18 changes: 17 additions & 1 deletion zyte_spider_templates/heuristics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from urllib.parse import urlparse
from urllib.parse import urlparse, urlsplit

NO_CONTENT_PATHS = (
"/authenticate",
Expand Down Expand Up @@ -56,3 +56,19 @@ def might_be_category(url: str) -> bool:
return False

return True


# Paths that conventionally identify a site's landing page.
INDEX_URL_PATHS = {
    "",
    "/",
    "/index.html",
    "/index.htm",
    "/index.php",
    "/home",
}


# TODO: support localization suffixes? Example: /en, /en-us
def is_homepage(url: str) -> bool:
    """Return True when *url* looks like a site homepage.

    A URL qualifies only if its path is one of the conventional index
    paths AND it carries no query string (e.g. ``?ref=abc`` disqualifies).
    """
    parts = urlsplit(url)
    if parts.query:
        return False
    return parts.path in INDEX_URL_PATHS
53 changes: 45 additions & 8 deletions zyte_spider_templates/spiders/ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from scrapy_spider_metadata import Args
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation

from zyte_spider_templates.heuristics import is_homepage
from zyte_spider_templates.spiders.base import (
ARG_SETTING_PRIORITY,
BaseSpider,
Expand All @@ -23,6 +24,13 @@

@document_enum
class EcommerceCrawlStrategy(str, Enum):
default: str = "default"
"""Follow pagination, subcategories, and product detail pages.
If the starting URL points to a homepage, it would attempt to discover other
URLs in the page using heuristics.
"""

full: str = "full"
"""Follow most links within the domain of URL in an attempt to discover and
extract as many products as possible."""
Expand All @@ -43,20 +51,32 @@ class EcommerceCrawlStrategyParam(BaseModel):
crawl_strategy: EcommerceCrawlStrategy = Field(
title="Crawl strategy",
description="Determines how the start URL and follow-up URLs are crawled.",
default=EcommerceCrawlStrategy.full,
default=EcommerceCrawlStrategy.default,
json_schema_extra={
"enumMeta": {
EcommerceCrawlStrategy.default: {
"description": (
"Follow pagination, subcategories, and product detail pages. "
"If starting on a homepage, it would attempt to discover other "
"URLs in the page using heuristics."
),
"title": "Default",
},
EcommerceCrawlStrategy.full: {
"title": "Full",
"description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
"description": (
"(Deprecated. Use Default instead) Follow most links within the "
"domain of URL in an attempt to discover and extract as many "
"products as possible."
),
},
EcommerceCrawlStrategy.navigation: {
"title": "Navigation",
"description": (
"Follow pagination, subcategories, and product detail "
"pages. Pagination Only is a better choice if the "
"target URL does not have subcategories, or if Zyte "
"API is misidentifying some URLs as subcategories."
"(Deprecated. Use Default instead) Follow pagination, "
"subcategories, and product detail pages. Pagination Only is a "
"better choice if the target URL does not have subcategories, "
"or if Zyte API is misidentifying some URLs as subcategories."
),
},
EcommerceCrawlStrategy.pagination_only: {
Expand Down Expand Up @@ -125,7 +145,10 @@ def get_start_request(self, url):
meta = {
"crawling_logs": {"page_type": "productNavigation"},
}
if self.args.crawl_strategy == EcommerceCrawlStrategy.full:
if self.args.crawl_strategy == EcommerceCrawlStrategy.full or (
self.args.crawl_strategy == EcommerceCrawlStrategy.default
and is_homepage(url)
):
meta["page_params"] = {"full_domain": get_domain(url)}
return Request(
url=url,
Expand All @@ -140,7 +163,7 @@ def start_requests(self) -> Iterable[Request]:
def parse_navigation(
self, response: DummyResponse, navigation: ProductNavigation
) -> Iterable[Request]:
page_params = response.meta.get("page_params")
page_params = self.page_params_for_heuristics(response)

products = navigation.items or []
for request in products:
Expand Down Expand Up @@ -266,3 +289,17 @@ def get_parse_product_request(
)
scrapy_request.meta["allow_offsite"] = True
return scrapy_request

def page_params_for_heuristics(
    self, response: DummyResponse
) -> Optional[Dict[str, Any]]:
    """Return the ``page_params`` dict for *response*, stripping the
    ``full_domain`` key unless the crawl strategy is "full".

    NOTE: when ``full_domain`` is removed, the dict stored in
    ``response.meta`` is mutated in place (``pop``), matching the
    original behavior.
    """
    page_params = response.meta.get("page_params")

    # Heuristic link discovery on non-homepage URLs is reserved for the
    # "full" crawl strategy; drop the flag for every other strategy.
    if self.args.crawl_strategy != EcommerceCrawlStrategy.full:
        if page_params and "full_domain" in page_params:
            page_params.pop("full_domain")

    return page_params

0 comments on commit c3dbcec

Please sign in to comment.