diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 633aa71..52d222f 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -32,9 +32,9 @@ def test_parameters(): EcommerceSpider(url="https://example.com") EcommerceSpider( - url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full + url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.default ) - EcommerceSpider(url="https://example.com", crawl_strategy="full") + EcommerceSpider(url="https://example.com", crawl_strategy="default") with pytest.raises(ValidationError): EcommerceSpider(url="https://example.com", crawl_strategy="unknown") @@ -465,18 +465,30 @@ def test_metadata(): "enum": ["httpResponseBody", "browserHtml"], }, "crawl_strategy": { - "default": "full", + "default": "default", "description": "Determines how the start URL and follow-up URLs are crawled.", "enumMeta": { + "default": { + "description": ( + "Follow pagination, subcategories, and product detail pages. " + "If starting on a homepage, it would attempt to discover other " + "URLs in the page using heuristics." + ), + "title": "Default", + }, "full": { - "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.", + "description": ( + "(Deprecated. Use Default instead) Follow most links " + "within the domain of URL in an attempt to discover and " + "extract as many products as possible." + ), "title": "Full", }, "navigation": { "description": ( - "Follow pagination, subcategories, and " - "product detail pages. Pagination Only is a " - "better choice if the target URL does not " + "(Deprecated. Use Default instead) Follow pagination, " + "subcategories, and product detail pages. Pagination " + "Only is a better choice if the target URL does not " "have subcategories, or if Zyte API is " "misidentifying some URLs as subcategories." ), @@ -490,7 +502,7 @@ def test_metadata(): }, }, "title": "Crawl strategy", - "enum": ["full", "navigation", "pagination_only"], + "enum": ["default", "full", "navigation", "pagination_only"], "type": "string", }, }, @@ -727,3 +739,57 @@ def test_urls_file(): assert start_requests[0].url == "https://a.example" assert start_requests[1].url == "https://b.example" assert start_requests[2].url == "https://c.example" + + +@pytest.mark.parametrize( + "url,has_full_domain", + ( + ("https://example.com", (True, True, False, False)), + ("https://example.com/", (True, True, False, False)), + ("https://example.com/index.htm", (True, True, False, False)), + ("https://example.com/index.html", (True, True, False, False)), + ("https://example.com/index.php", (True, True, False, False)), + ("https://example.com/home", (True, True, False, False)), + ("https://example.com/some/category", (False, True, False, False)), + ("https://example.com/some/category?pid=123", (False, True, False, False)), + ), +) +def test_get_start_request_default_strategy(url, has_full_domain): + def assert_meta(has_page_params): + meta = {"crawling_logs": {"page_type": "productNavigation"}} + if has_page_params: + meta["page_params"] = {"full_domain": "example.com"} + assert result.meta == meta + + for i, crawl_strategy in enumerate( + ["default", "full", "navigation", "pagination_only"] + ): + spider = EcommerceSpider.from_crawler( + get_crawler(), url=url, crawl_strategy=crawl_strategy + ) + result = spider.get_start_request(url) + assert result.url == url + assert result.callback == spider.parse_navigation + assert_meta(has_full_domain[i]) + + +@pytest.mark.parametrize( + "crawl_strategy,expected_page_params", + ( + ("default", {}), + ("full", {"full_domain": "example.com"}), + ("navigation", {}), + ("pagination_only", {}), + ), +) +def test_page_params_for_heuristics(crawl_strategy, expected_page_params): + url = "https://example.com" + meta = {"page_params": {"full_domain": "example.com"}} + request = scrapy.Request(url, meta=meta) + response = DummyResponse(url=url, request=request) + + spider = EcommerceSpider.from_crawler( + get_crawler(), url=url, crawl_strategy=crawl_strategy + ) + page_params = spider.page_params_for_heuristics(response) + assert page_params == expected_page_params diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index fda92bd..9622694 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -1,6 +1,6 @@ import pytest -from zyte_spider_templates.heuristics import might_be_category +from zyte_spider_templates.heuristics import is_homepage, might_be_category @pytest.mark.parametrize( @@ -50,3 +50,21 @@ ) def test_might_be_category(test_input, expected): assert might_be_category(test_input) == expected + + +@pytest.mark.parametrize( + "url,expected", + ( + ("https://example.com", True), + ("https://example.com/", True), + ("https://example.com/index.htm", True), + ("https://example.com/index.html", True), + ("https://example.com/index.php", True), + ("https://example.com/home", True), + ("https://example.com/?ref=abc", False), + ("https://example.com/some/category", False), + ("https://example.com/some/category?query=2123", False), + ), +) +def test_is_homepage(url, expected): + assert is_homepage(url) == expected diff --git a/zyte_spider_templates/heuristics.py b/zyte_spider_templates/heuristics.py index 432d4ea..3afa4b1 100644 --- a/zyte_spider_templates/heuristics.py +++ b/zyte_spider_templates/heuristics.py @@ -1,5 +1,5 @@ import re -from urllib.parse import urlparse +from urllib.parse import urlparse, urlsplit NO_CONTENT_PATHS = ( "/authenticate", @@ -56,3 +56,19 @@ def might_be_category(url: str) -> bool: return False return True + + +INDEX_URL_PATHS = { + "", + "/", + "/index.html", + "/index.htm", + "/index.php", + "/home", +} + + +# TODO: support localization suffixes? Example: /en, /en-us +def is_homepage(url: str) -> bool: + url_split = urlsplit(url) + return url_split.path in INDEX_URL_PATHS and not url_split.query diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index 00a6173..7848f90 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -10,6 +10,7 @@ from scrapy_spider_metadata import Args from zyte_common_items import ProbabilityRequest, Product, ProductNavigation +from zyte_spider_templates.heuristics import is_homepage from zyte_spider_templates.spiders.base import ( ARG_SETTING_PRIORITY, BaseSpider, @@ -23,6 +24,13 @@ @document_enum class EcommerceCrawlStrategy(str, Enum): + default: str = "default" + """Follow pagination, subcategories, and product detail pages. + + If the starting URL points to a homepage, it would attempt to discover other + URLs in the page using heuristics. + """ + full: str = "full" """Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.""" @@ -43,20 +51,32 @@ class EcommerceCrawlStrategyParam(BaseModel): crawl_strategy: EcommerceCrawlStrategy = Field( title="Crawl strategy", description="Determines how the start URL and follow-up URLs are crawled.", - default=EcommerceCrawlStrategy.full, + default=EcommerceCrawlStrategy.default, json_schema_extra={ "enumMeta": { + EcommerceCrawlStrategy.default: { + "description": ( + "Follow pagination, subcategories, and product detail pages. " + "If starting on a homepage, it would attempt to discover other " + "URLs in the page using heuristics." + ), + "title": "Default", + }, EcommerceCrawlStrategy.full: { "title": "Full", - "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.", + "description": ( + "(Deprecated. Use Default instead) Follow most links within the " + "domain of URL in an attempt to discover and extract as many " + "products as possible." + ), }, EcommerceCrawlStrategy.navigation: { "title": "Navigation", "description": ( - "Follow pagination, subcategories, and product detail " - "pages. Pagination Only is a better choice if the " - "target URL does not have subcategories, or if Zyte " - "API is misidentifying some URLs as subcategories." + "(Deprecated. Use Default instead) Follow pagination, " + "subcategories, and product detail pages. Pagination Only is a " + "better choice if the target URL does not have subcategories, " + "or if Zyte API is misidentifying some URLs as subcategories." ), }, EcommerceCrawlStrategy.pagination_only: { @@ -125,7 +145,10 @@ def get_start_request(self, url): meta = { "crawling_logs": {"page_type": "productNavigation"}, } - if self.args.crawl_strategy == EcommerceCrawlStrategy.full: + if self.args.crawl_strategy == EcommerceCrawlStrategy.full or ( + self.args.crawl_strategy == EcommerceCrawlStrategy.default + and is_homepage(url) + ): meta["page_params"] = {"full_domain": get_domain(url)} return Request( url=url, @@ -140,7 +163,7 @@ def start_requests(self) -> Iterable[Request]: def parse_navigation( self, response: DummyResponse, navigation: ProductNavigation ) -> Iterable[Request]: - page_params = response.meta.get("page_params") + page_params = self.page_params_for_heuristics(response) products = navigation.items or [] for request in products: @@ -266,3 +289,17 @@ def get_parse_product_request( ) scrapy_request.meta["allow_offsite"] = True return scrapy_request + + def page_params_for_heuristics( + self, response: DummyResponse + ) -> Optional[Dict[str, Any]]: + page_params = response.meta.get("page_params") + + # Only allow heuristic extraction of links in non-homepage when on "full" crawl. + if ( + self.args.crawl_strategy != EcommerceCrawlStrategy.full + and "full_domain" in (page_params or {}) + ): + page_params.pop("full_domain") + + return page_params