diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 82828f4..2f55da9 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -183,6 +183,29 @@ def test_crawl(): assert requests[1].url == item_urls[1] assert requests[1].callback == spider.parse_product + # Test parse_navigation() behavior on pagination_only crawl strategy. + spider = EcommerceSpider( + url="https://example.com/", crawl_strategy="pagination_only" + ) + + # nextpage + items + navigation = ProductNavigation.from_dict( + { + "url": url, + **subcategories, + **nextpage, + **items, + } + ) + requests = list(spider.parse_navigation(response, navigation)) + urls = {request.url for request in requests} + assert urls == {*item_urls, nextpage_url} + for request in requests: + if request.url in item_urls: + assert request.callback == spider.parse_product + else: + assert request.callback == spider.parse_navigation + @pytest.mark.parametrize( "probability,has_item", ((0.9, True), (0.09, False), (0.1, True), (None, True)) @@ -292,7 +315,7 @@ def test_metadata(): "title": "Crawl strategy", "description": "Determines how the start URL and follow-up URLs are crawled.", "type": "string", - "enum": ["navigation", "full"], + "enum": ["full", "navigation", "pagination_only"], "enumMeta": { "full": { "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.", @@ -302,6 +325,13 @@ def test_metadata(): "description": "Follow pagination, subcategories, and product detail pages.", "title": "Navigation", }, + "pagination_only": { + "description": ( + "Follow pagination and product detail pages. SubCategory links are ignored. " + "Use this when some subCategory links are misidentified by ML-extraction." + ), + "title": "Pagination Only", + }, }, }, "extract_from": { diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index e0a69b6..41eb71d 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -14,8 +14,9 @@ class EcommerceCrawlStrategy(str, Enum): - navigation: str = "navigation" full: str = "full" + navigation: str = "navigation" + pagination_only: str = "pagination_only" class ExtractFrom(str, Enum): @@ -30,13 +31,20 @@ class EcommerceSpiderParams(BaseSpiderParams): default=EcommerceCrawlStrategy.navigation, json_schema_extra={ "enumMeta": { + EcommerceCrawlStrategy.full: { + "title": "Full", + "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.", + }, EcommerceCrawlStrategy.navigation: { "title": "Navigation", "description": "Follow pagination, subcategories, and product detail pages.", }, - EcommerceCrawlStrategy.full: { - "title": "Full", - "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.", + EcommerceCrawlStrategy.pagination_only: { + "title": "Pagination Only", + "description": ( + "Follow pagination and product detail pages. SubCategory links are ignored. " + "Use this when some subCategory links are misidentified by ML-extraction." + ), }, }, }, @@ -135,13 +143,15 @@ def parse_navigation( navigation.nextPage, priority=self.ITEM_REQUEST_PRIORITY - 1, ) - for request in navigation.subCategories or []: - if "[heuristics]" in (request.name or ""): - yield self.get_parse_navigation_request( - request, page_params=page_params - ) - else: - yield self.get_parse_navigation_request(request) + + if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only: + for request in navigation.subCategories or []: + if "[heuristics]" in (request.name or ""): + yield self.get_parse_navigation_request( + request, page_params=page_params + ) + else: + yield self.get_parse_navigation_request(request) def parse_product( self, response: DummyResponse, product: Product