Merge pull request #2 from zytedata/strategy-pagination-only
add new crawl_strategy: `pagination_only`
kmike authored Oct 27, 2023
2 parents a6d4f20 + 396283d commit 2abab5c
Showing 2 changed files with 52 additions and 12 deletions.
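For orientation, a minimal usage sketch (not part of this commit) of how the new strategy could be selected. It assumes a Scrapy project already configured for Zyte API / scrapy-poet, as zyte-spider-templates spiders require, and the `scrapy crawl ecommerce` invocation in the comment assumes the spider keeps its usual `ecommerce` name.

# Hypothetical usage sketch, assuming project settings for Zyte API / scrapy-poet
# are already in place; roughly equivalent to:
#   scrapy crawl ecommerce -a url="https://example.com/" -a crawl_strategy=pagination_only
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

process = CrawlerProcess(get_project_settings())
process.crawl(
    EcommerceSpider,
    url="https://example.com/",
    crawl_strategy="pagination_only",  # the strategy added by this commit
)
process.start()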
32 changes: 31 additions & 1 deletion tests/test_ecommerce.py
@@ -183,6 +183,29 @@ def test_crawl():
     assert requests[1].url == item_urls[1]
     assert requests[1].callback == spider.parse_product

+    # Test parse_navigation() behavior on pagination_only crawl strategy.
+    spider = EcommerceSpider(
+        url="https://example.com/", crawl_strategy="pagination_only"
+    )
+
+    # nextpage + items
+    navigation = ProductNavigation.from_dict(
+        {
+            "url": url,
+            **subcategories,
+            **nextpage,
+            **items,
+        }
+    )
+    requests = list(spider.parse_navigation(response, navigation))
+    urls = {request.url for request in requests}
+    assert urls == {*item_urls, nextpage_url}
+    for request in requests:
+        if request.url in item_urls:
+            assert request.callback == spider.parse_product
+        else:
+            assert request.callback == spider.parse_navigation
+

 @pytest.mark.parametrize(
     "probability,has_item", ((0.9, True), (0.09, False), (0.1, True), (None, True))
@@ -292,7 +315,7 @@ def test_metadata():
                     "title": "Crawl strategy",
                     "description": "Determines how the start URL and follow-up URLs are crawled.",
                     "type": "string",
-                    "enum": ["navigation", "full"],
+                    "enum": ["full", "navigation", "pagination_only"],
                     "enumMeta": {
                         "full": {
                             "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
@@ -302,6 +325,13 @@ def test_metadata():
                             "description": "Follow pagination, subcategories, and product detail pages.",
                             "title": "Navigation",
                         },
+                        "pagination_only": {
+                            "description": (
+                                "Follow pagination and product detail pages. SubCategory links are ignored. "
+                                "Use this when some subCategory links are misidentified by ML-extraction."
+                            ),
+                            "title": "Pagination Only",
+                        },
                     },
                 },
                 "extract_from": {
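To spell out what the new test case asserts, here is a self-contained sketch of the routing rule; followed_links and its arguments are hypothetical stand-ins, not the spider's API. Under pagination_only, item and next-page links are still followed while subcategory links are dropped.

# Hypothetical stand-in for the rule the new test asserts; the real spider
# yields Request objects from parse_navigation() rather than returning URLs.
def followed_links(crawl_strategy, item_urls, subcategory_urls, nextpage_url):
    followed = list(item_urls)         # product detail pages: always followed
    if nextpage_url:
        followed.append(nextpage_url)  # pagination: always followed
    if crawl_strategy != "pagination_only":
        followed += subcategory_urls   # subcategories: skipped for pagination_only
    return followed

assert followed_links(
    "pagination_only",
    item_urls=["https://example.com/p/1"],
    subcategory_urls=["https://example.com/category/2"],
    nextpage_url="https://example.com/page-2",
) == ["https://example.com/p/1", "https://example.com/page-2"]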
32 changes: 21 additions & 11 deletions zyte_spider_templates/spiders/ecommerce.py
@@ -14,8 +14,9 @@


 class EcommerceCrawlStrategy(str, Enum):
-    navigation: str = "navigation"
     full: str = "full"
+    navigation: str = "navigation"
+    pagination_only: str = "pagination_only"


 class ExtractFrom(str, Enum):
@@ -30,13 +31,20 @@ class EcommerceSpiderParams(BaseSpiderParams):
         default=EcommerceCrawlStrategy.navigation,
         json_schema_extra={
             "enumMeta": {
+                EcommerceCrawlStrategy.full: {
+                    "title": "Full",
+                    "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
+                },
                 EcommerceCrawlStrategy.navigation: {
                     "title": "Navigation",
                     "description": "Follow pagination, subcategories, and product detail pages.",
                 },
-                EcommerceCrawlStrategy.full: {
-                    "title": "Full",
-                    "description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
+                EcommerceCrawlStrategy.pagination_only: {
+                    "title": "Pagination Only",
+                    "description": (
+                        "Follow pagination and product detail pages. SubCategory links are ignored. "
+                        "Use this when some subCategory links are misidentified by ML-extraction."
+                    ),
                 },
             },
         },
@@ -135,13 +143,15 @@ def parse_navigation(
                 navigation.nextPage,
                 priority=self.ITEM_REQUEST_PRIORITY - 1,
             )
-        for request in navigation.subCategories or []:
-            if "[heuristics]" in (request.name or ""):
-                yield self.get_parse_navigation_request(
-                    request, page_params=page_params
-                )
-            else:
-                yield self.get_parse_navigation_request(request)
+
+        if self.args.crawl_strategy != EcommerceCrawlStrategy.pagination_only:
+            for request in navigation.subCategories or []:
+                if "[heuristics]" in (request.name or ""):
+                    yield self.get_parse_navigation_request(
+                        request, page_params=page_params
+                    )
+                else:
+                    yield self.get_parse_navigation_request(request)

     def parse_product(
         self, response: DummyResponse, product: Product
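One design note on the guard added to parse_navigation(): because EcommerceCrawlStrategy subclasses str, its members compare equal to their plain string values, so the inequality check behaves the same whether self.args.crawl_strategy holds the enum member or the raw string. A minimal sketch:

# Minimal sketch of the str-Enum comparison the new guard relies on.
from enum import Enum


class EcommerceCrawlStrategy(str, Enum):
    full: str = "full"
    navigation: str = "navigation"
    pagination_only: str = "pagination_only"


# str members compare equal to their values, so both forms behave the same.
assert EcommerceCrawlStrategy.pagination_only == "pagination_only"
assert "navigation" != EcommerceCrawlStrategy.pagination_only
assert EcommerceCrawlStrategy("pagination_only") is EcommerceCrawlStrategy.pagination_only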
