From b6a103f7acdb5ad3156121b303592f35d9006e0c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 26 Oct 2023 18:42:51 +0800 Subject: [PATCH] redefine how request probabilities are computed --- tests/test_ecommerce.py | 16 +++++++++++---- zyte_spider_templates/spiders/base.py | 24 ++++++++++++++-------- zyte_spider_templates/spiders/ecommerce.py | 2 +- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 82828f4..b721581 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -60,15 +60,15 @@ def test_crawl(): subcategories = { "subCategories": [ - {"url": subcategory_urls[0]}, - {"url": subcategory_urls[1]}, + {"url": subcategory_urls[0], "metadata": {"probability": 0.95}}, + {"url": subcategory_urls[1], "metadata": {"probability": 0.78}}, ], } nextpage = {"nextPage": {"url": nextpage_url}} items = { "items": [ - {"url": item_urls[0]}, - {"url": item_urls[1]}, + {"url": item_urls[0], "metadata": {"probability": 0.99}}, + {"url": item_urls[1], "metadata": {"probability": 0.83}}, ], } @@ -86,8 +86,10 @@ def test_crawl(): assert len(requests) == 2 assert requests[0].url == subcategory_urls[0] assert requests[0].callback == spider.parse_navigation + assert requests[0].priority == 95 assert requests[1].url == subcategory_urls[1] assert requests[1].callback == spider.parse_navigation + assert requests[1].priority == 78 # subcategories + nextpage navigation = ProductNavigation.from_dict( @@ -102,6 +104,7 @@ def test_crawl(): urls = {request.url for request in requests} assert urls == {*subcategory_urls, nextpage_url} assert all(request.callback == spider.parse_navigation for request in requests) + assert [request.priority for request in requests] == [100, 95, 78] # subcategories + nextpage + items navigation = ProductNavigation.from_dict( @@ -120,6 +123,7 @@ def test_crawl(): assert request.callback == spider.parse_product else: assert request.callback == spider.parse_navigation + assert [request.priority for request in requests] == [199, 183, 100, 95, 78] # nextpage + items navigation = ProductNavigation.from_dict( @@ -137,6 +141,7 @@ def test_crawl(): assert requests[1].callback == spider.parse_product assert requests[2].url == nextpage_url assert requests[2].callback == spider.parse_navigation + assert [request.priority for request in requests] == [199, 183, 100] # subcategories + items navigation = ProductNavigation.from_dict( @@ -156,6 +161,7 @@ def test_crawl(): assert requests[2].callback == spider.parse_navigation assert requests[3].url == subcategory_urls[1] assert requests[3].callback == spider.parse_navigation + assert [request.priority for request in requests] == [199, 183, 95, 78] # nextpage navigation = ProductNavigation.from_dict( @@ -168,6 +174,7 @@ def test_crawl(): assert len(requests) == 1 assert requests[0].url == nextpage_url assert requests[0].callback == spider.parse_navigation + assert [request.priority for request in requests] == [100] # items navigation = ProductNavigation.from_dict( @@ -182,6 +189,7 @@ def test_crawl(): assert requests[0].callback == spider.parse_product assert requests[1].url == item_urls[1] assert requests[1].callback == spider.parse_product + assert [request.priority for request in requests] == [199, 183] @pytest.mark.parametrize( diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py index 1c6bd43..4b2fe5b 100644 --- a/zyte_spider_templates/spiders/base.py +++ b/zyte_spider_templates/spiders/base.py @@ -1,11 +1,11 @@ from importlib.metadata import version -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional, Union import scrapy from pydantic import BaseModel, Field from scrapy.crawler import Crawler from scrapy.utils.url import parse_url -from zyte_common_items import Request +from zyte_common_items import ProbabilityRequest, Request from zyte_spider_templates._geolocations import ( GEOLOCATION_OPTIONS_WITH_CODE, @@ -57,7 +57,7 @@ class BaseSpider(scrapy.Spider): "description": "Base template.", } - ITEM_REQUEST_PRIORITY: int = 10 + _NEXT_PAGE_PRIORITY: int = 100 @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: @@ -86,18 +86,20 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: return spider @staticmethod - def get_parse_navigation_request_priority(request: Request) -> int: + def get_parse_navigation_request_priority( + request: Union[ProbabilityRequest, Request] + ) -> int: if ( not hasattr(request, "metadata") or not request.metadata or request.metadata.probability is None ): return 0 - return int(10 * request.metadata.probability) + return int(100 * request.metadata.probability) def get_parse_navigation_request( self, - request: Request, + request: Union[ProbabilityRequest, Request], callback: Optional[Callable] = None, page_params: Optional[Dict[str, Any]] = None, priority: Optional[int] = None, @@ -109,11 +111,15 @@ def get_parse_navigation_request( meta={"page_params": page_params or {}}, ) - def get_parse_product_request_priority(self, request: Request) -> int: - return self.ITEM_REQUEST_PRIORITY + def get_parse_product_request_priority(self, request: ProbabilityRequest) -> int: + # TODO: Simplify when https://github.com/zytedata/zyte-common-items/pull/64 is released + probability = 0 + if metadata := getattr(request, "metadata", None): + probability = metadata.probability + return int(100 * probability) + self._NEXT_PAGE_PRIORITY def get_parse_product_request( - self, request: Request, callback: Optional[Callable] = None + self, request: ProbabilityRequest, callback: Optional[Callable] = None ) -> scrapy.Request: callback = callback or self.parse_product return request.to_scrapy( diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index e0a69b6..7c4db2e 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -133,7 +133,7 @@ def parse_navigation( if navigation.nextPage: yield self.get_parse_navigation_request( navigation.nextPage, - priority=self.ITEM_REQUEST_PRIORITY - 1, + priority=self._NEXT_PAGE_PRIORITY, ) for request in navigation.subCategories or []: if "[heuristics]" in (request.name or ""):