Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

redefine how request priorities are computed from probabilities #3

Merged
merged 1 commit on Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions tests/test_ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ def test_crawl():

subcategories = {
"subCategories": [
{"url": subcategory_urls[0]},
{"url": subcategory_urls[1]},
{"url": subcategory_urls[0], "metadata": {"probability": 0.95}},
{"url": subcategory_urls[1], "metadata": {"probability": 0.78}},
],
}
nextpage = {"nextPage": {"url": nextpage_url}}
items = {
"items": [
{"url": item_urls[0]},
{"url": item_urls[1]},
{"url": item_urls[0], "metadata": {"probability": 0.99}},
{"url": item_urls[1], "metadata": {"probability": 0.83}},
],
}

Expand All @@ -86,8 +86,10 @@ def test_crawl():
assert len(requests) == 2
assert requests[0].url == subcategory_urls[0]
assert requests[0].callback == spider.parse_navigation
assert requests[0].priority == 95
assert requests[1].url == subcategory_urls[1]
assert requests[1].callback == spider.parse_navigation
assert requests[1].priority == 78

# subcategories + nextpage
navigation = ProductNavigation.from_dict(
Expand All @@ -102,6 +104,7 @@ def test_crawl():
urls = {request.url for request in requests}
assert urls == {*subcategory_urls, nextpage_url}
assert all(request.callback == spider.parse_navigation for request in requests)
assert [request.priority for request in requests] == [100, 95, 78]

# subcategories + nextpage + items
navigation = ProductNavigation.from_dict(
Expand All @@ -120,6 +123,7 @@ def test_crawl():
assert request.callback == spider.parse_product
else:
assert request.callback == spider.parse_navigation
assert [request.priority for request in requests] == [199, 183, 100, 95, 78]

# nextpage + items
navigation = ProductNavigation.from_dict(
Expand All @@ -137,6 +141,7 @@ def test_crawl():
assert requests[1].callback == spider.parse_product
assert requests[2].url == nextpage_url
assert requests[2].callback == spider.parse_navigation
assert [request.priority for request in requests] == [199, 183, 100]

# subcategories + items
navigation = ProductNavigation.from_dict(
Expand All @@ -156,6 +161,7 @@ def test_crawl():
assert requests[2].callback == spider.parse_navigation
assert requests[3].url == subcategory_urls[1]
assert requests[3].callback == spider.parse_navigation
assert [request.priority for request in requests] == [199, 183, 95, 78]

# nextpage
navigation = ProductNavigation.from_dict(
Expand All @@ -168,6 +174,7 @@ def test_crawl():
assert len(requests) == 1
assert requests[0].url == nextpage_url
assert requests[0].callback == spider.parse_navigation
assert [request.priority for request in requests] == [100]

# items
navigation = ProductNavigation.from_dict(
Expand All @@ -182,6 +189,7 @@ def test_crawl():
assert requests[0].callback == spider.parse_product
assert requests[1].url == item_urls[1]
assert requests[1].callback == spider.parse_product
assert [request.priority for request in requests] == [199, 183]


@pytest.mark.parametrize(
Expand Down
24 changes: 15 additions & 9 deletions zyte_spider_templates/spiders/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from importlib.metadata import version
from typing import Any, Callable, Dict, Optional
from typing import Any, Callable, Dict, Optional, Union

import scrapy
from pydantic import BaseModel, Field
from scrapy.crawler import Crawler
from scrapy.utils.url import parse_url
from zyte_common_items import Request
from zyte_common_items import ProbabilityRequest, Request

from zyte_spider_templates._geolocations import (
GEOLOCATION_OPTIONS_WITH_CODE,
Expand Down Expand Up @@ -57,7 +57,7 @@ class BaseSpider(scrapy.Spider):
"description": "Base template.",
}

ITEM_REQUEST_PRIORITY: int = 10
_NEXT_PAGE_PRIORITY: int = 100

@classmethod
def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
Expand Down Expand Up @@ -86,18 +86,20 @@ def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider:
return spider

@staticmethod
def get_parse_navigation_request_priority(request: Request) -> int:
def get_parse_navigation_request_priority(
request: Union[ProbabilityRequest, Request]
) -> int:
if (
not hasattr(request, "metadata")
or not request.metadata
or request.metadata.probability is None
):
return 0
return int(10 * request.metadata.probability)
return int(100 * request.metadata.probability)

def get_parse_navigation_request(
self,
request: Request,
request: Union[ProbabilityRequest, Request],
callback: Optional[Callable] = None,
page_params: Optional[Dict[str, Any]] = None,
priority: Optional[int] = None,
Expand All @@ -109,11 +111,15 @@ def get_parse_navigation_request(
meta={"page_params": page_params or {}},
)

def get_parse_product_request_priority(self, request: Request) -> int:
return self.ITEM_REQUEST_PRIORITY
def get_parse_product_request_priority(self, request: ProbabilityRequest) -> int:
# TODO: Simplify when https://github.com/zytedata/zyte-common-items/pull/64 is released
probability = 0
if metadata := getattr(request, "metadata", None):
probability = metadata.probability
return int(100 * probability) + self._NEXT_PAGE_PRIORITY

def get_parse_product_request(
self, request: Request, callback: Optional[Callable] = None
self, request: ProbabilityRequest, callback: Optional[Callable] = None
) -> scrapy.Request:
callback = callback or self.parse_product
return request.to_scrapy(
Expand Down
2 changes: 1 addition & 1 deletion zyte_spider_templates/spiders/ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def parse_navigation(
if navigation.nextPage:
yield self.get_parse_navigation_request(
navigation.nextPage,
priority=self.ITEM_REQUEST_PRIORITY - 1,
priority=self._NEXT_PAGE_PRIORITY,
)
for request in navigation.subCategories or []:
if "[heuristics]" in (request.name or ""):
Expand Down