Skip to content

Commit

Permalink
combine full and navigation strategies
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Apr 30, 2024
1 parent 186eba3 commit c3dbcec
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 18 deletions.
82 changes: 74 additions & 8 deletions tests/test_ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def test_parameters():

EcommerceSpider(url="https://example.com")
EcommerceSpider(
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.full
url="https://example.com", crawl_strategy=EcommerceCrawlStrategy.default
)
EcommerceSpider(url="https://example.com", crawl_strategy="full")
EcommerceSpider(url="https://example.com", crawl_strategy="default")

with pytest.raises(ValidationError):
EcommerceSpider(url="https://example.com", crawl_strategy="unknown")
Expand Down Expand Up @@ -465,18 +465,30 @@ def test_metadata():
"enum": ["httpResponseBody", "browserHtml"],
},
"crawl_strategy": {
"default": "full",
"default": "default",
"description": "Determines how the start URL and follow-up URLs are crawled.",
"enumMeta": {
"default": {
"description": (
"Follow pagination, subcategories, and product detail pages. "
"If starting on a homepage, it would attempt to discover other "
"URLs in the page using heuristics."
),
"title": "Default",
},
"full": {
"description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
"description": (
"(Deprecated. Use Default instead) Follow most links "
"within the domain of URL in an attempt to discover and "
"extract as many products as possible."
),
"title": "Full",
},
"navigation": {
"description": (
"Follow pagination, subcategories, and "
"product detail pages. Pagination Only is a "
"better choice if the target URL does not "
"(Deprecated. Use Default instead) Follow pagination, "
"subcategories, and product detail pages. Pagination "
"Only is a better choice if the target URL does not "
"have subcategories, or if Zyte API is "
"misidentifying some URLs as subcategories."
),
Expand All @@ -490,7 +502,7 @@ def test_metadata():
},
},
"title": "Crawl strategy",
"enum": ["full", "navigation", "pagination_only"],
"enum": ["default", "full", "navigation", "pagination_only"],
"type": "string",
},
},
Expand Down Expand Up @@ -727,3 +739,57 @@ def test_urls_file():
assert start_requests[0].url == "https://a.example"
assert start_requests[1].url == "https://b.example"
assert start_requests[2].url == "https://c.example"


@pytest.mark.parametrize(
    "url,has_full_domain",
    (
        ("https://example.com", (True, True, False, False)),
        ("https://example.com/", (True, True, False, False)),
        ("https://example.com/index.htm", (True, True, False, False)),
        ("https://example.com/index.html", (True, True, False, False)),
        ("https://example.com/index.php", (True, True, False, False)),
        ("https://example.com/home", (True, True, False, False)),
        ("https://example.com/some/category", (False, True, False, False)),
        ("https://example.com/some/category?pid=123", (False, True, False, False)),
    ),
)
def test_get_start_request_default_strategy(url, has_full_domain):
    """The start request carries ``page_params`` with ``full_domain`` only for
    the "full" strategy and for the "default" strategy on homepage-like URLs."""
    strategies = ["default", "full", "navigation", "pagination_only"]
    for strategy, expects_full_domain in zip(strategies, has_full_domain):
        spider = EcommerceSpider.from_crawler(
            get_crawler(), url=url, crawl_strategy=strategy
        )
        request = spider.get_start_request(url)
        assert request.url == url
        assert request.callback == spider.parse_navigation
        expected_meta = {"crawling_logs": {"page_type": "productNavigation"}}
        if expects_full_domain:
            expected_meta["page_params"] = {"full_domain": "example.com"}
        assert request.meta == expected_meta


@pytest.mark.parametrize(
    "crawl_strategy,expected_page_params",
    (
        ("default", {}),
        ("full", {"full_domain": "example.com"}),
        ("navigation", {}),
        ("pagination_only", {}),
    ),
)
def test_page_params_for_heuristics(crawl_strategy, expected_page_params):
    """Only the "full" strategy keeps ``full_domain`` in the page params;
    every other strategy strips it."""
    url = "https://example.com"
    request = scrapy.Request(
        url, meta={"page_params": {"full_domain": "example.com"}}
    )
    response = DummyResponse(url=url, request=request)

    spider = EcommerceSpider.from_crawler(
        get_crawler(), url=url, crawl_strategy=crawl_strategy
    )
    assert spider.page_params_for_heuristics(response) == expected_page_params
20 changes: 19 additions & 1 deletion tests/test_heuristics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from zyte_spider_templates.heuristics import might_be_category
from zyte_spider_templates.heuristics import is_homepage, might_be_category


@pytest.mark.parametrize(
Expand Down Expand Up @@ -50,3 +50,21 @@
)
def test_might_be_category(test_input, expected):
assert might_be_category(test_input) == expected


@pytest.mark.parametrize(
    "url,expected",
    (
        ("https://example.com", True),
        ("https://example.com/", True),
        ("https://example.com/index.htm", True),
        ("https://example.com/index.html", True),
        ("https://example.com/index.php", True),
        ("https://example.com/home", True),
        ("https://example.com/?ref=abc", False),
        ("https://example.com/some/category", False),
        ("https://example.com/some/category?query=2123", False),
    ),
)
def test_is_homepage(url, expected):
    """Homepage detection: index-like paths without a query string match."""
    result = is_homepage(url)
    assert result == expected
18 changes: 17 additions & 1 deletion zyte_spider_templates/heuristics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from urllib.parse import urlparse
from urllib.parse import urlparse, urlsplit

NO_CONTENT_PATHS = (
"/authenticate",
Expand Down Expand Up @@ -56,3 +56,19 @@ def might_be_category(url: str) -> bool:
return False

return True


# Paths that conventionally identify a site's landing page.
INDEX_URL_PATHS = {
    "",
    "/",
    "/index.html",
    "/index.htm",
    "/index.php",
    "/home",
}


# TODO: support localization suffixes? Example: /en, /en-us
def is_homepage(url: str) -> bool:
    """Return True when *url* looks like a site homepage.

    A URL qualifies only if its path is one of the conventional index
    paths AND it carries no query string (e.g. ``?ref=abc`` disqualifies).
    """
    parts = urlsplit(url)
    if parts.query:
        return False
    return parts.path in INDEX_URL_PATHS
53 changes: 45 additions & 8 deletions zyte_spider_templates/spiders/ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from scrapy_spider_metadata import Args
from zyte_common_items import ProbabilityRequest, Product, ProductNavigation

from zyte_spider_templates.heuristics import is_homepage
from zyte_spider_templates.spiders.base import (
ARG_SETTING_PRIORITY,
BaseSpider,
Expand All @@ -23,6 +24,13 @@

@document_enum
class EcommerceCrawlStrategy(str, Enum):
default: str = "default"
"""Follow pagination, subcategories, and product detail pages.
If the starting URL points to a homepage, it would attempt to discover other
URLs in the page using heuristics.
"""

full: str = "full"
"""Follow most links within the domain of URL in an attempt to discover and
extract as many products as possible."""
Expand All @@ -43,20 +51,32 @@ class EcommerceCrawlStrategyParam(BaseModel):
crawl_strategy: EcommerceCrawlStrategy = Field(
title="Crawl strategy",
description="Determines how the start URL and follow-up URLs are crawled.",
default=EcommerceCrawlStrategy.full,
default=EcommerceCrawlStrategy.default,
json_schema_extra={
"enumMeta": {
EcommerceCrawlStrategy.default: {
"description": (
"Follow pagination, subcategories, and product detail pages. "
"If starting on a homepage, it would attempt to discover other "
"URLs in the page using heuristics."
),
"title": "Default",
},
EcommerceCrawlStrategy.full: {
"title": "Full",
"description": "Follow most links within the domain of URL in an attempt to discover and extract as many products as possible.",
"description": (
"(Deprecated. Use Default instead) Follow most links within the "
"domain of URL in an attempt to discover and extract as many "
"products as possible."
),
},
EcommerceCrawlStrategy.navigation: {
"title": "Navigation",
"description": (
"Follow pagination, subcategories, and product detail "
"pages. Pagination Only is a better choice if the "
"target URL does not have subcategories, or if Zyte "
"API is misidentifying some URLs as subcategories."
"(Deprecated. Use Default instead) Follow pagination, "
"subcategories, and product detail pages. Pagination Only is a "
"better choice if the target URL does not have subcategories, "
"or if Zyte API is misidentifying some URLs as subcategories."
),
},
EcommerceCrawlStrategy.pagination_only: {
Expand Down Expand Up @@ -125,7 +145,10 @@ def get_start_request(self, url):
meta = {
"crawling_logs": {"page_type": "productNavigation"},
}
if self.args.crawl_strategy == EcommerceCrawlStrategy.full:
if self.args.crawl_strategy == EcommerceCrawlStrategy.full or (
self.args.crawl_strategy == EcommerceCrawlStrategy.default
and is_homepage(url)
):
meta["page_params"] = {"full_domain": get_domain(url)}
return Request(
url=url,
Expand All @@ -140,7 +163,7 @@ def start_requests(self) -> Iterable[Request]:
def parse_navigation(
self, response: DummyResponse, navigation: ProductNavigation
) -> Iterable[Request]:
page_params = response.meta.get("page_params")
page_params = self.page_params_for_heuristics(response)

products = navigation.items or []
for request in products:
Expand Down Expand Up @@ -266,3 +289,17 @@ def get_parse_product_request(
)
scrapy_request.meta["allow_offsite"] = True
return scrapy_request

def page_params_for_heuristics(
    self, response: DummyResponse
) -> Optional[Dict[str, Any]]:
    """Return the ``page_params`` dict for *response*, stripping the
    ``full_domain`` key unless the crawl strategy is "full".

    NOTE: when ``full_domain`` is removed, the dict stored in
    ``response.meta`` is mutated in place (``pop``), matching the
    original behavior.
    """
    page_params = response.meta.get("page_params")

    # Heuristic link discovery on non-homepage URLs is reserved for the
    # "full" crawl strategy; drop the flag for every other strategy.
    if self.args.crawl_strategy != EcommerceCrawlStrategy.full:
        if page_params and "full_domain" in page_params:
            page_params.pop("full_domain")

    return page_params

0 comments on commit c3dbcec

Please sign in to comment.