
Commit

Merge pull request #64 from zytedata/improve-heuristics
support subdomain checks in category heuristics
kmike authored Sep 3, 2024
2 parents 9ba145d + a8d5964, commit 84a37a1
Showing 2 changed files with 37 additions and 30 deletions.
tests/test_heuristics.py (6 changes: 6 additions & 0 deletions)
@@ -46,6 +46,12 @@
         ("https://example.com/terms_of_use", False),
         ("https://example.com/terms_of_service", False),
         ("https://example.com/terms_of_conditions", False),
+        # subdomains
+        ("https://blog.example.com", False),
+        ("https://admin.example.com", False),
+        ("https://cart.example.com", False),
+        ("https://news.example.com", False),
+        ("https://careers.example.com", False),
     ),
 )
 def test_might_be_category(test_input, expected):
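If helpful, the new cases can be run on their own through pytest's Python entry point; a quick sketch, with the file path taken from this PR and the invocation purely illustrative:

    import pytest

    # -k does a substring match on test names, so this selects
    # test_might_be_category with all of its parametrized cases.
    pytest.main(["tests/test_heuristics.py", "-k", "might_be_category"])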
zyte_spider_templates/heuristics.py (61 changes: 31 additions & 30 deletions)
@@ -8,32 +8,32 @@
 LANG_CODES = set(_LANG_CODES)


-NO_CONTENT_PATHS = (
-    "/authenticate",
-    "/my-account",
-    "/account",
-    "/my-wishlist",
-    "/search",
-    "/archive",
-    "/privacy-policy",
-    "/cookie-policy",
-    "/terms-conditions",
-    "/tos",
-    "/admin",
-    "/rss.xml",
-    "/subscribe",
-    "/newsletter",
-    "/settings",
-    "/cart",
-    "/articles",
-    "/artykuly",  # Polish for articles
-    "/news",
-    "/blog",
-    "/about",
-    "/about-us",
-    "/affiliate",
-    "/press",
-    "/careers",
+NO_CONTENT_KEYWORDS = (
+    "authenticate",
+    "my-account",
+    "account",
+    "my-wishlist",
+    "search",
+    "archive",
+    "privacy-policy",
+    "cookie-policy",
+    "terms-conditions",
+    "tos",
+    "admin",
+    "rss.xml",
+    "subscribe",
+    "newsletter",
+    "settings",
+    "cart",
+    "articles",
+    "artykuly",  # Polish for articles
+    "news",
+    "blog",
+    "about",
+    "about-us",
+    "affiliate",
+    "press",
+    "careers",
 )

 SUFFIXES = [".html", ".php", ".cgi", ".asp"]
@@ -51,13 +51,14 @@ def might_be_category(url: str) -> bool:
     """Returns True if the given url might be a category based on its path."""

     url = url.lower().rstrip("/")
-    url_path = urlparse(url).path
+    parsed_url = urlparse(url)

     for suffix in [""] + SUFFIXES:
-        for path in NO_CONTENT_PATHS:
-            if url_path.endswith(path + suffix):
+        for path in NO_CONTENT_KEYWORDS:
+            if parsed_url.path.endswith(f"/{path}{suffix}"):
+                return False
+            if parsed_url.netloc.startswith(f"{path}."):
                 return False
-    for suffix in [""] + SUFFIXES:
         for rule in NO_CONTENT_RE:
             if re.search(rule + suffix, url):
                 return False
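For context, a minimal usage sketch of the updated heuristic (illustrative only, assuming the package is importable; the module path is the one changed above). Keyword paths are rejected as before, and keyword subdomains are now rejected as well via the new netloc check, matching the added test cases. Note that the docstring still says "based on its path" even though the hostname is now also consulted.

    from zyte_spider_templates.heuristics import might_be_category

    # A no-content keyword at the end of the path is rejected, as before.
    print(might_be_category("https://example.com/blog"))      # False
    # New in this change: the same keyword used as a subdomain is rejected too.
    print(might_be_category("https://blog.example.com"))      # False
    print(might_be_category("https://careers.example.com"))   # False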
