diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index 58f1352..11e76b4 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -46,6 +46,12 @@ ("https://example.com/terms_of_use", False), ("https://example.com/terms_of_service", False), ("https://example.com/terms_of_conditions", False), + # subdomains + ("https://blog.example.com", False), + ("https://admin.example.com", False), + ("https://cart.example.com", False), + ("https://news.example.com", False), + ("https://careers.example.com", False), ), ) def test_might_be_category(test_input, expected): diff --git a/zyte_spider_templates/heuristics.py b/zyte_spider_templates/heuristics.py index 91e5cad..eba3639 100644 --- a/zyte_spider_templates/heuristics.py +++ b/zyte_spider_templates/heuristics.py @@ -8,32 +8,32 @@ LANG_CODES = set(_LANG_CODES) -NO_CONTENT_PATHS = ( - "/authenticate", - "/my-account", - "/account", - "/my-wishlist", - "/search", - "/archive", - "/privacy-policy", - "/cookie-policy", - "/terms-conditions", - "/tos", - "/admin", - "/rss.xml", - "/subscribe", - "/newsletter", - "/settings", - "/cart", - "/articles", - "/artykuly", # Polish for articles - "/news", - "/blog", - "/about", - "/about-us", - "/affiliate", - "/press", - "/careers", +NO_CONTENT_KEYWORDS = ( + "authenticate", + "my-account", + "account", + "my-wishlist", + "search", + "archive", + "privacy-policy", + "cookie-policy", + "terms-conditions", + "tos", + "admin", + "rss.xml", + "subscribe", + "newsletter", + "settings", + "cart", + "articles", + "artykuly", # Polish for articles + "news", + "blog", + "about", + "about-us", + "affiliate", + "press", + "careers", ) SUFFIXES = [".html", ".php", ".cgi", ".asp"] @@ -51,13 +51,14 @@ def might_be_category(url: str) -> bool: """Returns True if the given url might be a category based on its path.""" url = url.lower().rstrip("/") - url_path = urlparse(url).path + parsed_url = urlparse(url) for suffix in [""] + SUFFIXES: - for path in NO_CONTENT_PATHS: - if url_path.endswith(path + suffix): + for path in NO_CONTENT_KEYWORDS: + if parsed_url.path.endswith(f"/{path}{suffix}"): + return False + if parsed_url.netloc.startswith(f"{path}."): return False - for suffix in [""] + SUFFIXES: for rule in NO_CONTENT_RE: if re.search(rule + suffix, url): return False