
Commit

Merge pull request #64 from zytedata/improve-heuristics
support subdomain checks in category heuristics
kmike authored Sep 3, 2024
2 parents 9ba145d + a8d5964, commit 84a37a1
Showing 2 changed files with 37 additions and 30 deletions.
tests/test_heuristics.py (6 changes: 6 additions & 0 deletions)
@@ -46,6 +46,12 @@
         ("https://example.com/terms_of_use", False),
         ("https://example.com/terms_of_service", False),
         ("https://example.com/terms_of_conditions", False),
+        # subdomains
+        ("https://blog.example.com", False),
+        ("https://admin.example.com", False),
+        ("https://cart.example.com", False),
+        ("https://news.example.com", False),
+        ("https://careers.example.com", False),
     ),
 )
 def test_might_be_category(test_input, expected):
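If helpful, the new cases can be run on their own through pytest's Python entry point; a quick sketch, with the file path taken from this PR and the invocation purely illustrative:

    import pytest

    # -k does a substring match on test names, so this selects
    # test_might_be_category with all of its parametrized cases.
    pytest.main(["tests/test_heuristics.py", "-k", "might_be_category"])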
zyte_spider_templates/heuristics.py (61 changes: 31 additions & 30 deletions)
@@ -8,32 +8,32 @@
 LANG_CODES = set(_LANG_CODES)


-NO_CONTENT_PATHS = (
-    "/authenticate",
-    "/my-account",
-    "/account",
-    "/my-wishlist",
-    "/search",
-    "/archive",
-    "/privacy-policy",
-    "/cookie-policy",
-    "/terms-conditions",
-    "/tos",
-    "/admin",
-    "/rss.xml",
-    "/subscribe",
-    "/newsletter",
-    "/settings",
-    "/cart",
-    "/articles",
-    "/artykuly",  # Polish for articles
-    "/news",
-    "/blog",
-    "/about",
-    "/about-us",
-    "/affiliate",
-    "/press",
-    "/careers",
+NO_CONTENT_KEYWORDS = (
+    "authenticate",
+    "my-account",
+    "account",
+    "my-wishlist",
+    "search",
+    "archive",
+    "privacy-policy",
+    "cookie-policy",
+    "terms-conditions",
+    "tos",
+    "admin",
+    "rss.xml",
+    "subscribe",
+    "newsletter",
+    "settings",
+    "cart",
+    "articles",
+    "artykuly",  # Polish for articles
+    "news",
+    "blog",
+    "about",
+    "about-us",
+    "affiliate",
+    "press",
+    "careers",
 )

 SUFFIXES = [".html", ".php", ".cgi", ".asp"]
@@ -51,13 +51,14 @@ def might_be_category(url: str) -> bool:
     """Returns True if the given url might be a category based on its path."""

     url = url.lower().rstrip("/")
-    url_path = urlparse(url).path
+    parsed_url = urlparse(url)

     for suffix in [""] + SUFFIXES:
-        for path in NO_CONTENT_PATHS:
-            if url_path.endswith(path + suffix):
+        for path in NO_CONTENT_KEYWORDS:
+            if parsed_url.path.endswith(f"/{path}{suffix}"):
+                return False
+            if parsed_url.netloc.startswith(f"{path}."):
                 return False
-    for suffix in [""] + SUFFIXES:
         for rule in NO_CONTENT_RE:
             if re.search(rule + suffix, url):
                 return False
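For context, a minimal usage sketch of the updated heuristic (illustrative only, assuming the package is importable; the module path is the one changed above). Keyword paths are rejected as before, and keyword subdomains are now rejected as well via the new netloc check, matching the added test cases. Note that the docstring still says "based on its path" even though the hostname is now also consulted.

    from zyte_spider_templates.heuristics import might_be_category

    # A no-content keyword at the end of the path is rejected, as before.
    print(might_be_category("https://example.com/blog"))      # False
    # New in this change: the same keyword used as a subdomain is rejected too.
    print(might_be_category("https://blog.example.com"))      # False
    print(might_be_category("https://careers.example.com"))   # False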
