From 9cb73f21167e10d166a899696132403fb1f5d2a3 Mon Sep 17 00:00:00 2001
From: jourdelune
Date: Tue, 23 Jul 2024 20:19:10 +0200
Subject: [PATCH] [update] Add domain filtering to avoid crawling the same
 domain multiple times

---
 src/filter.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/routes.py | 51 ++++++++++++++++++++++++++-----------------
 2 files changed, 86 insertions(+), 21 deletions(-)
 create mode 100644 src/filter.py

diff --git a/src/filter.py b/src/filter.py
new file mode 100644
index 0000000..e57828c
--- /dev/null
+++ b/src/filter.py
@@ -0,0 +1,56 @@
+"""
+Filter class
+"""
+
+import urllib.parse
+
+
+class Filter:
+    """
+    Class to filter domains and avoid re-crawling a domain that does not contain any song
+    """
+
+    def __init__(self, max_crawl: int = 20, cache_size: int = 500) -> None:
+        """Initialize the domain filter
+
+        Args:
+            max_crawl (int, optional): the maximum number of crawls of a domain
+                before its URLs are rejected if no song has been added. Defaults to 20.
+            cache_size (int, optional): the maximum number of domains to track. Defaults to 500.
+        """
+        self._domains = {}
+        self._cache_size = cache_size
+        self._max_crawl = max_crawl
+
+    def check_domain(self, url: str) -> bool:
+        """Check that the domain has not been visited more than max_crawl times without any song being added
+
+        Args:
+            url (str): the url to check
+
+        Returns:
+            bool: True if the domain has not been visited more than max_crawl times, False otherwise
+        """
+
+        domain = urllib.parse.urlparse(url).netloc
+
+        if len(self._domains) > self._cache_size:
+            oldest_domain = next(iter(self._domains))  # evict the oldest entry (dicts keep insertion order)
+            del self._domains[oldest_domain]
+
+        if domain not in self._domains:
+            self._domains[domain] = 1
+        else:
+            self._domains[domain] += 1
+
+        return self._domains[domain] <= self._max_crawl
+
+    def valid_domain(self, url: str) -> None:
+        """Reset the counter of the domain (called when a song is found)
+
+        Args:
+            url (str): the url whose domain counter is reset
+        """
+
+        domain = urllib.parse.urlparse(url).netloc
+        self._domains[domain] = 0
diff --git a/src/routes.py b/src/routes.py
index 5b0c0cf..ed2ec67 100644
--- a/src/routes.py
+++ b/src/routes.py
@@ -8,11 +8,13 @@
 from crawlee.basic_crawler import Router
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
 
+from filter import Filter
 from robots import RobotTXT
 from utils import is_valid_url
 
 router = Router[BeautifulSoupCrawlingContext]()
 robots_parser = RobotTXT()
+filter_domain = Filter()
 
 REGEX = r"(https?:)?(\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"
 
@@ -44,24 +46,31 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
             context.log.info(f"Found audio link: {link}")
 
             await context.push_data(data)  # save the links
-
-    # get all links in the page
-    requests = []
-    for link in context.soup.select("a"):
-        if link.attrs.get("href") is not None:
-            url = urllib.parse.urljoin(
-                context.request.url, link.attrs.get("href")
-            ).strip()
-
-            if not is_valid_url(url):
-                continue
-
-            authorized = await robots_parser(
-                url, context.log
-            )  # get if robots.txt allow the crawl
-            if authorized:
-                url_trunk = url.split("?")[0].split("#")[0]
-
-                requests.append(url_trunk)
-
-    await context.add_requests(requests)
+            filter_domain.valid_domain(context.request.url)  # a song was found: reset this domain's counter
+
+    # only follow links when the page mentions music-related keywords
+    keywords = ["music", "audio", "sound", "song", "artist"]
+    text = context.soup.get_text(separator=" ", strip=True)
+
+    if any(
+        keyword in text.lower() for keyword in keywords
+    ) and filter_domain.check_domain(context.request.url):
+        requests = []  # crawlable links found on the page
+        for link in context.soup.select("a"):
+            if link.attrs.get("href") is not None:
+                url = urllib.parse.urljoin(
+                    context.request.url, link.attrs.get("href")
+                ).strip()
+
+                if not is_valid_url(url):
+                    continue
+
+                authorized = await robots_parser(
+                    url, context.log
+                )  # check whether robots.txt allows the crawl
+                if authorized:
+                    url_trunk = url.split("?")[0].split("#")[0]
+
+                    requests.append(url_trunk)
+
+        await context.add_requests(requests)
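
Usage note (illustration only, not part of the patch): a minimal sketch of how
the Filter class added in src/filter.py is meant to behave, using the default
cache_size but a small max_crawl so the rejection is easy to see; the URLs are
made up for the example.

    from filter import Filter

    f = Filter(max_crawl=2)

    # the first two visits to a domain with no song found are allowed
    assert f.check_domain("https://example.com/a") is True
    assert f.check_domain("https://example.com/b") is True

    # the third visit exceeds max_crawl, so the domain is rejected
    assert f.check_domain("https://example.com/c") is False

    # finding a song resets the domain counter and re-allows crawling
    f.valid_domain("https://example.com/song.mp3")
    assert f.check_domain("https://example.com/d") is True

Once more than cache_size domains are tracked, the oldest entry is evicted, so
memory stays bounded on long crawls.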