Commit fa55b8f

[update] add robots.txt support

Jourdelune committed Jul 22, 2024
1 parent 3ae208a

Showing 6 changed files with 94 additions and 11 deletions.
3 changes: 2 additions & 1 deletion .gitignore

@@ -163,4 +163,5 @@ cython_debug/
 
 quotes.json
 
-storage/
+storage/
+results.json
4 changes: 3 additions & 1 deletion requirements.txt

@@ -1 +1,3 @@
-crawlee[beautifulsoup,playwright]
+crawlee[beautifulsoup,playwright]
+aiohttp
+protego
5 changes: 2 additions & 3 deletions src/main.py

@@ -11,9 +11,8 @@ async def main() -> None:
         request_handler=router,
     )
 
-    await crawler.run(
-        ["https://freemusicarchive.org/member/meghan-admin/meet-the-exploding-pea-mix/"]
-    )
+    await crawler.run(["https://freemusicarchive.org/"])
+    await crawler.export_data("results.json")
 
 
 if __name__ == "__main__":
44 changes: 44 additions & 0 deletions src/robots.py

@@ -0,0 +1,44 @@
+"""
+Class to respect robots.txt files.
+"""
+
+import urllib.parse
+
+import aiohttp
+from protego import Protego
+
+
+class RobotTXT:
+    """Class to respect robots.txt files."""
+
+    def __init__(self):
+        self._robots = {}  # parsed robots.txt files, keyed by robots.txt URL
+        self._user_agent = ["*", "GPTBot", "WaveAICrawler"]
+
+    async def __call__(self, url: str) -> bool:
+        """Check if the url is allowed to be crawled.
+
+        Args:
+            url (str): url to be checked
+
+        Returns:
+            bool: True if the url is allowed to be crawled, False otherwise
+        """
+
+        url_parse = urllib.parse.urlparse(url)
+        robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"
+
+        if robots_url not in self._robots:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(robots_url) as response:
+                    robots_content = await response.text()
+                    self._robots[robots_url] = Protego.parse(robots_content)
+
+        authorize = []
+        for agent in self._user_agent:
+            authorize.append(self._robots[robots_url].can_fetch(url, agent))
+
+        if len(self._robots) > 100:
+            # cap the cache: evict the oldest entry (dicts keep insertion order)
+            self._robots.pop(next(iter(self._robots)))
+
+        return all(authorize)
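
For context, RobotTXT fetches and caches one parsed robots.txt per host, and reports a URL as crawlable only if every listed user agent may fetch it. A minimal sketch of how the class might be exercised on its own (the URL is illustrative, not from the commit):

import asyncio

from robots import RobotTXT


async def demo() -> None:
    robots_parser = RobotTXT()
    # fetches https://example.com/robots.txt on first call, then reuses the cache
    allowed = await robots_parser("https://example.com/some/page")
    print("allowed" if allowed else "disallowed")


if __name__ == "__main__":
    asyncio.run(demo())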
27 changes: 21 additions & 6 deletions src/routes.py

@@ -5,13 +5,18 @@
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
 from crawlee.playwright_crawler import PlaywrightCrawlingContext
 
-router = Router[PlaywrightCrawlingContext]()
+from robots import RobotTXT
+from utils import is_valid_url
+
+router = Router[PlaywrightCrawlingContext]()
+robots_parser = RobotTXT()
+regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"
 
 
 @router.default_handler
 async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
     context.log.info(f"Processing page: {context.request.url}")
 
+    url = context.request.url
     html_page = str(context.soup).replace("\/", "/")
 
@@ -22,11 +22,27 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
     for link in audio_links:
         link = urllib.parse.urljoin(url, link)
 
-        data = {
-            "url": link,
-            "label": "audio",
-        }
+        data = {"url": link, "src": url}
 
         context.log.info(f"Found audio link: {link}")
         await context.push_data(data)
 
-    await context.enqueue_links(strategy="all")
+    requests = []
+    for link in context.soup.select("a"):
+        if link.attrs.get("href") is not None:
+            url = urllib.parse.urljoin(context.request.url, link.attrs.get("href"))
+
+            if not is_valid_url(url):
+                continue
+
+            authorized = await robots_parser(url)
+            if authorized:
+                requests.append(url)
+
+    await context.add_requests(requests)
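
The audio_links extraction (collapsed in the diff above) relies on the module-level regex, which matches absolute URLs ending in .mp3, .wav, or .ogg. A quick illustrative check of that pattern, not taken from the commit; because the pattern contains capturing groups, re.finditer with group(0) is used here to recover the full matches:

import re

regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"

html_page = '<a href="https://example.com/tracks/song.mp3">song</a>'
audio_links = [m.group(0) for m in re.finditer(regex, html_page)]
print(audio_links)  # ['https://example.com/tracks/song.mp3']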
22 changes: 22 additions & 0 deletions src/utils.py

@@ -0,0 +1,22 @@
+"""
+Utility functions.
+"""
+
+import urllib.parse
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a url is valid.
+
+    Args:
+        url (str): the url to check
+
+    Returns:
+        bool: True if the url is valid, False otherwise
+    """
+
+    try:
+        result = urllib.parse.urlparse(url)
+        # a usable absolute URL needs at least a scheme and a network location
+        return all([result.scheme, result.netloc])
+    except (AttributeError, ValueError):  # urlparse raises ValueError on malformed input
+        return False
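
A quick illustration of how this filter behaves (example inputs chosen here, not from the commit):

from utils import is_valid_url

print(is_valid_url("https://freemusicarchive.org/"))  # True
print(is_valid_url("mailto:user@example.com"))        # False: no netloc
print(is_valid_url("/relative/path"))                 # False: no scheme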
