Commit

[update] clean robots txt
Jourdelune committed Jul 23, 2024
1 parent 6e47cb5 commit 55446a0
Showing 2 changed files with 26 additions and 7 deletions.
src/robots.py (23 additions, 6 deletions)

@@ -2,6 +2,8 @@
 Class to respect robot.txt file
 """

+import asyncio
+import logging
 import urllib.parse

 import aiohttp
@@ -15,11 +17,12 @@ def __init__(self):
         self._robots = {}
         self._user_agent = ["*", "GPTBot", "WaveAICrawler"]

-    async def __call__(self, url: str) -> bool:
+    async def __call__(self, url: str, log: logging.Logger = None) -> bool:
         """Check if the url is allowed to be crawled

         Args:
             url (str): url to be checked
+            log (logging.Logger, optional): logger to log the result. Defaults to None.

         Returns:
             bool: True if the url is allowed to be crawled, False otherwise
@@ -29,16 +32,30 @@ async def __call__(self, url: str) -> bool:
         robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"

         if robots_url not in self._robots:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(robots_url) as response:
-                    robots_content = await response.text()
-                    self._robots[robots_url] = Protego.parse(robots_content)
+            if log is not None:
+                log.info(f"Fetching robots.txt from {robots_url}")
+
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(robots_url) as response:
+                        robots_content = await response.text()
+                        self._robots[robots_url] = Protego.parse(robots_content)
+            except (
+                aiohttp.ClientError,
+                asyncio.exceptions.CancelledError,
+                asyncio.exceptions.TimeoutError,
+            ):
+                self._robots[robots_url] = Protego.parse("User-agent: *\nDisallow: /")

         authorize = []
         for agent in self._user_agent:
             authorize.append(self._robots[robots_url].can_fetch(url, agent))

         if len(self._robots) > 1000:
-            self._robots.popitem(last=False)
+            older_keys = list(self._robots.keys())[-1]
+            self._robots.pop(older_keys)
+
+            if log is not None:
+                log.info(f"Removing robots.txt for {robots_url}")

         return all(authorize)
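For context, a minimal standalone sketch (not part of the commit) of why the new except branch fails closed: parsing a blanket "User-agent: *" / "Disallow: /" policy with Protego makes can_fetch return False for every agent, so a failed robots.txt fetch denies the crawl instead of allowing it.

# Standalone sketch, not part of the commit: demonstrates the fail-closed
# fallback used in the except branch above.
from protego import Protego

fallback = Protego.parse("User-agent: *\nDisallow: /")

for agent in ["*", "GPTBot", "WaveAICrawler"]:
    # can_fetch(url, user_agent) is False for every path under a blanket
    # Disallow rule, so errors while fetching robots.txt deny crawling.
    print(agent, fallback.can_fetch("https://example.com/page", agent))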
src/routes.py (3 additions, 1 deletion)

@@ -51,7 +51,9 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
         if not is_valid_url(url):
             continue

-        authorized = await robots_parser(url)  # get if robots.txt allow the crawl
+        authorized = await robots_parser(
+            url, context.log
+        )  # get if robots.txt allow the crawl
         if authorized:
             url_trunk = url.split("?")[0].split("#")[0]
