From 55446a0c25867230bd63aeae256f52cdd5395da3 Mon Sep 17 00:00:00 2001
From: jourdelune
Date: Tue, 23 Jul 2024 11:27:57 +0200
Subject: [PATCH] [update] clean robots txt

---
 src/robots.py | 29 +++++++++++++++++++++++------
 src/routes.py |  4 +++-
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/robots.py b/src/robots.py
index fe88368..becc4a1 100644
--- a/src/robots.py
+++ b/src/robots.py
@@ -2,6 +2,8 @@
 Class to respect robot.txt file
 """
 
+import asyncio
+import logging
 import urllib.parse
 
 import aiohttp
@@ -15,11 +17,12 @@ def __init__(self):
         self._robots = {}
         self._user_agent = ["*", "GPTBot", "WaveAICrawler"]
 
-    async def __call__(self, url: str) -> bool:
+    async def __call__(self, url: str, log: logging.Logger = None) -> bool:
         """Check if the url is allowed to be crawled
 
         Args:
             url (str): url to be checked
+            log (logging.Logger, optional): logger to log the result. Defaults to None.
 
         Returns:
             bool: True if the url is allowed to be crawled, False otherwise
@@ -29,16 +32,30 @@ async def __call__(self, url: str) -> bool:
         robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"
 
         if robots_url not in self._robots:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(robots_url) as response:
-                    robots_content = await response.text()
-                    self._robots[robots_url] = Protego.parse(robots_content)
+            if log is not None:
+                log.info(f"Fetching robots.txt from {robots_url}")
+
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(robots_url) as response:
+                        robots_content = await response.text()
+                        self._robots[robots_url] = Protego.parse(robots_content)
+            except (
+                aiohttp.ClientError,
+                asyncio.exceptions.CancelledError,
+                asyncio.exceptions.TimeoutError,
+            ):
+                self._robots[robots_url] = Protego.parse("User-agent: *\nDisallow: /")
 
         authorize = []
         for agent in self._user_agent:
             authorize.append(self._robots[robots_url].can_fetch(url, agent))
 
         if len(self._robots) > 1000:
-            self._robots.popitem(last=False)
+            oldest_key = list(self._robots.keys())[0]
+            self._robots.pop(oldest_key)
+
+            if log is not None:
+                log.info(f"Removing robots.txt for {oldest_key}")
 
         return all(authorize)
 
diff --git a/src/routes.py b/src/routes.py
index 11cd890..7914990 100644
--- a/src/routes.py
+++ b/src/routes.py
@@ -51,7 +51,9 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
         if not is_valid_url(url):
             continue
 
-        authorized = await robots_parser(url)  # get if robots.txt allow the crawl
+        authorized = await robots_parser(
+            url, context.log
+        )  # check whether robots.txt allows the crawl
 
         if authorized:
             url_trunk = url.split("?")[0].split("#")[0]
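
Not part of the patch: a minimal usage sketch of the updated checker, assuming src/robots.py exposes a module-level robots_parser instance the way src/routes.py calls it (the class name and export are not visible in these hunks, so that import is an assumption). Only the log parameter, the caching, and the "disallow on fetch failure" fallback shown in the comments come from the diff.

import asyncio
import logging

# Assumed import: routes.py calls robots_parser(url, context.log), so a shared
# instance is presumed to live in src.robots; adjust to the real export.
from src.robots import robots_parser

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("robots-check")


async def main() -> None:
    # The first call for a host fetches and caches its robots.txt
    # (or caches a "User-agent: *\nDisallow: /" fallback if the fetch fails);
    # later calls for the same host are answered from the cache.
    allowed = await robots_parser("https://example.com/some/page", log)
    print("allowed:", allowed)


if __name__ == "__main__":
    asyncio.run(main())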