Commit

[update] clean robots txt
Jourdelune committed Jul 23, 2024
1 parent 6e47cb5 commit 55446a0
Showing 2 changed files with 26 additions and 7 deletions.
src/robots.py (23 additions, 6 deletions)

@@ -2,6 +2,8 @@
 Class to respect robot.txt file
 """

+import asyncio
+import logging
 import urllib.parse

 import aiohttp
@@ -15,11 +17,12 @@ def __init__(self):
         self._robots = {}
         self._user_agent = ["*", "GPTBot", "WaveAICrawler"]

-    async def __call__(self, url: str) -> bool:
+    async def __call__(self, url: str, log: logging.Logger = None) -> bool:
         """Check if the url is allowed to be crawled

         Args:
             url (str): url to be checked
+            log (logging.Logger, optional): logger to log the result. Defaults to None.

         Returns:
             bool: True if the url is allowed to be crawled, False otherwise
@@ -29,16 +32,30 @@ async def __call__(self, url: str) -> bool:
         robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"

         if robots_url not in self._robots:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(robots_url) as response:
-                    robots_content = await response.text()
-                    self._robots[robots_url] = Protego.parse(robots_content)
+            if log is not None:
+                log.info(f"Fetching robots.txt from {robots_url}")
+
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(robots_url) as response:
+                        robots_content = await response.text()
+                        self._robots[robots_url] = Protego.parse(robots_content)
+            except (
+                aiohttp.ClientError,
+                asyncio.exceptions.CancelledError,
+                asyncio.exceptions.TimeoutError,
+            ):
+                self._robots[robots_url] = Protego.parse("User-agent: *\nDisallow: /")

         authorize = []
         for agent in self._user_agent:
             authorize.append(self._robots[robots_url].can_fetch(url, agent))

         if len(self._robots) > 1000:
-            self._robots.popitem(last=False)
+            older_keys = list(self._robots.keys())[-1]
+            self._robots.pop(older_keys)
+
+            if log is not None:
+                log.info(f"Removing robots.txt for {robots_url}")

         return all(authorize)
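For context, a minimal standalone sketch (not part of the commit) of why the new except branch fails closed: parsing a blanket "User-agent: *" / "Disallow: /" policy with Protego makes can_fetch return False for every agent, so a failed robots.txt fetch denies the crawl instead of allowing it.

# Standalone sketch, not part of the commit: demonstrates the fail-closed
# fallback used in the except branch above.
from protego import Protego

fallback = Protego.parse("User-agent: *\nDisallow: /")

for agent in ["*", "GPTBot", "WaveAICrawler"]:
    # can_fetch(url, user_agent) is False for every path under a blanket
    # Disallow rule, so errors while fetching robots.txt deny crawling.
    print(agent, fallback.can_fetch("https://example.com/page", agent))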
src/routes.py (3 additions, 1 deletion)

@@ -51,7 +51,9 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
         if not is_valid_url(url):
             continue

-        authorized = await robots_parser(url)  # get if robots.txt allow the crawl
+        authorized = await robots_parser(
+            url, context.log
+        )  # get if robots.txt allow the crawl
         if authorized:
             url_trunk = url.split("?")[0].split("#")[0]
