[update] improve robots.txt agents rule
Jourdelune committed Jul 23, 2024
1 parent 2b6f237 commit bacc6ca
Showing 1 changed file with 10 additions and 4 deletions.
src/robots.py (14 changes: 10 additions & 4 deletions)
@@ -15,7 +15,7 @@ class RobotTXT:
 
     def __init__(self):
         self._robots = {}
-        self._user_agent = ["*", "GPTBot", "WaveAICrawler"]
+        self._user_agent = ["gptbot", "waveaicrawler"]  # lower case
 
     async def __call__(self, url: str, log: logging.Logger = None) -> bool:
         """Check if the url is allowed to be crawled
@@ -46,9 +46,15 @@ async def __call__(self, url: str, log: logging.Logger = None) -> bool:
             if log is not None:
                 log.error(f"Error fetching robots.txt from {robots_url}: {e}")
 
-        authorize = []
+        authorize = self._robots[robots_url].can_fetch(url, "*")
         for agent in self._user_agent:
-            authorize.append(self._robots[robots_url].can_fetch(url, agent))
+            agents_on_site = [
+                agent_on_site
+                for agent_on_site in self._robots[robots_url]._user_agents.keys()
+            ]
+
+            if agent in agents_on_site:
+                authorize = self._robots[robots_url].can_fetch(url, agent)
 
         if len(self._robots) > 1000:
             older_keys = list(self._robots.keys())[-1]
@@ -57,4 +63,4 @@ async def __call__(self, url: str, log: logging.Logger = None) -> bool:
             if log is not None:
                 log.info(f"Removing robots.txt for {robots_url}")
 
-        return all(authorize)
+        return authorize
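For context, here is a minimal usage sketch of the checker after this change. It assumes RobotTXT is importable from src/robots.py as shown in the diff; the logger name and example URL are illustrative only, not part of the commit:

import asyncio
import logging

from src.robots import RobotTXT


async def main() -> None:
    log = logging.getLogger("crawler")  # hypothetical logger name
    robots = RobotTXT()

    # With this commit, the "*" verdict is the default; a rule for a
    # specifically listed agent ("gptbot" or "waveaicrawler") takes over
    # only when robots.txt actually names that agent.
    allowed = await robots("https://example.com/page", log)  # example URL
    print("allowed" if allowed else "blocked")


if __name__ == "__main__":
    asyncio.run(main())

Note the behavioral shift: the old return all(authorize) required every configured agent to be permitted, while the new return authorize reflects the "*" rule unless robots.txt explicitly names one of the crawler's agents, in which case that agent's rule takes precedence (the last named agent in the list wins).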
