diff --git a/src/robots.py b/src/robots.py index a566e4e..05d4034 100644 --- a/src/robots.py +++ b/src/robots.py @@ -2,7 +2,6 @@ Class to respect robot.txt file """ -import asyncio import logging import urllib.parse @@ -48,10 +47,7 @@ async def __call__(self, url: str, log: logging.Logger = None) -> bool: authorize = authorize = self._robots[robots_url].can_fetch(url, "*") for agent in self._user_agent: - agents_on_site = [ - agent_on_site - for agent_on_site in self._robots[robots_url]._user_agents.keys() - ] + agents_on_site = list(self._robots[robots_url]._user_agents.keys()) if agent in agents_on_site: authorize = self._robots[robots_url].can_fetch(url, agent) diff --git a/src/routes.py b/src/routes.py index 7914990..5895f5b 100644 --- a/src/routes.py +++ b/src/routes.py @@ -13,7 +13,7 @@ router = Router[BeautifulSoupCrawlingContext]() robots_parser = RobotTXT() -regex = r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)" +REGEX = r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)" @router.default_handler @@ -29,7 +29,7 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None: url = context.request.url html_page = str(context.soup).replace(r"\/", "/") - matches = re.finditer(regex, html_page) + matches = re.finditer(REGEX, html_page) # get all audios links audio_links = [html_page[match.start() : match.end()] for match in matches] diff --git a/src/utils.py b/src/utils.py index 8c33d53..4119b7b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -12,7 +12,9 @@ def process(file_name: str) -> None: Args: file_name (str): the file name to process """ - data = json.load(open(file_name, encoding="utf-8")) + + with open(file_name, encoding="utf-8") as file: + data = json.load(file) unique_urls = set() unique_data = []