diff --git a/src/main.py b/src/main.py
index 45eef8b..de004d8 100644
--- a/src/main.py
+++ b/src/main.py
@@ -7,6 +7,7 @@
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
 
 from routes import router
+from utils import process
 
 
 async def main() -> None:
@@ -18,8 +19,9 @@ async def main() -> None:
         request_handler=router,
     )
 
-    await crawler.run(["https://freemusicarchive.org/"])
+    await crawler.run(["https://www.ashamaluevmusic.com"])
     await crawler.export_data("results.json")
+    process("results.json")
 
 
 if __name__ == "__main__":
diff --git a/src/robots.py b/src/robots.py
index 1374bc3..fe88368 100644
--- a/src/robots.py
+++ b/src/robots.py
@@ -38,7 +38,7 @@ async def __call__(self, url: str) -> bool:
         for agent in self._user_agent:
             authorize.append(self._robots[robots_url].can_fetch(url, agent))
 
-        if len(self._robots) > 100:
+        if len(self._robots) > 1000:
             self._robots.popitem(last=False)
 
         return all(authorize)
diff --git a/src/utils.py b/src/utils.py
index f71918f..8c33d53 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -2,9 +2,35 @@
 Utils function
 """
 
+import json
 import urllib.parse
 
 
+def process(file_name: str) -> None:
+    """Process the generated JSON file, keeping only the first entry per URL.
+
+    Args:
+        file_name (str): the file name to process
+    """
+    with open(file_name, encoding="utf-8") as infile:
+        data = json.load(infile)
+
+    unique_urls = set()
+    unique_data = []
+
+    for item in data:
+        url = item["url"]
+        if url not in unique_urls:
+            unique_urls.add(url)
+            unique_data.append(item)
+
+    audio_files_count = len(unique_data)
+    print(f"Number of unique tracks: {audio_files_count}")
+
+    with open(file_name, "w", encoding="utf-8") as outfile:
+        json.dump(unique_data, outfile, indent=4)
+
+
 def is_valid_url(url: str) -> bool:
     """Check if url is valid
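Reviewer note on the robots.py hunk: `popitem(last=False)` implies `self._robots` is an insertion-ordered dict (e.g. `collections.OrderedDict`) used as a bounded cache of parsed robots.txt files, and this change raises the cap from 100 to 1000 entries. A minimal sketch of that eviction pattern, with hypothetical names (`cache`, `remember`, `MAX_ENTRIES`) rather than the project's actual attributes:

```python
from collections import OrderedDict

# Illustrative sketch of the bounded-cache pattern the diff suggests;
# names here are hypothetical, not the project's own.
MAX_ENTRIES = 1000  # the new cap introduced by this diff

cache: OrderedDict = OrderedDict()

def remember(key: str, value: object) -> None:
    """Store a parsed robots.txt entry, evicting the oldest past the cap."""
    cache[key] = value
    if len(cache) > MAX_ENTRIES:
        # popitem(last=False) removes the first-inserted entry (FIFO eviction),
        # so the cache never grows past MAX_ENTRIES + insertion cost stays O(1).
        cache.popitem(last=False)
```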
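Reviewer note on the utils.py hunk: the new `process` step deduplicates the crawler's export by URL and rewrites the file in place. A small usage sketch, assuming each exported record carries a `url` key (which `process` requires); the fixture below is hypothetical and stands in for a real crawl export:

```python
import json

from utils import process  # the function added in this diff

# Hypothetical fixture standing in for a real crawler export;
# process() only requires that every record has a "url" key.
records = [
    {"url": "https://example.com/a.mp3", "title": "A"},
    {"url": "https://example.com/a.mp3", "title": "A (duplicate)"},
    {"url": "https://example.com/b.mp3", "title": "B"},
]
with open("results.json", "w", encoding="utf-8") as f:
    json.dump(records, f)

process("results.json")
# Prints "Number of unique tracks: 2" and rewrites results.json with
# only the first record seen for each URL.
```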