diff --git a/.gitignore b/.gitignore
index a4599c7..2cbf913 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,5 @@ cython_debug/
 
 
 quotes.json
-storage/
\ No newline at end of file
+storage/
+results.json
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 8fd2800..08fb174 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
-crawlee[beautifulsoup,playwright]
\ No newline at end of file
+crawlee[beautifulsoup,playwright]
+aiohttp
+protego
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
index e54bd20..603c861 100644
--- a/src/main.py
+++ b/src/main.py
@@ -11,9 +11,8 @@ async def main() -> None:
         request_handler=router,
     )
 
-    await crawler.run(
-        ["https://freemusicarchive.org/member/meghan-admin/meet-the-exploding-pea-mix/"]
-    )
+    await crawler.run(["https://freemusicarchive.org/"])
+    await crawler.export_data("results.json")
 
 
 if __name__ == "__main__":
diff --git a/src/robots.py b/src/robots.py
new file mode 100644
index 0000000..1374bc3
--- /dev/null
+++ b/src/robots.py
@@ -0,0 +1,44 @@
+"""
+Class to respect robots.txt files
+"""
+
+import urllib.parse
+
+import aiohttp
+from protego import Protego
+
+
+class RobotTXT:
+    """Class to respect robots.txt files"""
+
+    def __init__(self):
+        self._robots = {}
+        self._user_agent = ["*", "GPTBot", "WaveAICrawler"]
+
+    async def __call__(self, url: str) -> bool:
+        """Check if the url is allowed to be crawled
+
+        Args:
+            url (str): url to be checked
+
+        Returns:
+            bool: True if the url is allowed to be crawled, False otherwise
+        """
+
+        url_parse = urllib.parse.urlparse(url)
+        robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"
+
+        if robots_url not in self._robots:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(robots_url) as response:
+                    robots_content = await response.text()
+                    self._robots[robots_url] = Protego.parse(robots_content)
+
+        authorize = []
+        for agent in self._user_agent:
+            authorize.append(self._robots[robots_url].can_fetch(url, agent))
+
+        if len(self._robots) > 100:
+            del self._robots[next(iter(self._robots))]  # evict oldest entry; plain dict has no popitem(last=False)
+
+        return all(authorize)
diff --git a/src/routes.py b/src/routes.py
index 9bcb836..cdd1b36 100644
--- a/src/routes.py
+++ b/src/routes.py
@@ -5,13 +5,18 @@
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
 from crawlee.playwright_crawler import PlaywrightCrawlingContext
 
-router = Router[PlaywrightCrawlingContext]()
+from robots import RobotTXT
+from utils import is_valid_url
 
+router = Router[PlaywrightCrawlingContext]()
+robots_parser = RobotTXT()
 regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"
 
 
 @router.default_handler
 async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
+    context.log.info(f"Processing page: {context.request.url}")
+
     url = context.request.url
     html_page = str(context.soup).replace("\/", "/")
 
@@ -22,11 +27,21 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
     for link in audio_links:
         link = urllib.parse.urljoin(url, link)
 
-        data = {
-            "url": link,
-            "label": "audio",
-        }
+        data = {"url": link, "src": url}
 
+        context.log.info(f"Found audio link: {link}")
         await context.push_data(data)
 
-    await context.enqueue_links(strategy="all")
+    requests = []
+    for link in context.soup.select("a"):
+        if link.attrs.get("href") is not None:
+            url = urllib.parse.urljoin(context.request.url, link.attrs.get("href"))
+
+            if not is_valid_url(url):
+                continue
+
+            authorized = await robots_parser(url)
+            if authorized:
+                requests.append(url)
+
+    await context.add_requests(requests)
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..f71918f
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,22 @@
+"""
+Utility functions
+"""
+
+import urllib.parse
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a url is valid
+
+    Args:
+        url (str): the url to check
+
+    Returns:
+        bool: boolean that indicates if the url is valid (true) or not
+    """
+
+    try:
+        result = urllib.parse.urlparse(url)
+        return all([result.scheme, result.netloc])
+    except AttributeError:
+        return False
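
For review convenience, a minimal sketch of how the new `RobotTXT` helper behaves on its own (not part of the patch; `example.com` is only a placeholder host):

```python
import asyncio

from robots import RobotTXT


async def main() -> None:
    robots = RobotTXT()

    # The first call fetches and caches https://example.com/robots.txt;
    # later checks against the same host reuse the cached Protego parser.
    print(await robots("https://example.com/"))

    # A URL is allowed only if every configured user agent
    # ("*", "GPTBot", "WaveAICrawler") may fetch it.
    print(await robots("https://example.com/some/page"))


if __name__ == "__main__":
    asyncio.run(main())
```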