Commit: [update] add doc
Jourdelune committed Jul 22, 2024
1 parent fa55b8f · commit eef9bdf
Showing 3 changed files with 26 additions and 5 deletions.
3 changes: 3 additions & 0 deletions src/__init__.py
@@ -0,0 +1,3 @@
"""
Init package
"""
9 changes: 8 additions & 1 deletion src/main.py
@@ -1,12 +1,19 @@
"""
main script for the crawler
"""

import asyncio

- from crawlee.playwright_crawler import PlaywrightCrawler
+ from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

from routes import router


async def main() -> None:
    """
    Function to launch the crawler
    """

    crawler = BeautifulSoupCrawler(
        request_handler=router,
    )
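The remainder of main.py is collapsed in the diff. For orientation, here is a minimal sketch of how an entry point of this shape is typically completed with crawlee; the seed URL and the module-level launch are assumptions for illustration, not part of the commit:

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

from routes import router


async def main() -> None:
    # build the crawler and hand every fetched page to the router
    crawler = BeautifulSoupCrawler(
        request_handler=router,
    )
    # placeholder seed URL, not taken from the commit
    await crawler.run(["https://example.com"])


if __name__ == "__main__":
    asyncio.run(main())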
19 changes: 15 additions & 4 deletions src/routes.py
@@ -1,27 +1,37 @@
"""
Route for the crawler
"""

import re
import urllib.parse

from crawlee.basic_crawler import Router
+ from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
- from crawlee.playwright_crawler import PlaywrightCrawlingContext

from robots import RobotTXT
from utils import is_valid_url

- router = Router[PlaywrightCrawlingContext]()
+ router = Router[BeautifulSoupCrawlingContext]()
robots_parser = RobotTXT()
regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
    """Default handler where the result of each page is submitted.

    Args:
        context (BeautifulSoupCrawlingContext): the context of the crawler
    """

    context.log.info(f"Processing page: {context.request.url}")

    url = context.request.url
    html_page = str(context.soup).replace("\/", "/")

    matches = re.finditer(regex, html_page)

    # get all audio links
    audio_links = [html_page[match.start() : match.end()] for match in matches]

    for link in audio_links:
@@ -30,8 +40,9 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
        data = {"url": link, "src": url}

        context.log.info(f"Found audio link: {link}")
-       await context.push_data(data)
+       await context.push_data(data)  # save the links

    # get all links on the page
    requests = []
    for link in context.soup.select("a"):
        if link.attrs.get("href") is not None:
@@ -40,7 +51,7 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
            if not is_valid_url(url):
                continue

-           authorized = await robots_parser(url)
+           authorized = await robots_parser(url)  # check whether robots.txt allows the crawl
            if authorized:
                requests.append(url)

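As a standalone illustration of the audio-link pattern defined at the top of routes.py, the snippet below runs the same re.finditer extraction against a small HTML string; the sample markup is invented for the demo:

import re

# same pattern as routes.py: absolute URLs ending in .mp3, .wav or .ogg
regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"

html_page = '<a href="https://example.com/tracks/song.mp3">song</a>'
matches = re.finditer(regex, html_page)
audio_links = [html_page[m.start() : m.end()] for m in matches]
print(audio_links)  # ['https://example.com/tracks/song.mp3']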

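The RobotTXT helper awaited in the handler lives in src/robots.py, which this commit does not touch, so its implementation is not shown. A minimal sketch of what such an awaitable robots.txt gate might look like, assuming a urllib.robotparser-based check with a per-host cache (everything beyond the RobotTXT name and its call signature is a guess):

import asyncio
import urllib.parse
import urllib.robotparser


class RobotTXT:
    """Hypothetical sketch: answers 'may this URL be crawled?' per robots.txt."""

    def __init__(self) -> None:
        self._parsers: dict[str, urllib.robotparser.RobotFileParser] = {}

    async def __call__(self, url: str) -> bool:
        host = urllib.parse.urlsplit(url).netloc
        if host not in self._parsers:
            parser = urllib.robotparser.RobotFileParser(f"https://{host}/robots.txt")
            # RobotFileParser.read() is blocking; run it off the event loop
            await asyncio.to_thread(parser.read)
            self._parsers[host] = parser
        return self._parsers[host].can_fetch("*", url)

With this shape, authorized = await robots_parser(url) in the handler reads as a straightforward async permission check.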