Commit: [update] add doc
Jourdelune committed Jul 22, 2024
1 parent fa55b8f · commit eef9bdf
Showing 3 changed files with 26 additions and 5 deletions.
3 changes: 3 additions & 0 deletions src/__init__.py
@@ -0,0 +1,3 @@
"""
Init package
"""
9 changes: 8 additions & 1 deletion src/main.py
@@ -1,12 +1,19 @@
"""
main script for the crawler
"""

import asyncio

- from crawlee.playwright_crawler import PlaywrightCrawler
+ from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

from routes import router


async def main() -> None:
    """
    Function to launch the crawler
    """

    crawler = BeautifulSoupCrawler(
        request_handler=router,
    )
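The remainder of main.py is collapsed in the diff. For orientation, here is a minimal sketch of how an entry point of this shape is typically completed with crawlee; the seed URL and the module-level launch are assumptions for illustration, not part of the commit:

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

from routes import router


async def main() -> None:
    # build the crawler and hand every fetched page to the router
    crawler = BeautifulSoupCrawler(
        request_handler=router,
    )
    # placeholder seed URL, not taken from the commit
    await crawler.run(["https://example.com"])


if __name__ == "__main__":
    asyncio.run(main())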
19 changes: 15 additions & 4 deletions src/routes.py
@@ -1,27 +1,37 @@
"""
Route for the crawler
"""

import re
import urllib.parse

from crawlee.basic_crawler import Router
+ from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
- from crawlee.playwright_crawler import PlaywrightCrawlingContext

from robots import RobotTXT
from utils import is_valid_url

- router = Router[PlaywrightCrawlingContext]()
+ router = Router[BeautifulSoupCrawlingContext]()
robots_parser = RobotTXT()
regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
    """Default handler where the result of each page is submitted.

    Args:
        context (BeautifulSoupCrawlingContext): the context of the crawler
    """

    context.log.info(f"Processing page: {context.request.url}")

    url = context.request.url
    html_page = str(context.soup).replace("\/", "/")

    matches = re.finditer(regex, html_page)

    # get all audio links
    audio_links = [html_page[match.start() : match.end()] for match in matches]

    for link in audio_links:
@@ -30,8 +40,9 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
        data = {"url": link, "src": url}

        context.log.info(f"Found audio link: {link}")
-       await context.push_data(data)
+       await context.push_data(data)  # save the links

    # get all links on the page
    requests = []
    for link in context.soup.select("a"):
        if link.attrs.get("href") is not None:
@@ -40,7 +51,7 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
            if not is_valid_url(url):
                continue

-           authorized = await robots_parser(url)
+           authorized = await robots_parser(url)  # check whether robots.txt allows the crawl
            if authorized:
                requests.append(url)

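As a standalone illustration of the audio-link pattern defined at the top of routes.py, the snippet below runs the same re.finditer extraction against a small HTML string; the sample markup is invented for the demo:

import re

# same pattern as routes.py: absolute URLs ending in .mp3, .wav or .ogg
regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"

html_page = '<a href="https://example.com/tracks/song.mp3">song</a>'
matches = re.finditer(regex, html_page)
audio_links = [html_page[m.start() : m.end()] for m in matches]
print(audio_links)  # ['https://example.com/tracks/song.mp3']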

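The RobotTXT helper awaited in the handler lives in src/robots.py, which this commit does not touch, so its implementation is not shown. A minimal sketch of what such an awaitable robots.txt gate might look like, assuming a urllib.robotparser-based check with a per-host cache (everything beyond the RobotTXT name and its call signature is a guess):

import asyncio
import urllib.parse
import urllib.robotparser


class RobotTXT:
    """Hypothetical sketch: answers 'may this URL be crawled?' per robots.txt."""

    def __init__(self) -> None:
        self._parsers: dict[str, urllib.robotparser.RobotFileParser] = {}

    async def __call__(self, url: str) -> bool:
        host = urllib.parse.urlsplit(url).netloc
        if host not in self._parsers:
            parser = urllib.robotparser.RobotFileParser(f"https://{host}/robots.txt")
            # RobotFileParser.read() is blocking; run it off the event loop
            await asyncio.to_thread(parser.read)
            self._parsers[host] = parser
        return self._parsers[host].can_fetch("*", url)

With this shape, authorized = await robots_parser(url) in the handler reads as a straightforward async permission check.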