Commit fa55b8f

[update] add robots.txt support

Jourdelune committed Jul 22, 2024
1 parent 3ae208a

Showing 6 changed files with 94 additions and 11 deletions.
3 changes: 2 additions & 1 deletion .gitignore

@@ -163,4 +163,5 @@ cython_debug/
 
 quotes.json
 
-storage/
+storage/
+results.json
4 changes: 3 additions & 1 deletion requirements.txt

@@ -1 +1,3 @@
-crawlee[beautifulsoup,playwright]
+crawlee[beautifulsoup,playwright]
+aiohttp
+protego
5 changes: 2 additions & 3 deletions src/main.py

@@ -11,9 +11,8 @@ async def main() -> None:
         request_handler=router,
     )
 
-    await crawler.run(
-        ["https://freemusicarchive.org/member/meghan-admin/meet-the-exploding-pea-mix/"]
-    )
+    await crawler.run(["https://freemusicarchive.org/"])
+    await crawler.export_data("results.json")
 
 
 if __name__ == "__main__":
44 changes: 44 additions & 0 deletions src/robots.py

@@ -0,0 +1,44 @@
+"""
+Class to respect robots.txt files.
+"""
+
+import urllib.parse
+
+import aiohttp
+from protego import Protego
+
+
+class RobotTXT:
+    """Class to respect robots.txt files."""
+
+    def __init__(self):
+        self._robots = {}  # parsed robots.txt files, keyed by robots.txt URL
+        self._user_agent = ["*", "GPTBot", "WaveAICrawler"]
+
+    async def __call__(self, url: str) -> bool:
+        """Check if the url is allowed to be crawled.
+
+        Args:
+            url (str): url to be checked
+
+        Returns:
+            bool: True if the url is allowed to be crawled, False otherwise
+        """
+
+        url_parse = urllib.parse.urlparse(url)
+        robots_url = f"{url_parse.scheme}://{url_parse.netloc}/robots.txt"
+
+        if robots_url not in self._robots:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(robots_url) as response:
+                    robots_content = await response.text()
+                    self._robots[robots_url] = Protego.parse(robots_content)
+
+        authorize = []
+        for agent in self._user_agent:
+            authorize.append(self._robots[robots_url].can_fetch(url, agent))
+
+        if len(self._robots) > 100:
+            # cap the cache: evict the oldest entry (dicts keep insertion order)
+            self._robots.pop(next(iter(self._robots)))
+
+        return all(authorize)
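
For context, RobotTXT fetches and caches one parsed robots.txt per host, and reports a URL as crawlable only if every listed user agent may fetch it. A minimal sketch of how the class might be exercised on its own (the URL is illustrative, not from the commit):

import asyncio

from robots import RobotTXT


async def demo() -> None:
    robots_parser = RobotTXT()
    # fetches https://example.com/robots.txt on first call, then reuses the cache
    allowed = await robots_parser("https://example.com/some/page")
    print("allowed" if allowed else "disallowed")


if __name__ == "__main__":
    asyncio.run(demo())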
27 changes: 21 additions & 6 deletions src/routes.py

@@ -5,13 +5,18 @@
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
 from crawlee.playwright_crawler import PlaywrightCrawlingContext
 
-router = Router[PlaywrightCrawlingContext]()
+from robots import RobotTXT
+from utils import is_valid_url
+
+router = Router[PlaywrightCrawlingContext]()
+robots_parser = RobotTXT()
+regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"
 
 
 @router.default_handler
 async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
     context.log.info(f"Processing page: {context.request.url}")
 
+    url = context.request.url
     html_page = str(context.soup).replace("\/", "/")
 
@@ -22,11 +22,27 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
     for link in audio_links:
         link = urllib.parse.urljoin(url, link)
 
-        data = {
-            "url": link,
-            "label": "audio",
-        }
+        data = {"url": link, "src": url}
 
         context.log.info(f"Found audio link: {link}")
         await context.push_data(data)
 
-    await context.enqueue_links(strategy="all")
+    requests = []
+    for link in context.soup.select("a"):
+        if link.attrs.get("href") is not None:
+            url = urllib.parse.urljoin(context.request.url, link.attrs.get("href"))
+
+            if not is_valid_url(url):
+                continue
+
+            authorized = await robots_parser(url)
+            if authorized:
+                requests.append(url)
+
+    await context.add_requests(requests)
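
The audio_links extraction (collapsed in the diff above) relies on the module-level regex, which matches absolute URLs ending in .mp3, .wav, or .ogg. A quick illustrative check of that pattern, not taken from the commit; because the pattern contains capturing groups, re.finditer with group(0) is used here to recover the full matches:

import re

regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"

html_page = '<a href="https://example.com/tracks/song.mp3">song</a>'
audio_links = [m.group(0) for m in re.finditer(regex, html_page)]
print(audio_links)  # ['https://example.com/tracks/song.mp3']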
22 changes: 22 additions & 0 deletions src/utils.py

@@ -0,0 +1,22 @@
+"""
+Utility functions.
+"""
+
+import urllib.parse
+
+
+def is_valid_url(url: str) -> bool:
+    """Check if a url is valid.
+
+    Args:
+        url (str): the url to check
+
+    Returns:
+        bool: True if the url is valid, False otherwise
+    """
+
+    try:
+        result = urllib.parse.urlparse(url)
+        # a usable absolute URL needs at least a scheme and a network location
+        return all([result.scheme, result.netloc])
+    except (AttributeError, ValueError):  # urlparse raises ValueError on malformed input
+        return False
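
A quick illustration of how this filter behaves (example inputs chosen here, not from the commit):

from utils import is_valid_url

print(is_valid_url("https://freemusicarchive.org/"))  # True
print(is_valid_url("mailto:user@example.com"))        # False: no netloc
print(is_valid_url("/relative/path"))                 # False: no scheme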
