From 9cb73f21167e10d166a899696132403fb1f5d2a3 Mon Sep 17 00:00:00 2001
From: jourdelune
Date: Tue, 23 Jul 2024 20:19:10 +0200
Subject: [PATCH] [update] Add domain filtering to avoid crawling the same
 domain multiple times

---
 src/filter.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/routes.py | 51 ++++++++++++++++++++++++++-----------------
 2 files changed, 86 insertions(+), 21 deletions(-)
 create mode 100644 src/filter.py

diff --git a/src/filter.py b/src/filter.py
new file mode 100644
index 0000000..e57828c
--- /dev/null
+++ b/src/filter.py
@@ -0,0 +1,56 @@
+"""
+Filter class
+"""
+
+import urllib.parse
+
+
+class Filter:
+    """
+    Class to filter domains and avoid re-crawling a domain that does not contain any song
+    """
+
+    def __init__(self, max_crawl: int = 20, cache_size: int = 500) -> None:
+        """Initialize the domain filter
+
+        Args:
+            max_crawl (int, optional): the maximum number of crawls of a domain
+                before its URLs are rejected if no song has been added. Defaults to 20.
+            cache_size (int, optional): the maximum number of domains to track. Defaults to 500.
+        """
+        self._domains = {}
+        self._cache_size = cache_size
+        self._max_crawl = max_crawl
+
+    def check_domain(self, url: str) -> bool:
+        """Check that the domain has not been visited more than max_crawl times without any song being added
+
+        Args:
+            url (str): the url to check
+
+        Returns:
+            bool: True if the domain has not been visited more than max_crawl times, False otherwise
+        """
+
+        domain = urllib.parse.urlparse(url).netloc
+
+        if len(self._domains) > self._cache_size:
+            oldest_domain = next(iter(self._domains))  # evict the oldest entry (dicts keep insertion order)
+            del self._domains[oldest_domain]
+
+        if domain not in self._domains:
+            self._domains[domain] = 1
+        else:
+            self._domains[domain] += 1
+
+        return self._domains[domain] <= self._max_crawl
+
+    def valid_domain(self, url: str) -> None:
+        """Reset the counter of the domain (called when a song is found)
+
+        Args:
+            url (str): the url whose domain counter is reset
+        """
+
+        domain = urllib.parse.urlparse(url).netloc
+        self._domains[domain] = 0
diff --git a/src/routes.py b/src/routes.py
index 5b0c0cf..ed2ec67 100644
--- a/src/routes.py
+++ b/src/routes.py
@@ -8,11 +8,13 @@
 from crawlee.basic_crawler import Router
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
 
+from filter import Filter
 from robots import RobotTXT
 from utils import is_valid_url
 
 router = Router[BeautifulSoupCrawlingContext]()
 robots_parser = RobotTXT()
+filter_domain = Filter()
 
 REGEX = r"(https?:)?(\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)\.(mp3|wav|ogg)"
 
@@ -44,24 +46,31 @@ async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
             context.log.info(f"Found audio link: {link}")
 
             await context.push_data(data)  # save the links
-
-    # get all links in the page
-    requests = []
-    for link in context.soup.select("a"):
-        if link.attrs.get("href") is not None:
-            url = urllib.parse.urljoin(
-                context.request.url, link.attrs.get("href")
-            ).strip()
-
-            if not is_valid_url(url):
-                continue
-
-            authorized = await robots_parser(
-                url, context.log
-            )  # get if robots.txt allow the crawl
-            if authorized:
-                url_trunk = url.split("?")[0].split("#")[0]
-
-                requests.append(url_trunk)
-
-    await context.add_requests(requests)
+            filter_domain.valid_domain(context.request.url)  # a song was found: reset this domain's counter
+
+    # only follow links when the page mentions music-related keywords
+    keywords = ["music", "audio", "sound", "song", "artist"]
+    text = context.soup.get_text(separator=" ", strip=True)
+
+    if any(
+        keyword in text.lower() for keyword in keywords
+    ) and filter_domain.check_domain(context.request.url):
+        requests = []  # crawlable links found on the page
+        for link in context.soup.select("a"):
+            if link.attrs.get("href") is not None:
+                url = urllib.parse.urljoin(
+                    context.request.url, link.attrs.get("href")
+                ).strip()
+
+                if not is_valid_url(url):
+                    continue
+
+                authorized = await robots_parser(
+                    url, context.log
+                )  # check whether robots.txt allows the crawl
+                if authorized:
+                    url_trunk = url.split("?")[0].split("#")[0]
+
+                    requests.append(url_trunk)
+
+        await context.add_requests(requests)
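
Usage note (illustration only, not part of the patch): a minimal sketch of how
the Filter class added in src/filter.py is meant to behave, using the default
cache_size but a small max_crawl so the rejection is easy to see; the URLs are
made up for the example.

    from filter import Filter

    f = Filter(max_crawl=2)

    # the first two visits to a domain with no song found are allowed
    assert f.check_domain("https://example.com/a") is True
    assert f.check_domain("https://example.com/b") is True

    # the third visit exceeds max_crawl, so the domain is rejected
    assert f.check_domain("https://example.com/c") is False

    # finding a song resets the domain counter and re-allows crawling
    f.valid_domain("https://example.com/song.mp3")
    assert f.check_domain("https://example.com/d") is True

Once more than cache_size domains are tracked, the oldest entry is evicted, so
memory stays bounded on long crawls.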