From 1eac98703ca2f40cc02a79690faf41e876f60fb4 Mon Sep 17 00:00:00 2001
From: Jourdelune
Date: Fri, 7 Jun 2024 15:43:05 +0200
Subject: [PATCH] [update] test branch to code a crawler based on playwright

---
 Crawler/__init__.py         |   0
 Crawler/items.py            |  10 ----
 Crawler/middlewares.py      | 103 ----------------------------------
 Crawler/pipelines.py        |  13 -----
 Crawler/settings.py         | 107 ------------------------------------
 Crawler/spiders/__init__.py |   4 --
 Crawler/spiders/audios.py   |  60 --------------------
 Crawler/spiders/quotes.py   |  17 ------
 README.md                   |   2 +-
 crawler/__init__.py         |   1 +
 crawler/browser.py          |  93 +++++++++++++++++++++++++++++++
 crawler/crawler.py          |  52 ++++++++++++++++++
 crawler/data/__init__.py    |   1 +
 crawler/data/page.py        |  11 ++++
 crawler/db_url.py           |  69 +++++++++++++++++++++++
 main.py                     |   5 ++
 requirements.txt            |   4 +-
 scrapy.cfg                  |  11 ----
 18 files changed, 235 insertions(+), 328 deletions(-)
 delete mode 100644 Crawler/__init__.py
 delete mode 100644 Crawler/items.py
 delete mode 100644 Crawler/middlewares.py
 delete mode 100644 Crawler/pipelines.py
 delete mode 100644 Crawler/settings.py
 delete mode 100644 Crawler/spiders/__init__.py
 delete mode 100644 Crawler/spiders/audios.py
 delete mode 100644 Crawler/spiders/quotes.py
 create mode 100644 crawler/__init__.py
 create mode 100644 crawler/browser.py
 create mode 100644 crawler/crawler.py
 create mode 100644 crawler/data/__init__.py
 create mode 100644 crawler/data/page.py
 create mode 100644 crawler/db_url.py
 create mode 100644 main.py
 delete mode 100644 scrapy.cfg

diff --git a/Crawler/__init__.py b/Crawler/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/Crawler/items.py b/Crawler/items.py
deleted file mode 100644
index 5ba1997..0000000
--- a/Crawler/items.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-
-
-class AudioItem(scrapy.Item):
-    link = scrapy.Field()
diff --git a/Crawler/middlewares.py b/Crawler/middlewares.py
deleted file mode 100644
index 8129bad..0000000
--- a/Crawler/middlewares.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-
-from scrapy import signals
-
-# useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter
-
-
-class CrawlerSpiderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, or item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Request or item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info("Spider opened: %s" % spider.name)
-
-
-class CrawlerDownloaderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the downloader middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_request(self, request, spider):
-        # Called for each request that goes through the downloader
-        # middleware.
-
-        # Must either:
-        # - return None: continue processing this request
-        # - or return a Response object
-        # - or return a Request object
-        # - or raise IgnoreRequest: process_exception() methods of
-        #   installed downloader middleware will be called
-        return None
-
-    def process_response(self, request, response, spider):
-        # Called with the response returned from the downloader.
-
-        # Must either;
-        # - return a Response object
-        # - return a Request object
-        # - or raise IgnoreRequest
-        return response
-
-    def process_exception(self, request, exception, spider):
-        # Called when a download handler or a process_request()
-        # (from other downloader middleware) raises an exception.
-
-        # Must either:
-        # - return None: continue processing this exception
-        # - return a Response object: stops process_exception() chain
-        # - return a Request object: stops process_exception() chain
-        pass
-
-    def spider_opened(self, spider):
-        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/Crawler/pipelines.py b/Crawler/pipelines.py
deleted file mode 100644
index e3aa1dc..0000000
--- a/Crawler/pipelines.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
-
-# useful for handling different item types with a single interface
-from itemadapter import ItemAdapter
-
-
-class CrawlerPipeline:
-    def process_item(self, item, spider):
-        return item
diff --git a/Crawler/settings.py b/Crawler/settings.py
deleted file mode 100644
index 90ce594..0000000
--- a/Crawler/settings.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Scrapy settings for Crawler project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-
-BOT_NAME = "Crawler"
-
-SPIDER_MODULES = ["Crawler.spiders"]
-NEWSPIDER_MODULE = "Crawler.spiders"
-
-
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-# USER_AGENT = "Crawler (+http://www.yourdomain.com)"
-
-# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
-
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
-
-# Disable cookies (enabled by default)
-# COOKIES_ENABLED = False
-
-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
-
-# Override the default request headers:
-# DEFAULT_REQUEST_HEADERS = {
-#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
-# }
-
-# Enable or disable spider middlewares
-# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    "Crawler.middlewares.CrawlerSpiderMiddleware": 543,
-# }
-
-# Enable or disable downloader middlewares
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-# DOWNLOADER_MIDDLEWARES = {
-#    "Crawler.middlewares.CrawlerDownloaderMiddleware": 543,
-# }
-
-# Enable or disable extensions
-# See https://docs.scrapy.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    "scrapy.extensions.telnet.TelnetConsole": None,
-# }
-
-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-# ITEM_PIPELINES = {
-#    "Crawler.pipelines.CrawlerPipeline": 300,
-# }
-
-# Enable and configure the AutoThrottle extension (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
-
-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-# HTTPCACHE_ENABLED = True
-# HTTPCACHE_EXPIRATION_SECS = 0
-# HTTPCACHE_DIR = "httpcache"
-# HTTPCACHE_IGNORE_HTTP_CODES = []
-# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
-
-# Set settings whose default value is deprecated to a future-proof value
-REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
-
-DOWNLOAD_HANDLERS = {
-    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
-    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
-}
-
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-
-
-def should_abort_request(request):
-    return request.resource_type != "document"
-
-
-PLAYWRIGHT_ABORT_REQUEST = should_abort_request
diff --git a/Crawler/spiders/__init__.py b/Crawler/spiders/__init__.py
deleted file mode 100644
index ebd689a..0000000
--- a/Crawler/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
diff --git a/Crawler/spiders/audios.py b/Crawler/spiders/audios.py
deleted file mode 100644
index c8e80c1..0000000
--- a/Crawler/spiders/audios.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import re
-
-import scrapy
-
-
-class AudiosSpider(scrapy.Spider):
-    name = "audios"
-    start_urls = "https://musicforprogramming.net/seventy"
-    match_audio_ext = (".mp3", ".ogg", ".wav", ".flac")
-
-    def start_requests(self):
-        yield scrapy.Request(
-            self.start_urls,
-            callback=self.parse,
-            meta={
-                "playwright": True,
-                "playwright_include_page": True,
-                "playwright_page_goto_kwargs": {
-                    "wait_until": "networkidle",
-                },
-            },
-        )
-
-    async def get_links(self, response):
-        page = response.meta["playwright_page"]
-        links = await page.query_selector_all("a")
-
-        full_links = []
-        for link in links:
-            link = await link.get_attribute("href")
-
-            regex = re.compile(r".+(:\/\/)")
-            if not regex.match(link):
-                link = response.urljoin(link)
-
-            full_links.append(link)
-
-        return full_links
-
-    async def parse(self, response):
-        links = await self.get_links(response)
-
-        for link in links:
-            if link.endswith(self.match_audio_ext):
-                yield {
-                    "link": link,
-                }
-
-        for link in links:
-            yield scrapy.Request(
-                link,
-                callback=self.parse,
-                meta={
-                    "playwright": True,
-                    "playwright_include_page": True,
-                    "playwright_page_goto_kwargs": {
-                        "wait_until": "networkidle",
-                    },
-                },
-            )
diff --git a/Crawler/spiders/quotes.py b/Crawler/spiders/quotes.py
deleted file mode 100644
index 3f870c2..0000000
--- a/Crawler/spiders/quotes.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import scrapy
-
-
-class QuotesSpider(scrapy.Spider):
-    name = "quotes"
-    start_urls = [
-        "https://quotes.toscrape.com/page/1/",
-        "https://quotes.toscrape.com/page/2/",
-    ]
-
-    def parse(self, response):
-        for quote in response.css("div.quote"):
-            yield {
-                "text": quote.css("span.text::text").get(),
-                "author": quote.css("small.author::text").get(),
-                "tags": quote.css("div.tags a.tag::text").getall(),
-            }
diff --git a/README.md b/README.md
index f4a5e78..4c62cc4 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ pip install -r requirements.txt
 Run the crawler
 
 ```bash
-scrapy crawl audios
+python3 main.py
 ```
 
 ## License
diff --git a/crawler/__init__.py b/crawler/__init__.py
new file mode 100644
index 0000000..779ecc5
--- /dev/null
+++ b/crawler/__init__.py
@@ -0,0 +1 @@
+from crawler.crawler import Crawler
diff --git a/crawler/browser.py b/crawler/browser.py
new file mode 100644
index 0000000..4a3d0bc
--- /dev/null
+++ b/crawler/browser.py
@@ -0,0 +1,93 @@
+"""
+A module to manage the playwright browser in an asyncio context
+"""
+
+import asyncio
+import typing
+
+from playwright.async_api import async_playwright
+from crawler.data import Page
+
+
+class Browser:
+    def __init__(self, callback: typing.Callable):
+        """Initialize the browser
+
+        Args:
+            callback (typing.Callable): a function to call when a page is loaded
+        """
+
+        self._browser = None
+        self._callback = callback
+        self._asyncio_loop = (
+            asyncio.new_event_loop()
+        )  # set the event loop that will be used by the browser
+        asyncio.set_event_loop(self._asyncio_loop)
+
+        self._asyncio_loop.run_until_complete(self._launch())
+        self._tasks = []  # Store the tasks to close them later
+
+    async def _launch(self):
+        """Launch the browser"""
+
+        self._playwright = (
+            await async_playwright().__aenter__()
+        )  # manually enter the async context manager (like `async with`)
+
+        self._browser = await self._playwright.chromium.launch()
+
+    async def _close(self):
+        """Close the browser and playwright"""
+        if self._browser is not None:
+            await self._browser.close()
+
+    @property
+    def queue_size(self):
+        """Get the size of the queue"""
+        return len(self._tasks)
+
+    async def _get_content(self, url: str):
+        """Get the content of a page
+
+        Args:
+            url (str): the url to load
+
+        Returns:
+            None: the resulting Page is passed to the callback
+        """
+
+        page = await self._browser.new_page()
+        await page.goto(url)
+
+        # Wait for the page to load
+        await page.wait_for_load_state("domcontentloaded")
+
+        content = await page.content()
+        await page.close()
+
+        page = Page(url, content)
+        self._callback(page)
+
+    def get(self, url: str):
+        """Load a page in the browser
+
+        Args:
+            url (str): the url to load
+        """
+
+        loop = self._asyncio_loop
+        task = loop.create_task(self._get_content(url))
+        self._tasks.append(task)
+
+        self._tasks = [task for task in self._tasks if not task.done()]
+
+    def close(self):
+        """Close the browser and the asyncio loop"""
+
+        asyncio.set_event_loop(self._asyncio_loop)
+        loop = self._asyncio_loop
+
+        loop.run_until_complete(asyncio.gather(*self._tasks))
+        loop.run_until_complete(self._close())
+
+        loop.close()
diff --git a/crawler/crawler.py b/crawler/crawler.py
new file mode 100644
index 0000000..e3fbfd1
--- /dev/null
+++ b/crawler/crawler.py
@@ -0,0 +1,52 @@
+"""
+A class to manage the crawling of a website
+"""
+
+import typing
+
+from crawler.browser import Browser
+from crawler.db_url import DBUrl
+from crawler.data import Page
+
+
+class Crawler:
+    def __init__(self, reset_db: bool = False):
+        self.browser = Browser(self._on_page_loaded)
+        self.db_url = DBUrl(reset_db=reset_db)
+
+    def _on_page_loaded(self, page: Page):
+        """Callback called when a page is loaded
+
+        Args:
+            page (Page): the loaded page
+        """
+
+        print(page)
+
+    def get(self, url: str):
+        """Get the content of a page if its URL has not been visited yet
+
+        Args:
+            url (str): the url to load
+        """
+
+        if not self.db_url.is_url_visited(url):
+            self.db_url.add_url(url)
+            self.browser.get(url)
+
+    def crawl(self, start_url: typing.List[str]):
+        """Start crawling
+
+        Args:
+            start_url (typing.List[str]): the list of urls to start crawling
+        """
+
+        for url in start_url:
+            self.get(url)
+
+    def close(self):
+        """
+        Close the browser and release the resources used by the crawler
+        """
+
+        self.browser.close()
diff --git a/crawler/data/__init__.py b/crawler/data/__init__.py
new file mode 100644
index 0000000..a25c781
--- /dev/null
+++ b/crawler/data/__init__.py
@@ -0,0 +1 @@
+from crawler.data.page import Page
diff --git a/crawler/data/page.py b/crawler/data/page.py
new file mode 100644
index 0000000..43b7fb5
--- /dev/null
+++ b/crawler/data/page.py
@@ -0,0 +1,11 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Page:
+    """
+    Class that represents a web page.
+    """
+
+    url: str
+    content: str
diff --git a/crawler/db_url.py b/crawler/db_url.py
new file mode 100644
index 0000000..f646a02
--- /dev/null
+++ b/crawler/db_url.py
@@ -0,0 +1,69 @@
+"""
+A module to store the list of URLs visited by the crawler
+"""
+
+from typing import List, Union
+import redis
+
+
+class DBUrl:
+    """
+    A class to store the list of URLs visited by the crawler
+    """
+
+    def __init__(
+        self, host: str = "localhost", port: int = 6379, reset_db: bool = False
+    ):
+        """Initialize the Redis connection that stores the list of URLs visited by the crawler
+
+        Args:
+            host (str, optional): the Redis host. Defaults to "localhost".
+            port (int, optional): the port of the database. Defaults to 6379.
+            reset_db (bool, optional): if True, reset the database. Defaults to False.
+        """
+
+        self.redis = redis.Redis(host=host, port=port)
+
+        if reset_db:
+            self.redis.delete("urls")
+
+    def add_url(self, url: Union[str, List[str]]):
+        """Add a URL to the list of visited URLs
+
+        Args:
+            url (Union[str, List[str]]): the URL(s) to add
+        """
+
+        if isinstance(url, list):
+            for u in url:
+                self.redis.sadd("urls", u)
+        else:
+            self.redis.sadd("urls", url)
+
+    def is_url_visited(self, url: Union[str, List[str]]) -> bool:
+        """Check if a URL has already been visited
+
+        Args:
+            url (Union[str, List[str]]): the URL(s) to check
+
+        Returns:
+            bool: True if the URL has already been visited, False otherwise
+        """
+
+        if isinstance(url, list):
+            return all(self.redis.sismember("urls", u) for u in url)
+        return self.redis.sismember("urls", url)
+
+    def filter_url(self, url: Union[str, List[str]]) -> list:
+        """Filter the list of URLs to keep only the non-visited ones
+
+        Args:
+            url (Union[str, List[str]]): the URL(s) to filter
+
+        Returns:
+            list: the list of non-visited URLs
+        """
+
+        if isinstance(url, list):
+            return [u for u in url if not self.redis.sismember("urls", u)]
+        return url if not self.redis.sismember("urls", url) else []
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..fc82569
--- /dev/null
+++ b/main.py
@@ -0,0 +1,5 @@
+from crawler import Crawler
+
+crawler = Crawler(reset_db=True)
+crawler.crawl(["https://example.com"])
+crawler.close()
diff --git a/requirements.txt b/requirements.txt
index bd09967..a97f74b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-scrapy
-scrapy-playwright
+playwright
+redis
diff --git a/scrapy.cfg b/scrapy.cfg
deleted file mode 100644
index 32522de..0000000
--- a/scrapy.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-# Automatically created by: scrapy startproject
-#
-# For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.io/en/latest/deploy.html
-
-[settings]
-default = Crawler.settings
-
-[deploy]
-#url = http://localhost:6800/
-project = Crawler