From 1eac98703ca2f40cc02a79690faf41e876f60fb4 Mon Sep 17 00:00:00 2001
From: Jourdelune
Date: Fri, 7 Jun 2024 15:43:05 +0200
Subject: [PATCH] [update] test branch to code a crawler based on playwright

---
 Crawler/__init__.py         |   0
 Crawler/items.py            |  10 ----
 Crawler/middlewares.py      | 103 ----------------------------------
 Crawler/pipelines.py        |  13 -----
 Crawler/settings.py         | 107 ------------------------------------
 Crawler/spiders/__init__.py |   4 --
 Crawler/spiders/audios.py   |  60 --------------------
 Crawler/spiders/quotes.py   |  17 ------
 README.md                   |   2 +-
 crawler/__init__.py         |   1 +
 crawler/browser.py          |  93 +++++++++++++++++++++++++++++++
 crawler/crawler.py          |  52 ++++++++++++++++++
 crawler/data/__init__.py    |   1 +
 crawler/data/page.py        |  11 ++++
 crawler/db_url.py           |  69 +++++++++++++++++++++++
 main.py                     |   5 ++
 requirements.txt            |   4 +-
 scrapy.cfg                  |  11 ----
 18 files changed, 235 insertions(+), 328 deletions(-)
 delete mode 100644 Crawler/__init__.py
 delete mode 100644 Crawler/items.py
 delete mode 100644 Crawler/middlewares.py
 delete mode 100644 Crawler/pipelines.py
 delete mode 100644 Crawler/settings.py
 delete mode 100644 Crawler/spiders/__init__.py
 delete mode 100644 Crawler/spiders/audios.py
 delete mode 100644 Crawler/spiders/quotes.py
 create mode 100644 crawler/__init__.py
 create mode 100644 crawler/browser.py
 create mode 100644 crawler/crawler.py
 create mode 100644 crawler/data/__init__.py
 create mode 100644 crawler/data/page.py
 create mode 100644 crawler/db_url.py
 create mode 100644 main.py
 delete mode 100644 scrapy.cfg

diff --git a/Crawler/__init__.py b/Crawler/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/Crawler/items.py b/Crawler/items.py
deleted file mode 100644
index 5ba1997..0000000
--- a/Crawler/items.py
+++ /dev/null
@@ -1,10 +0,0 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
-
-import scrapy
-
-
-class AudioItem(scrapy.Item):
-    link = scrapy.Field()
diff --git a/Crawler/middlewares.py b/Crawler/middlewares.py
deleted file mode 100644
index 8129bad..0000000
--- a/Crawler/middlewares.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Define here the models for your spider middleware
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-
-from scrapy import signals
-
-# useful for handling different item types with a single interface
-from itemadapter import is_item, ItemAdapter
-
-
-class CrawlerSpiderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_spider_input(self, response, spider):
-        # Called for each response that goes through the spider
-        # middleware and into the spider.
-
-        # Should return None or raise an exception.
-        return None
-
-    def process_spider_output(self, response, result, spider):
-        # Called with the results returned from the Spider, after
-        # it has processed the response.
-
-        # Must return an iterable of Request, or item objects.
-        for i in result:
-            yield i
-
-    def process_spider_exception(self, response, exception, spider):
-        # Called when a spider or process_spider_input() method
-        # (from other spider middleware) raises an exception.
-
-        # Should return either None or an iterable of Request or item objects.
-        pass
-
-    def process_start_requests(self, start_requests, spider):
-        # Called with the start requests of the spider, and works
-        # similarly to the process_spider_output() method, except
-        # that it doesn’t have a response associated.
-
-        # Must return only requests (not items).
-        for r in start_requests:
-            yield r
-
-    def spider_opened(self, spider):
-        spider.logger.info("Spider opened: %s" % spider.name)
-
-
-class CrawlerDownloaderMiddleware:
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the downloader middleware does not modify the
-    # passed objects.
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # This method is used by Scrapy to create your spiders.
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-
-    def process_request(self, request, spider):
-        # Called for each request that goes through the downloader
-        # middleware.
-
-        # Must either:
-        # - return None: continue processing this request
-        # - or return a Response object
-        # - or return a Request object
-        # - or raise IgnoreRequest: process_exception() methods of
-        #   installed downloader middleware will be called
-        return None
-
-    def process_response(self, request, response, spider):
-        # Called with the response returned from the downloader.
-
-        # Must either;
-        # - return a Response object
-        # - return a Request object
-        # - or raise IgnoreRequest
-        return response
-
-    def process_exception(self, request, exception, spider):
-        # Called when a download handler or a process_request()
-        # (from other downloader middleware) raises an exception.
-
-        # Must either:
-        # - return None: continue processing this exception
-        # - return a Response object: stops process_exception() chain
-        # - return a Request object: stops process_exception() chain
-        pass
-
-    def spider_opened(self, spider):
-        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/Crawler/pipelines.py b/Crawler/pipelines.py
deleted file mode 100644
index e3aa1dc..0000000
--- a/Crawler/pipelines.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-
-
-# useful for handling different item types with a single interface
-from itemadapter import ItemAdapter
-
-
-class CrawlerPipeline:
-    def process_item(self, item, spider):
-        return item
diff --git a/Crawler/settings.py b/Crawler/settings.py
deleted file mode 100644
index 90ce594..0000000
--- a/Crawler/settings.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Scrapy settings for Crawler project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-
-BOT_NAME = "Crawler"
-
-SPIDER_MODULES = ["Crawler.spiders"]
-NEWSPIDER_MODULE = "Crawler.spiders"
-
-
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-# USER_AGENT = "Crawler (+http://www.yourdomain.com)"
-
-# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
-
-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
-
-# Configure a delay for requests for the same website (default: 0)
-# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
-
-# Disable cookies (enabled by default)
-# COOKIES_ENABLED = False
-
-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
-
-# Override the default request headers:
-# DEFAULT_REQUEST_HEADERS = {
-#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
-# }
-
-# Enable or disable spider middlewares
-# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    "Crawler.middlewares.CrawlerSpiderMiddleware": 543,
-# }
-
-# Enable or disable downloader middlewares
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-# DOWNLOADER_MIDDLEWARES = {
-#    "Crawler.middlewares.CrawlerDownloaderMiddleware": 543,
-# }
-
-# Enable or disable extensions
-# See https://docs.scrapy.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    "scrapy.extensions.telnet.TelnetConsole": None,
-# }
-
-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-# ITEM_PIPELINES = {
-#    "Crawler.pipelines.CrawlerPipeline": 300,
-# }
-
-# Enable and configure the AutoThrottle extension (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
-
-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-# HTTPCACHE_ENABLED = True
-# HTTPCACHE_EXPIRATION_SECS = 0
-# HTTPCACHE_DIR = "httpcache"
-# HTTPCACHE_IGNORE_HTTP_CODES = []
-# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
-
-# Set settings whose default value is deprecated to a future-proof value
-REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
-
-DOWNLOAD_HANDLERS = {
-    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
-    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
-}
-
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-
-
-def should_abort_request(request):
-    return request.resource_type != "document"
-
-
-PLAYWRIGHT_ABORT_REQUEST = should_abort_request
diff --git a/Crawler/spiders/__init__.py b/Crawler/spiders/__init__.py
deleted file mode 100644
index ebd689a..0000000
--- a/Crawler/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
diff --git a/Crawler/spiders/audios.py b/Crawler/spiders/audios.py
deleted file mode 100644
index c8e80c1..0000000
--- a/Crawler/spiders/audios.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import re
-
-import scrapy
-
-
-class AudiosSpider(scrapy.Spider):
-    name = "audios"
-    start_urls = "https://musicforprogramming.net/seventy"
-    match_audio_ext = (".mp3", ".ogg", ".wav", ".flac")
-
-    def start_requests(self):
-        yield scrapy.Request(
-            self.start_urls,
-            callback=self.parse,
-            meta={
-                "playwright": True,
-                "playwright_include_page": True,
-                "playwright_page_goto_kwargs": {
-                    "wait_until": "networkidle",
-                },
-            },
-        )
-
-    async def get_links(self, response):
-        page = response.meta["playwright_page"]
-        links = await page.query_selector_all("a")
-
-        full_links = []
-        for link in links:
-            link = await link.get_attribute("href")
-
-            regex = re.compile(r".+(:\/\/)")
-            if not regex.match(link):
-                link = response.urljoin(link)
-
-            full_links.append(link)
-
-        return full_links
-
-    async def parse(self, response):
-        links = await self.get_links(response)
-
-        for link in links:
-            if link.endswith(self.match_audio_ext):
-                yield {
-                    "link": link,
-                }
-
-        for link in links:
-            yield scrapy.Request(
-                link,
-                callback=self.parse,
-                meta={
-                    "playwright": True,
-                    "playwright_include_page": True,
-                    "playwright_page_goto_kwargs": {
-                        "wait_until": "networkidle",
-                    },
-                },
-            )
diff --git a/Crawler/spiders/quotes.py b/Crawler/spiders/quotes.py
deleted file mode 100644
index 3f870c2..0000000
--- a/Crawler/spiders/quotes.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import scrapy
-
-
-class QuotesSpider(scrapy.Spider):
-    name = "quotes"
-    start_urls = [
-        "https://quotes.toscrape.com/page/1/",
-        "https://quotes.toscrape.com/page/2/",
-    ]
-
-    def parse(self, response):
-        for quote in response.css("div.quote"):
-            yield {
-                "text": quote.css("span.text::text").get(),
-                "author": quote.css("small.author::text").get(),
-                "tags": quote.css("div.tags a.tag::text").getall(),
-            }
diff --git a/README.md b/README.md
index f4a5e78..4c62cc4 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ pip install -r requirements.txt
 Run the crawler
 
 ```bash
-scrapy crawl audios
+python3 main.py
 ```
 
 ## License
diff --git a/crawler/__init__.py b/crawler/__init__.py
new file mode 100644
index 0000000..779ecc5
--- /dev/null
+++ b/crawler/__init__.py
@@ -0,0 +1 @@
+from crawler.crawler import Crawler
diff --git a/crawler/browser.py b/crawler/browser.py
new file mode 100644
index 0000000..4a3d0bc
--- /dev/null
+++ b/crawler/browser.py
@@ -0,0 +1,93 @@
+"""
+A module to manage the playwright browser in an asyncio context
+"""
+
+import asyncio
+import typing
+
+from playwright.async_api import async_playwright
+from crawler.data import Page
+
+
+class Browser:
+    def __init__(self, callback: typing.Callable):
+        """Initialize the browser
+
+        Args:
+            callback (typing.Callable): a function to call when a page is loaded
+        """
+
+        self._browser = None
+        self._callback = callback
+        self._asyncio_loop = (
+            asyncio.new_event_loop()
+        )  # set the event loop that will be used by the browser
+        asyncio.set_event_loop(self._asyncio_loop)
+
+        self._asyncio_loop.run_until_complete(self._launch())
+        self._tasks = []  # Store the tasks to close them later
+
+    async def _launch(self):
+        """Launch the browser"""
+
+        self._playwright = (
+            await async_playwright().__aenter__()
+        )  # manually enter the async context manager (like `async with`)
+
+        self._browser = await self._playwright.chromium.launch()
+
+    async def _close(self):
+        """Close the browser and playwright"""
+        if self._browser is not None:
+            await self._browser.close()
+
+    @property
+    def queue_size(self):
+        """Get the size of the queue"""
+        return len(self._tasks)
+
+    async def _get_content(self, url: str):
+        """Get the content of a page
+
+        Args:
+            url (str): the url to load
+
+        Returns:
+            None: the resulting Page is passed to the callback
+        """
+
+        page = await self._browser.new_page()
+        await page.goto(url)
+
+        # Wait for the page to load
+        await page.wait_for_load_state("domcontentloaded")
+
+        content = await page.content()
+        await page.close()
+
+        page = Page(url, content)
+        self._callback(page)
+
+    def get(self, url: str):
+        """Load a page in the browser
+
+        Args:
+            url (str): the url to load
+        """
+
+        loop = self._asyncio_loop
+        task = loop.create_task(self._get_content(url))
+        self._tasks.append(task)
+
+        self._tasks = [task for task in self._tasks if not task.done()]
+
+    def close(self):
+        """Close the browser and the asyncio loop"""
+
+        asyncio.set_event_loop(self._asyncio_loop)
+        loop = self._asyncio_loop
+
+        loop.run_until_complete(asyncio.gather(*self._tasks))
+        loop.run_until_complete(self._close())
+
+        loop.close()
diff --git a/crawler/crawler.py b/crawler/crawler.py
new file mode 100644
index 0000000..e3fbfd1
--- /dev/null
+++ b/crawler/crawler.py
@@ -0,0 +1,52 @@
+"""
+A class to manage the crawling of a website
+"""
+
+import typing
+
+from crawler.browser import Browser
+from crawler.db_url import DBUrl
+from crawler.data import Page
+
+
+class Crawler:
+    def __init__(self, reset_db: bool = False):
+        self.browser = Browser(self._on_page_loaded)
+        self.db_url = DBUrl(reset_db=reset_db)
+
+    def _on_page_loaded(self, page: Page):
+        """Callback called when a page is loaded
+
+        Args:
+            page (Page): the loaded page
+        """
+
+        print(page)
+
+    def get(self, url: str):
+        """Get the content of a page if its URL has not been visited yet
+
+        Args:
+            url (str): the url to load
+        """
+
+        if not self.db_url.is_url_visited(url):
+            self.db_url.add_url(url)
+            self.browser.get(url)
+
+    def crawl(self, start_url: typing.List[str]):
+        """Start crawling
+
+        Args:
+            start_url (typing.List[str]): the list of urls to start crawling
+        """
+
+        for url in start_url:
+            self.get(url)
+
+    def close(self):
+        """
+        Close the browser and release the resources used by the crawler
+        """
+
+        self.browser.close()
diff --git a/crawler/data/__init__.py b/crawler/data/__init__.py
new file mode 100644
index 0000000..a25c781
--- /dev/null
+++ b/crawler/data/__init__.py
@@ -0,0 +1 @@
+from crawler.data.page import Page
diff --git a/crawler/data/page.py b/crawler/data/page.py
new file mode 100644
index 0000000..43b7fb5
--- /dev/null
+++ b/crawler/data/page.py
@@ -0,0 +1,11 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Page:
+    """
+    Class that represents a web page.
+    """
+
+    url: str
+    content: str
diff --git a/crawler/db_url.py b/crawler/db_url.py
new file mode 100644
index 0000000..f646a02
--- /dev/null
+++ b/crawler/db_url.py
@@ -0,0 +1,69 @@
+"""
+A module to store the list of URLs visited by the crawler
+"""
+
+from typing import List, Union
+import redis
+
+
+class DBUrl:
+    """
+    A class to store the list of URLs visited by the crawler
+    """
+
+    def __init__(
+        self, host: str = "localhost", port: int = 6379, reset_db: bool = False
+    ):
+        """Initialize the Redis connection that stores the list of URLs visited by the crawler
+
+        Args:
+            host (str, optional): the Redis host. Defaults to "localhost".
+            port (int, optional): the port of the database. Defaults to 6379.
+            reset_db (bool, optional): if True, reset the database. Defaults to False.
+        """
+
+        self.redis = redis.Redis(host=host, port=port)
+
+        if reset_db:
+            self.redis.delete("urls")
+
+    def add_url(self, url: Union[str, List[str]]):
+        """Add a URL to the list of visited URLs
+
+        Args:
+            url (Union[str, List[str]]): the URL(s) to add
+        """
+
+        if isinstance(url, list):
+            for u in url:
+                self.redis.sadd("urls", u)
+        else:
+            self.redis.sadd("urls", url)
+
+    def is_url_visited(self, url: Union[str, List[str]]) -> bool:
+        """Check if a URL has already been visited
+
+        Args:
+            url (Union[str, List[str]]): the URL(s) to check
+
+        Returns:
+            bool: True if the URL has already been visited, False otherwise
+        """
+
+        if isinstance(url, list):
+            return all(self.redis.sismember("urls", u) for u in url)
+        return self.redis.sismember("urls", url)
+
+    def filter_url(self, url: Union[str, List[str]]) -> list:
+        """Filter the list of URLs to keep only the non-visited ones
+
+        Args:
+            url (Union[str, List[str]]): the URL(s) to filter
+
+        Returns:
+            list: the list of non-visited URLs
+        """
+
+        if isinstance(url, list):
+            return [u for u in url if not self.redis.sismember("urls", u)]
+        return url if not self.redis.sismember("urls", url) else []
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..fc82569
--- /dev/null
+++ b/main.py
@@ -0,0 +1,5 @@
+from crawler import Crawler
+
+crawler = Crawler(reset_db=True)
+crawler.crawl(["https://example.com"])
+crawler.close()
diff --git a/requirements.txt b/requirements.txt
index bd09967..a97f74b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-scrapy
-scrapy-playwright
+playwright
+redis
diff --git a/scrapy.cfg b/scrapy.cfg
deleted file mode 100644
index 32522de..0000000
--- a/scrapy.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-# Automatically created by: scrapy startproject
-#
-# For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.io/en/latest/deploy.html
-
-[settings]
-default = Crawler.settings
-
-[deploy]
-#url = http://localhost:6800/
-project = Crawler