Core file update
- recognize crawler by hostname
- update templates and examples with novel summary and tags
- madara template update
- require lxml >= 5
dipu-bd committed Jan 4, 2025
1 parent 6f1db35 commit aaaf877
Showing 29 changed files with 458 additions and 133 deletions.
16 changes: 9 additions & 7 deletions lncrawl/bots/console/integration.py
@@ -43,21 +43,23 @@ def start(self):
             search_links = [
                 str(link)
                 for link, crawler in crawler_list.items()
-                if crawler.search_novel != Crawler.search_novel
+                if crawler.search_novel != Crawler.search_novel and link.startswith("http")
             ]
             self.search_mode = True
         else:
-            url = urlparse(self.app.user_input)
-            url = "%s://%s/" % (url.scheme, url.hostname)
-            if url in rejected_sources:
-                display.url_rejected(rejected_sources[url])
-                raise LNException("Fail to init crawler: %s is rejected", url)
+            hostname = urlparse(self.app.user_input).hostname
+            if hostname in rejected_sources:
+                display.url_rejected(rejected_sources[hostname])
+                raise LNException("Fail to init crawler: %s is rejected", hostname)
             try:
                 logger.info("Detected URL input")
                 self.app.crawler = prepare_crawler(self.app.user_input)
                 self.search_mode = False
             except Exception as e:
-                display.url_not_recognized()
+                if "No crawler found for" in str(e):
+                    display.url_not_recognized()
+                else:
+                    logger.error("Failed to prepare crawler", e)
                 logger.debug("Trying to find it in novelupdates", e)
                 guess = self.app.guess_novel_title(self.app.user_input)
                 display.guessed_url_for_novelupdates()
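The console bot now rejects a source by its hostname instead of a rebuilt "scheme://host/" string. A minimal sketch of that check, using a made-up hostname and rejection reason:

    from urllib.parse import urlparse

    # Hypothetical entry; real reasons come from the sources index file.
    rejected_sources = {"closed-novel-site.com": "Site has shut down"}

    user_input = "https://closed-novel-site.com/novel/some-title/"
    hostname = urlparse(user_input).hostname  # "closed-novel-site.com"

    if hostname in rejected_sources:
        print("Rejected:", rejected_sources[hostname])

Scheme and path no longer affect the match, and the sources loader (see sources.py below) also registers the www-less variant of each rejected hostname, so both forms are caught.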
4 changes: 2 additions & 2 deletions lncrawl/core/__init__.py
@@ -1,6 +1,7 @@
 """
 Interactive application to take user inputs
 """
+
 import logging
 import os
 import signal
@@ -11,8 +12,7 @@
 from ..assets.version import get_version
 from ..bots import run_bot
 from .arguments import get_args
-from .display import (cancel_method, description, error_message,
-                      input_suppression)
+from .display import cancel_method, description, error_message, input_suppression
 from .logconfig import configure_logging
 from .proxy import load_proxies, start_proxy_fetcher, stop_proxy_fetcher
 from .sources import load_sources
5 changes: 4 additions & 1 deletion lncrawl/core/app.py
@@ -79,6 +79,7 @@ def prepare_search(self):
             str(link)
             for link, crawler in crawler_list.items()
             if crawler.search_novel != Crawler.search_novel
+            and link.startswith("http")
         ]
 
     def guess_novel_title(self, url: str) -> str:
@@ -148,7 +149,9 @@ def get_novel_info(self):
         )
 
         source_name = slugify(urlparse(self.crawler.home_url).netloc)
-        self.output_path = Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name
+        self.output_path = (
+            Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name
+        )
 
     # ----------------------------------------------------------------------- #
 
12 changes: 9 additions & 3 deletions lncrawl/core/display.py
@@ -139,9 +139,15 @@ def new_version_news(latest):
 def url_supported_list():
     from .sources import crawler_list
 
-    print("List of %d supported sources:" % len(crawler_list))
-    for url in sorted(crawler_list.keys()):
-        print(Fore.LIGHTGREEN_EX, Chars.RIGHT_ARROW, url, Fore.RESET)
+    crawlers = list(set(crawler_list.values()))
+    print(f"List of supported sources in {len(crawlers)} crawlers:")
+    for crawler in sorted(crawlers, key=lambda x: x.__name__):
+        crawler_name = crawler.__name__.split(".")[-1]
+        crawler_path = getattr(crawler, "file_path", crawler.__module__)
+        print(Fore.LIGHTGREEN_EX + Chars.RIGHT_ARROW, crawler_name + Fore.RESET, end="")
+        print(Style.DIM, "(" + crawler_path + ")", Style.RESET_ALL)
+        for url in crawler.base_url:
+            print(" " + Fore.CYAN + Chars.LINK, url + Fore.RESET)
 
 
 def url_not_recognized():
2 changes: 1 addition & 1 deletion lncrawl/core/scraper.py
@@ -206,7 +206,7 @@ def get_response(self, url, retry=1, timeout=(7, 301), **kwargs) -> Response:
             **kwargs,
         )
 
-    def post_response(self, url, data={}, retry=1, **kwargs) -> Response:
+    def post_response(self, url, data={}, retry=0, **kwargs) -> Response:
         """Make a POST request and return the response"""
         return self.__process_request(
             "post",
41 changes: 27 additions & 14 deletions lncrawl/core/sources.py
@@ -32,9 +32,9 @@
     "rejected_sources",
 ]
 
-rejected_sources = {}
 template_list: Set[Type[Crawler]] = set()
 crawler_list: Dict[str, Type[Crawler]] = {}
+rejected_sources: Dict[str, str] = {}
 
 # --------------------------------------------------------------------------- #
 # Utilities
@@ -98,7 +98,8 @@ def __load_current_index():
     if not index_file.is_file():
         index_file = __local_data_path / "sources" / "_index.json"
 
-    assert index_file.is_file(), "Invalid index file"
+    if not index_file.is_file():
+        raise LNException("Invalid index file")
 
     logger.debug("Loading current index data from %s", index_file)
     with open(index_file, "r", encoding="utf8") as fp:
@@ -157,6 +158,11 @@ def __check_updates():
 
     global rejected_sources
     rejected_sources = __current_index["rejected"]
+    for url, reason in rejected_sources.items():
+        no_www = url.replace("://www.", "://")
+        rejected_sources[no_www] = reason
+        rejected_sources[urlparse(url).hostname] = reason
+        rejected_sources[urlparse(no_www).hostname] = reason
 
 
 # --------------------------------------------------------------------------- #
@@ -239,8 +245,8 @@ def __import_crawlers(file_path: Path) -> List[Type[Crawler]]:
     if file_path in __cache_crawlers:
         return __cache_crawlers[file_path]
 
-    # logger.debug('+ %s', file_path)
-    assert file_path.is_file(), "Invalid crawler file path"
+    if not file_path.is_file():
+        raise LNException("Invalid crawler file path")
 
     try:
         module_name = hashlib.md5(file_path.name.encode()).hexdigest()
@@ -273,7 +279,8 @@ def __import_crawlers(file_path: Path) -> List[Type[Crawler]]:
         if not urls:
             continue
         for url in urls:
-            assert __url_regex.match(url), f"Invalid base url: {url} @{file_path}"
+            if not __url_regex.match(url):
+                raise LNException(f"Invalid base url: {url} @{file_path}")
 
         for method in ["read_novel_info", "download_chapter_body"]:
             if not hasattr(crawler, method):
@@ -309,8 +316,13 @@ def __add_crawlers_from_path(path: Path):
         crawlers = __import_crawlers(path)
         for crawler in crawlers:
             setattr(crawler, "file_path", str(path.absolute()))
-            for url in getattr(crawler, "base_url"):
+            base_urls: list[str] = getattr(crawler, "base_url")
+            for url in base_urls:
+                no_www = url.replace("://www.", "://")
                 crawler_list[url] = crawler
+                crawler_list[no_www] = crawler
+                crawler_list[urlparse(url).hostname] = crawler
+                crawler_list[urlparse(no_www).hostname] = crawler
     except Exception as e:
         logger.warning("Could not load crawlers from %s. Error: %s", path, e)
 
@@ -351,21 +363,22 @@ def prepare_crawler(url: str) -> Optional[Crawler]:
         return None
 
     parsed_url = urlparse(url)
-    base_url = "%s://%s/" % (parsed_url.scheme, parsed_url.hostname)
-    if base_url in rejected_sources:
-        raise LNException("Source is rejected. Reason: " + rejected_sources[base_url])
+    hostname = parsed_url.hostname
+    home_url = f"{parsed_url.scheme}://{hostname}/"
+
+    if hostname in rejected_sources:
+        raise LNException("Source is rejected. Reason: " + rejected_sources[hostname])
 
-    CrawlerType = crawler_list.get(base_url)
+    CrawlerType = crawler_list.get(hostname)
     if not CrawlerType:
-        raise LNException("No crawler found for " + base_url)
+        raise LNException("No crawler found for " + hostname)
 
     logger.info(
-        "Initializing crawler for: %s [%s]",
-        base_url,
+        f"Initializing crawler for: {home_url} [%s]",
        getattr(CrawlerType, "file_path", "."),
     )
     crawler = CrawlerType()
-    crawler.home_url = base_url
     crawler.novel_url = url
+    crawler.home_url = home_url
     crawler.initialize()
    return crawler
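Each crawler class is now registered under several keys — its full base URL, the www-less URL, and both hostname forms — so prepare_crawler can resolve a crawler from the hostname alone. A rough sketch of the idea, using a hypothetical crawler class and domain:

    from urllib.parse import urlparse

    crawler_list = {}

    class DummyCrawler:  # stand-in for a real Crawler subclass
        base_url = ["https://www.example-novels.com/"]

    for url in DummyCrawler.base_url:
        no_www = url.replace("://www.", "://")
        for key in (url, no_www, urlparse(url).hostname, urlparse(no_www).hostname):
            crawler_list[key] = DummyCrawler

    # prepare_crawler-style lookup: only the hostname of the input URL matters.
    hostname = urlparse("https://example-novels.com/novel/123/").hostname
    assert crawler_list.get(hostname) is DummyCrawler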
32 changes: 31 additions & 1 deletion lncrawl/templates/browser/general.py
@@ -33,6 +33,17 @@ def read_novel_info_in_scraper(self) -> None:
         except Exception as e:
             logger.warning("Failed to parse novel authors | %s", e)
 
+        try:
+            tags = set(list(self.parse_categories(soup)))
+            self.novel_tags = ", ".join(tags)
+        except Exception as e:
+            logger.warning("Failed to parse novel tags | %s", e)
+
+        try:
+            self.novel_synopsis = self.parse_summary(soup)
+        except Exception as e:
+            logger.warning("Failed to parse novel synopsis | %s", e)
+
         for item in self.parse_chapter_list(soup):
             if isinstance(item, Chapter):
                 self.chapters.append(item)
@@ -59,6 +70,17 @@ def read_novel_info_in_browser(self) -> None:
         except Exception as e:
             logger.warning("Failed to parse novel authors | %s", e)
 
+        try:
+            tags = set(list(self.parse_categories_in_browser()))
+            self.novel_tags = ", ".join(tags)
+        except Exception as e:
+            logger.warning("Failed to parse novel tags | %s", e)
+
+        try:
+            self.novel_synopsis = self.parse_summary_in_browser()
+        except Exception as e:
+            logger.warning("Failed to parse novel synopsis | %s", e)
+
         for item in self.parse_chapter_list_in_browser():
             if isinstance(item, Chapter):
                 self.chapters.append(item)
@@ -73,10 +95,18 @@ def parse_cover_in_browser(self) -> str:
         """Parse and return the novel cover image in the browser"""
         return self.parse_cover(self.browser.soup)
 
-    def parse_authors_in_browser(self) -> Generator[Tag, None, None]:
+    def parse_authors_in_browser(self) -> Generator[str, None, None]:
         """Parse and return the novel author in the browser"""
         yield from self.parse_authors(self.browser.soup)
 
+    def parse_categories_in_browser(self) -> Generator[str, None, None]:
+        """Parse and return the novel categories in the browser"""
+        yield from self.parse_categories(self.browser.soup)
+
+    def parse_summary_in_browser(self) -> str:
+        """Parse and return the novel summary or synopsis in the browser"""
+        return self.parse_summary(self.browser.soup)
+
     def parse_chapter_list_in_browser(
         self,
     ) -> Generator[Union[Chapter, Volume], None, None]:
53 changes: 36 additions & 17 deletions lncrawl/templates/madara.py
@@ -1,3 +1,4 @@
+import logging
 from urllib.parse import urlencode
 
 from bs4 import BeautifulSoup, Tag
@@ -6,6 +7,8 @@
 from lncrawl.templates.soup.chapter_only import ChapterOnlySoupTemplate
 from lncrawl.templates.soup.searchable import SearchableSoupTemplate
 
+logger = logging.getLogger(__name__)
+
 
 class MadaraTemplate(SearchableSoupTemplate, ChapterOnlySoupTemplate):
     is_template = True
@@ -51,35 +54,51 @@ def parse_cover(self, soup: BeautifulSoup) -> str:
             return self.absolute_url(tag["data-src"])
         if tag.has_attr("src"):
             return self.absolute_url(tag["src"])
-        return ''
+        return ""
 
     def parse_authors(self, soup: BeautifulSoup):
         for a in soup.select('.author-content a[href*="manga-author"]'):
             yield a.text.strip()
 
+    def parse_categories(self, soup):
+        for a in soup.select('.genres-content a[rel="tag"]'):
+            yield a.text.strip()
+
+    def parse_summary(self, soup):
+        possible_summary = soup.select_one(".description-summary a")
+        if possible_summary:
+            return self.cleaner.extract_contents(possible_summary)
+
     def select_chapter_tags(self, soup: BeautifulSoup):
         try:
             clean_novel_url = self.novel_url.split("?")[0].strip("/")
-            response = self.submit_form(f"{clean_novel_url}/ajax/chapters/", retry=0)
+            response = self.submit_form(f"{clean_novel_url}/ajax/chapters/")
             soup = self.make_soup(response)
             chapters = soup.select("ul.main .wp-manga-chapter a")
             if not chapters:
                 raise Exception("No chapters on first URL")
-        except Exception:
+            yield from reversed(chapters)
+            use_alternate = False
+        except Exception as e:
+            use_alternate = True
+            logger.debug("Failed to fetch chapters using ajax", e)
+
+        if use_alternate:
             nl_id = soup.select_one("#manga-chapters-holder[data-id]")
             if not isinstance(nl_id, Tag):
-                raise Exception('No chapter chapter id tag found')
-            response = self.submit_form(
-                f"{self.home_url}wp-admin/admin-ajax.php",
-                data={
-                    "action": "manga_get_chapters",
-                    "manga": nl_id["data-id"],
-                },
-            )
-            soup = self.make_soup(response)
-            chapters = soup.select("ul.main .wp-manga-chapter a")
-
-        yield from reversed(chapters)
+                logger.debug("No chapter id tag found for alternate method")
+                return
+            try:
+                response = self.submit_form(
+                    f"{self.home_url}wp-admin/admin-ajax.php",
+                    data={
+                        "action": "manga_get_chapters",
+                        "manga": nl_id["data-id"],
+                    },
+                )
+                soup = self.make_soup(response)
+                chapters = soup.select("ul.main .wp-manga-chapter a")
+                yield from reversed(chapters)
+            except Exception as e:
+                logger.debug("Failed to fetch chapters using alternate method", e)
 
     def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
         return Chapter(
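Because MadaraTemplate now carries the logger, tag/summary parsing, and the two-step chapter fetch itself, a site-specific crawler built on it typically only needs to declare its base URLs. A hypothetical example (the domain and file path are made up):

    # e.g. sources/en/m/examplemadara.py
    from lncrawl.templates.madara import MadaraTemplate


    class ExampleMadaraCrawler(MadaraTemplate):
        base_url = ["https://example-madara-site.com/"]
        # Title, cover, authors, categories, summary, and the ajax chapter
        # fetching (with the admin-ajax.php fallback) are all inherited from
        # the template; override only the selectors that differ on this site.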
22 changes: 20 additions & 2 deletions lncrawl/templates/soup/general.py
@@ -31,6 +31,17 @@ def read_novel_info(self) -> None:
         except Exception as e:
             logger.warning("Failed to parse novel authors | %s", e)
 
+        try:
+            tags = set(list(self.parse_categories(soup)))
+            self.novel_tags = ", ".join(tags)
+        except Exception as e:
+            logger.warning("Failed to parse novel tags | %s", e)
+
+        try:
+            self.novel_synopsis = self.parse_summary(soup)
+        except Exception as e:
+            logger.warning("Failed to parse novel synopsis | %s", e)
+
         for item in self.parse_chapter_list(soup):
             if isinstance(item, Chapter):
                 self.chapters.append(item)
@@ -50,10 +61,17 @@ def parse_cover(self, soup: BeautifulSoup) -> str:
         """Parse and return the novel cover image"""
         raise NotImplementedError()
 
-    @abstractmethod
     def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
         """Parse and return the novel authors"""
-        raise NotImplementedError()
+        return []
+
+    def parse_categories(self, soup: BeautifulSoup) -> Generator[str, None, None]:
+        """Parse and return the novel categories"""
+        return []
+
+    def parse_summary(self, soup: BeautifulSoup) -> str:
+        """Parse and return the novel summary or synopsis"""
+        return ""
 
     @abstractmethod
     def parse_chapter_list(
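Since parse_categories and parse_summary now default to empty values instead of being abstract, a concrete crawler opts in by overriding them. A minimal sketch under the assumption that the template class in soup/general.py is importable as shown; the site, class name, and CSS selectors are made up:

    from bs4 import BeautifulSoup

    from lncrawl.templates.soup.general import GeneralSoupTemplate


    class ExampleCrawler(GeneralSoupTemplate):
        base_url = ["https://example-novels.com/"]

        def parse_categories(self, soup: BeautifulSoup):
            # Hypothetical selector; use whatever the target site provides.
            for a in soup.select(".novel-genres a"):
                yield a.text.strip()

        def parse_summary(self, soup: BeautifulSoup) -> str:
            tag = soup.select_one(".novel-summary")
            return self.cleaner.extract_contents(tag) if tag else ""

        # parse_title, parse_cover, parse_chapter_list, etc. omitted here;
        # they remain required, as in the template above.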
3 changes: 2 additions & 1 deletion requirements-app.txt
@@ -13,7 +13,8 @@ PyExecJS>=1.5.1,<2.0.0
 ebooklib>=0.17.0,<1.0.0
 pillow>=6.0.0
 cloudscraper>=1.2.71
-lxml>=4.0.0,<5.0.0
+lxml>=5.0.0
+lxml-html-clean>=0.1.0
 questionary>=1.6.0
 prompt-toolkit~=3.0
 html5lib~=1.1
3 changes: 2 additions & 1 deletion requirements.txt
@@ -13,7 +13,8 @@ PyExecJS>=1.5.1,<2.0.0
 ebooklib>=0.17.0,<1.0.0
 pillow>=6.0.0
 cloudscraper>=1.2.71
-lxml>=4.0.0,<5.0.0
+lxml>=5.0.0
+lxml-html-clean>=0.1.0
 questionary>=1.6.0
 prompt-toolkit~=3.0
 html5lib~=1.1
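Recent lxml releases (5.2 onwards) ship the HTML cleaner as the separate lxml-html-clean project, which is presumably why it now accompanies the lxml>=5 pin. A quick sanity check that the cleaner is importable under the new split:

    # Requires lxml>=5.0.0 and lxml-html-clean>=0.1.0
    from lxml_html_clean import Cleaner

    cleaner = Cleaner(scripts=True, javascript=True, style=True)
    print(cleaner.clean_html("<p onclick='x()'>hello<script>x()</script></p>"))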