Core file update
- recognize crawler by hostname
- update templates and examples with novel summary and tags
- madara template update
- require lxml >= 5
dipu-bd committed Jan 4, 2025
1 parent 6f1db35 commit aaaf877
Showing 29 changed files with 458 additions and 133 deletions.
16 changes: 9 additions & 7 deletions lncrawl/bots/console/integration.py
@@ -43,21 +43,23 @@ def start(self):
             search_links = [
                 str(link)
                 for link, crawler in crawler_list.items()
-                if crawler.search_novel != Crawler.search_novel
+                if crawler.search_novel != Crawler.search_novel and link.startswith("http")
             ]
             self.search_mode = True
         else:
-            url = urlparse(self.app.user_input)
-            url = "%s://%s/" % (url.scheme, url.hostname)
-            if url in rejected_sources:
-                display.url_rejected(rejected_sources[url])
-                raise LNException("Fail to init crawler: %s is rejected", url)
+            hostname = urlparse(self.app.user_input).hostname
+            if hostname in rejected_sources:
+                display.url_rejected(rejected_sources[hostname])
+                raise LNException("Fail to init crawler: %s is rejected", hostname)
             try:
                 logger.info("Detected URL input")
                 self.app.crawler = prepare_crawler(self.app.user_input)
                 self.search_mode = False
             except Exception as e:
-                display.url_not_recognized()
+                if "No crawler found for" in str(e):
+                    display.url_not_recognized()
+                else:
+                    logger.error("Failed to prepare crawler", e)
                 logger.debug("Trying to find it in novelupdates", e)
                 guess = self.app.guess_novel_title(self.app.user_input)
                 display.guessed_url_for_novelupdates()
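The console bot now rejects a source by its hostname instead of a rebuilt "scheme://host/" string. A minimal sketch of that check, using a made-up hostname and rejection reason:

    from urllib.parse import urlparse

    # Hypothetical entry; real reasons come from the sources index file.
    rejected_sources = {"closed-novel-site.com": "Site has shut down"}

    user_input = "https://closed-novel-site.com/novel/some-title/"
    hostname = urlparse(user_input).hostname  # "closed-novel-site.com"

    if hostname in rejected_sources:
        print("Rejected:", rejected_sources[hostname])

Scheme and path no longer affect the match, and the sources loader (see sources.py below) also registers the www-less variant of each rejected hostname, so both forms are caught.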
4 changes: 2 additions & 2 deletions lncrawl/core/__init__.py
@@ -1,6 +1,7 @@
 """
 Interactive application to take user inputs
 """
+
 import logging
 import os
 import signal
@@ -11,8 +12,7 @@
 from ..assets.version import get_version
 from ..bots import run_bot
 from .arguments import get_args
-from .display import (cancel_method, description, error_message,
-                      input_suppression)
+from .display import cancel_method, description, error_message, input_suppression
 from .logconfig import configure_logging
 from .proxy import load_proxies, start_proxy_fetcher, stop_proxy_fetcher
 from .sources import load_sources
5 changes: 4 additions & 1 deletion lncrawl/core/app.py
@@ -79,6 +79,7 @@ def prepare_search(self):
             str(link)
             for link, crawler in crawler_list.items()
             if crawler.search_novel != Crawler.search_novel
+            and link.startswith("http")
         ]
 
     def guess_novel_title(self, url: str) -> str:
@@ -148,7 +149,9 @@ def get_novel_info(self):
         )
 
         source_name = slugify(urlparse(self.crawler.home_url).netloc)
-        self.output_path = Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name
+        self.output_path = (
+            Path(C.DEFAULT_OUTPUT_PATH) / source_name / self.good_file_name
+        )
 
     # ----------------------------------------------------------------------- #
 
12 changes: 9 additions & 3 deletions lncrawl/core/display.py
@@ -139,9 +139,15 @@ def new_version_news(latest):
 def url_supported_list():
     from .sources import crawler_list
 
-    print("List of %d supported sources:" % len(crawler_list))
-    for url in sorted(crawler_list.keys()):
-        print(Fore.LIGHTGREEN_EX, Chars.RIGHT_ARROW, url, Fore.RESET)
+    crawlers = list(set(crawler_list.values()))
+    print(f"List of supported sources in {len(crawlers)} crawlers:")
+    for crawler in sorted(crawlers, key=lambda x: x.__name__):
+        crawler_name = crawler.__name__.split(".")[-1]
+        crawler_path = getattr(crawler, "file_path", crawler.__module__)
+        print(Fore.LIGHTGREEN_EX + Chars.RIGHT_ARROW, crawler_name + Fore.RESET, end="")
+        print(Style.DIM, "(" + crawler_path + ")", Style.RESET_ALL)
+        for url in crawler.base_url:
+            print(" " + Fore.CYAN + Chars.LINK, url + Fore.RESET)
 
 
 def url_not_recognized():
2 changes: 1 addition & 1 deletion lncrawl/core/scraper.py
@@ -206,7 +206,7 @@ def get_response(self, url, retry=1, timeout=(7, 301), **kwargs) -> Response:
             **kwargs,
         )
 
-    def post_response(self, url, data={}, retry=1, **kwargs) -> Response:
+    def post_response(self, url, data={}, retry=0, **kwargs) -> Response:
         """Make a POST request and return the response"""
         return self.__process_request(
             "post",
41 changes: 27 additions & 14 deletions lncrawl/core/sources.py
@@ -32,9 +32,9 @@
     "rejected_sources",
 ]
 
-rejected_sources = {}
 template_list: Set[Type[Crawler]] = set()
 crawler_list: Dict[str, Type[Crawler]] = {}
+rejected_sources: Dict[str, str] = {}
 
 # --------------------------------------------------------------------------- #
 # Utilities
@@ -98,7 +98,8 @@ def __load_current_index():
     if not index_file.is_file():
         index_file = __local_data_path / "sources" / "_index.json"
 
-    assert index_file.is_file(), "Invalid index file"
+    if not index_file.is_file():
+        raise LNException("Invalid index file")
 
     logger.debug("Loading current index data from %s", index_file)
     with open(index_file, "r", encoding="utf8") as fp:
@@ -157,6 +158,11 @@ def __check_updates():
 
     global rejected_sources
     rejected_sources = __current_index["rejected"]
+    for url, reason in rejected_sources.items():
+        no_www = url.replace("://www.", "://")
+        rejected_sources[no_www] = reason
+        rejected_sources[urlparse(url).hostname] = reason
+        rejected_sources[urlparse(no_www).hostname] = reason
 
 
 # --------------------------------------------------------------------------- #
@@ -239,8 +245,8 @@ def __import_crawlers(file_path: Path) -> List[Type[Crawler]]:
     if file_path in __cache_crawlers:
         return __cache_crawlers[file_path]
 
-    # logger.debug('+ %s', file_path)
-    assert file_path.is_file(), "Invalid crawler file path"
+    if not file_path.is_file():
+        raise LNException("Invalid crawler file path")
 
     try:
         module_name = hashlib.md5(file_path.name.encode()).hexdigest()
@@ -273,7 +279,8 @@ def __import_crawlers(file_path: Path) -> List[Type[Crawler]]:
         if not urls:
             continue
         for url in urls:
-            assert __url_regex.match(url), f"Invalid base url: {url} @{file_path}"
+            if not __url_regex.match(url):
+                raise LNException(f"Invalid base url: {url} @{file_path}")
 
         for method in ["read_novel_info", "download_chapter_body"]:
             if not hasattr(crawler, method):
@@ -309,8 +316,13 @@ def __add_crawlers_from_path(path: Path):
         crawlers = __import_crawlers(path)
         for crawler in crawlers:
             setattr(crawler, "file_path", str(path.absolute()))
-            for url in getattr(crawler, "base_url"):
+            base_urls: list[str] = getattr(crawler, "base_url")
+            for url in base_urls:
+                no_www = url.replace("://www.", "://")
                 crawler_list[url] = crawler
+                crawler_list[no_www] = crawler
+                crawler_list[urlparse(url).hostname] = crawler
+                crawler_list[urlparse(no_www).hostname] = crawler
     except Exception as e:
         logger.warning("Could not load crawlers from %s. Error: %s", path, e)
 
@@ -351,21 +363,22 @@ def prepare_crawler(url: str) -> Optional[Crawler]:
         return None
 
     parsed_url = urlparse(url)
-    base_url = "%s://%s/" % (parsed_url.scheme, parsed_url.hostname)
-    if base_url in rejected_sources:
-        raise LNException("Source is rejected. Reason: " + rejected_sources[base_url])
+    hostname = parsed_url.hostname
+    home_url = f"{parsed_url.scheme}://{hostname}/"
+
+    if hostname in rejected_sources:
+        raise LNException("Source is rejected. Reason: " + rejected_sources[hostname])
 
-    CrawlerType = crawler_list.get(base_url)
+    CrawlerType = crawler_list.get(hostname)
     if not CrawlerType:
-        raise LNException("No crawler found for " + base_url)
+        raise LNException("No crawler found for " + hostname)
 
     logger.info(
-        "Initializing crawler for: %s [%s]",
-        base_url,
+        f"Initializing crawler for: {home_url} [%s]",
        getattr(CrawlerType, "file_path", "."),
     )
     crawler = CrawlerType()
-    crawler.home_url = base_url
     crawler.novel_url = url
+    crawler.home_url = home_url
     crawler.initialize()
    return crawler
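Each crawler class is now registered under several keys — its full base URL, the www-less URL, and both hostname forms — so prepare_crawler can resolve a crawler from the hostname alone. A rough sketch of the idea, using a hypothetical crawler class and domain:

    from urllib.parse import urlparse

    crawler_list = {}

    class DummyCrawler:  # stand-in for a real Crawler subclass
        base_url = ["https://www.example-novels.com/"]

    for url in DummyCrawler.base_url:
        no_www = url.replace("://www.", "://")
        for key in (url, no_www, urlparse(url).hostname, urlparse(no_www).hostname):
            crawler_list[key] = DummyCrawler

    # prepare_crawler-style lookup: only the hostname of the input URL matters.
    hostname = urlparse("https://example-novels.com/novel/123/").hostname
    assert crawler_list.get(hostname) is DummyCrawler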
32 changes: 31 additions & 1 deletion lncrawl/templates/browser/general.py
@@ -33,6 +33,17 @@ def read_novel_info_in_scraper(self) -> None:
         except Exception as e:
             logger.warning("Failed to parse novel authors | %s", e)
 
+        try:
+            tags = set(list(self.parse_categories(soup)))
+            self.novel_tags = ", ".join(tags)
+        except Exception as e:
+            logger.warning("Failed to parse novel tags | %s", e)
+
+        try:
+            self.novel_synopsis = self.parse_summary(soup)
+        except Exception as e:
+            logger.warning("Failed to parse novel synopsis | %s", e)
+
         for item in self.parse_chapter_list(soup):
             if isinstance(item, Chapter):
                 self.chapters.append(item)
@@ -59,6 +70,17 @@ def read_novel_info_in_browser(self) -> None:
         except Exception as e:
             logger.warning("Failed to parse novel authors | %s", e)
 
+        try:
+            tags = set(list(self.parse_categories_in_browser()))
+            self.novel_tags = ", ".join(tags)
+        except Exception as e:
+            logger.warning("Failed to parse novel tags | %s", e)
+
+        try:
+            self.novel_synopsis = self.parse_summary_in_browser()
+        except Exception as e:
+            logger.warning("Failed to parse novel synopsis | %s", e)
+
         for item in self.parse_chapter_list_in_browser():
             if isinstance(item, Chapter):
                 self.chapters.append(item)
@@ -73,10 +95,18 @@ def parse_cover_in_browser(self) -> str:
         """Parse and return the novel cover image in the browser"""
         return self.parse_cover(self.browser.soup)
 
-    def parse_authors_in_browser(self) -> Generator[Tag, None, None]:
+    def parse_authors_in_browser(self) -> Generator[str, None, None]:
         """Parse and return the novel author in the browser"""
         yield from self.parse_authors(self.browser.soup)
 
+    def parse_categories_in_browser(self) -> Generator[str, None, None]:
+        """Parse and return the novel categories in the browser"""
+        yield from self.parse_categories(self.browser.soup)
+
+    def parse_summary_in_browser(self) -> str:
+        """Parse and return the novel summary or synopsis in the browser"""
+        return self.parse_summary(self.browser.soup)
+
     def parse_chapter_list_in_browser(
         self,
     ) -> Generator[Union[Chapter, Volume], None, None]:
53 changes: 36 additions & 17 deletions lncrawl/templates/madara.py
@@ -1,3 +1,4 @@
+import logging
 from urllib.parse import urlencode
 
 from bs4 import BeautifulSoup, Tag
@@ -6,6 +7,8 @@
 from lncrawl.templates.soup.chapter_only import ChapterOnlySoupTemplate
 from lncrawl.templates.soup.searchable import SearchableSoupTemplate
 
+logger = logging.getLogger(__name__)
+
 
 class MadaraTemplate(SearchableSoupTemplate, ChapterOnlySoupTemplate):
     is_template = True
@@ -51,35 +54,51 @@ def parse_cover(self, soup: BeautifulSoup) -> str:
             return self.absolute_url(tag["data-src"])
         if tag.has_attr("src"):
             return self.absolute_url(tag["src"])
-        return ''
+        return ""
 
     def parse_authors(self, soup: BeautifulSoup):
         for a in soup.select('.author-content a[href*="manga-author"]'):
             yield a.text.strip()
 
+    def parse_categories(self, soup):
+        for a in soup.select('.genres-content a[rel="tag"]'):
+            yield a.text.strip()
+
+    def parse_summary(self, soup):
+        possible_summary = soup.select_one(".description-summary a")
+        if possible_summary:
+            return self.cleaner.extract_contents(possible_summary)
+
     def select_chapter_tags(self, soup: BeautifulSoup):
         try:
             clean_novel_url = self.novel_url.split("?")[0].strip("/")
-            response = self.submit_form(f"{clean_novel_url}/ajax/chapters/", retry=0)
+            response = self.submit_form(f"{clean_novel_url}/ajax/chapters/")
             soup = self.make_soup(response)
             chapters = soup.select("ul.main .wp-manga-chapter a")
             if not chapters:
                 raise Exception("No chapters on first URL")
-        except Exception:
+            yield from reversed(chapters)
+            use_alternate = False
+        except Exception as e:
+            use_alternate = True
+            logger.debug("Failed to fetch chapters using ajax", e)
+
+        if use_alternate:
             nl_id = soup.select_one("#manga-chapters-holder[data-id]")
             if not isinstance(nl_id, Tag):
-                raise Exception('No chapter chapter id tag found')
-            response = self.submit_form(
-                f"{self.home_url}wp-admin/admin-ajax.php",
-                data={
-                    "action": "manga_get_chapters",
-                    "manga": nl_id["data-id"],
-                },
-            )
-            soup = self.make_soup(response)
-            chapters = soup.select("ul.main .wp-manga-chapter a")
-
-        yield from reversed(chapters)
+                logger.debug("No chapter id tag found for alternate method")
+                return
+            try:
+                response = self.submit_form(
+                    f"{self.home_url}wp-admin/admin-ajax.php",
+                    data={
+                        "action": "manga_get_chapters",
+                        "manga": nl_id["data-id"],
+                    },
+                )
+                soup = self.make_soup(response)
+                chapters = soup.select("ul.main .wp-manga-chapter a")
+                yield from reversed(chapters)
+            except Exception as e:
+                logger.debug("Failed to fetch chapters using alternate method", e)
 
     def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
         return Chapter(
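Because MadaraTemplate now carries the logger, tag/summary parsing, and the two-step chapter fetch itself, a site-specific crawler built on it typically only needs to declare its base URLs. A hypothetical example (the domain and file path are made up):

    # e.g. sources/en/m/examplemadara.py
    from lncrawl.templates.madara import MadaraTemplate


    class ExampleMadaraCrawler(MadaraTemplate):
        base_url = ["https://example-madara-site.com/"]
        # Title, cover, authors, categories, summary, and the ajax chapter
        # fetching (with the admin-ajax.php fallback) are all inherited from
        # the template; override only the selectors that differ on this site.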
22 changes: 20 additions & 2 deletions lncrawl/templates/soup/general.py
@@ -31,6 +31,17 @@ def read_novel_info(self) -> None:
         except Exception as e:
             logger.warning("Failed to parse novel authors | %s", e)
 
+        try:
+            tags = set(list(self.parse_categories(soup)))
+            self.novel_tags = ", ".join(tags)
+        except Exception as e:
+            logger.warning("Failed to parse novel tags | %s", e)
+
+        try:
+            self.novel_synopsis = self.parse_summary(soup)
+        except Exception as e:
+            logger.warning("Failed to parse novel synopsis | %s", e)
+
         for item in self.parse_chapter_list(soup):
             if isinstance(item, Chapter):
                 self.chapters.append(item)
@@ -50,10 +61,17 @@ def parse_cover(self, soup: BeautifulSoup) -> str:
         """Parse and return the novel cover image"""
         raise NotImplementedError()
 
-    @abstractmethod
     def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]:
         """Parse and return the novel authors"""
-        raise NotImplementedError()
+        return []
+
+    def parse_categories(self, soup: BeautifulSoup) -> Generator[str, None, None]:
+        """Parse and return the novel categories"""
+        return []
+
+    def parse_summary(self, soup: BeautifulSoup) -> str:
+        """Parse and return the novel summary or synopsis"""
+        return ""
 
     @abstractmethod
     def parse_chapter_list(
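Since parse_categories and parse_summary now default to empty values instead of being abstract, a concrete crawler opts in by overriding them. A minimal sketch under the assumption that the template class in soup/general.py is importable as shown; the site, class name, and CSS selectors are made up:

    from bs4 import BeautifulSoup

    from lncrawl.templates.soup.general import GeneralSoupTemplate


    class ExampleCrawler(GeneralSoupTemplate):
        base_url = ["https://example-novels.com/"]

        def parse_categories(self, soup: BeautifulSoup):
            # Hypothetical selector; use whatever the target site provides.
            for a in soup.select(".novel-genres a"):
                yield a.text.strip()

        def parse_summary(self, soup: BeautifulSoup) -> str:
            tag = soup.select_one(".novel-summary")
            return self.cleaner.extract_contents(tag) if tag else ""

        # parse_title, parse_cover, parse_chapter_list, etc. omitted here;
        # they remain required, as in the template above.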
3 changes: 2 additions & 1 deletion requirements-app.txt
@@ -13,7 +13,8 @@ PyExecJS>=1.5.1,<2.0.0
 ebooklib>=0.17.0,<1.0.0
 pillow>=6.0.0
 cloudscraper>=1.2.71
-lxml>=4.0.0,<5.0.0
+lxml>=5.0.0
+lxml-html-clean>=0.1.0
 questionary>=1.6.0
 prompt-toolkit~=3.0
 html5lib~=1.1
3 changes: 2 additions & 1 deletion requirements.txt
@@ -13,7 +13,8 @@ PyExecJS>=1.5.1,<2.0.0
 ebooklib>=0.17.0,<1.0.0
 pillow>=6.0.0
 cloudscraper>=1.2.71
-lxml>=4.0.0,<5.0.0
+lxml>=5.0.0
+lxml-html-clean>=0.1.0
 questionary>=1.6.0
 prompt-toolkit~=3.0
 html5lib~=1.1
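Recent lxml releases (5.2 onwards) ship the HTML cleaner as the separate lxml-html-clean project, which is presumably why it now accompanies the lxml>=5 pin. A quick sanity check that the cleaner is importable under the new split:

    # Requires lxml>=5.0.0 and lxml-html-clean>=0.1.0
    from lxml_html_clean import Cleaner

    cleaner = Cleaner(scripts=True, javascript=True, style=True)
    print(cleaner.clean_html("<p onclick='x()'>hello<script>x()</script></p>"))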