From 3f41d08bf4c25e2199179e5f5080f8b52d7f689f Mon Sep 17 00:00:00 2001 From: Zahar Yagodin Date: Fri, 22 Sep 2023 12:54:10 +0300 Subject: [PATCH 1/6] some code refactoring --- .gitignore | 2 + AKnigaParser.py | 126 ++++++++++++++++++++++++++++++++++++++++++++++++ akniga_dl.py | 113 ------------------------------------------- main.py | 19 ++++++++ 4 files changed, 147 insertions(+), 113 deletions(-) create mode 100644 .gitignore create mode 100644 AKnigaParser.py delete mode 100644 akniga_dl.py create mode 100644 main.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..be17459 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/.idea/ +/venv/ \ No newline at end of file diff --git a/AKnigaParser.py b/AKnigaParser.py new file mode 100644 index 0000000..58bb3f5 --- /dev/null +++ b/AKnigaParser.py @@ -0,0 +1,126 @@ +import subprocess +import brotli +from selenium.webdriver.chrome.service import Service as ChromeService +from seleniumwire import webdriver +from webdriver_manager.chrome import ChromeDriverManager +import json +import shutil +from pathlib import Path +from pathvalidate import sanitize_filename + + +class BookData: + def __init__(self, items): + self.title = items['title'] + self.res = items['res'] + self.hres = items['hres'] + self.srv = items['srv'] + self.sTextAuthor = items['sTextAuthor'] + self.sTextPerformer = items['sTextPerformer'] + self.sTextFav = items['sTextFav'] + self.items = items['items'] + self.topic_id = items['topic_id'] + self.titleonly = items['titleonly'] + self.slug = items['slug'] + self.version = items['version'] + self.bookurl = items['bookurl'] + self.preview = items['preview'] + self.author = items['author'] + self.sMsgTitle = items['sMsgTitle'] + self.sMsg = items['sMsg'] + self.bStateError = items['bStateError'] + self.m3u8_url = items['m3u8_url'] + self.chapters = items['chapters'] + + +class AKnigaParser: + book_url: str + book_requests: list + book_json: BookData + book_folder: Path + + def __init__(self, url, output_folder): + self.book_url = url + self.book_requests = self.get_book_requests() + book_json, m3u8_url = self.analyse_book_requests() + book_json['m3u8_url'] = m3u8_url + book_json['title'] = sanitize_filename(book_json.title) + book_json['chapters'] = json.loads(book_json.items) + self.book_json = BookData(book_json) + self.create_book_folder(output_folder) + + def create_book_folder(self, output_folder): + output_path = output_folder if Path(output_folder).is_absolute() else Path(__file__).parent / output_folder + self.book_folder = Path(output_path) / self.book_json.title + Path(self.book_folder).mkdir(parents=True) + + def get_book_requests(self) -> list: + print("Getting book requests. Please wait...") + service = ChromeService(executable_path=ChromeDriverManager().install()) + options = webdriver.ChromeOptions() + options.add_argument('headless') + with webdriver.Chrome(service=service, options=options) as driver: + driver.get(self.book_url) + return driver.requests + + def analyse_book_requests(self) -> tuple: + print('Analysing book requests...') + try: + # find request with book json data + book_json_requests = [r for r in self.book_requests if r.method == 'POST' and r.path.startswith('/ajax/b/')] + # assert that we have only 1 request for book data found + assert len(book_json_requests) == 1, 'Error: Book data not found. Exiting.' + print('Book data found') + # find request with m3u8 file + m3u8_file_requests = [r for r in self.book_requests if 'm3u8' in r.url] + # assert that we have only 1 request for m3u8 file found + assert len(m3u8_file_requests) == 1, 'Error: m3u8 file request not found. Exiting.' + print('m3u8 file found') + book_json = json.loads(brotli.decompress(book_json_requests[0].response.body)) + return book_json, m3u8_file_requests[0].url + except AssertionError as message: + print(message) + exit() + + def separate_into_chapters(self, full_mp3_filepath: Path): + print('Separating chapters. Please wait...') + for chapter in self.book_json.chapters: + chapter_path = self.book_folder / sanitize_filename(chapter['title']) + ffmpeg_command = ['ffmpeg', '-i', f'{full_mp3_filepath}.mp3', '-acodec', 'copy', '-ss', + str(chapter['time_from_start']), '-to', str(chapter['time_finish']), + f'{chapter_path}.mp3'] + subprocess.run(ffmpeg_command) + + def download_book(self, single_chapter: bool = False): + print('Downloading book. Please wait...') + if single_chapter: + filepath = self.book_folder / self.book_json.chapters[0]['title'] + else: + filepath = self.book_folder / self.book_json.title + ffmpeg_command = ['ffmpeg', '-i', self.book_json.m3u8_url, f'{filepath}.mp3'] + subprocess.run(ffmpeg_command) + + def run(self, delete_full_book_folder: bool = False, separate_into_chapters: bool = True): + if len(self.book_json.chapters) < 1: + return + if len(self.book_json.chapters) == 1 or not separate_into_chapters: + if len(self.book_json.chapters) == 1: + print("Only 1 chapter found") + else: + print("Multiple chapters found") + + print("Downloading full book without chapters separation") + self.download_book(single_chapter=True) # download directly in book folder + return + + print("Multiple chapters found") + full_book_folder = self.book_folder / 'full_book' + Path(full_book_folder).mkdir() + + print(f"Downloading full book with chapters separation, {'deleting' if delete_full_book_folder else 'keeping'} full book folder afterwards") + + self.download_book(single_chapter=False) + self.separate_into_chapters(full_book_folder / self.book_json.title) + + if delete_full_book_folder: + shutil.rmtree(full_book_folder, ignore_errors=True) diff --git a/akniga_dl.py b/akniga_dl.py deleted file mode 100644 index fa55ff0..0000000 --- a/akniga_dl.py +++ /dev/null @@ -1,113 +0,0 @@ -import argparse -import json -import shutil -import subprocess -import brotli -from pathlib import Path -from pathvalidate import sanitize_filename -from selenium.webdriver.chrome.service import Service as ChromeService -from seleniumwire import webdriver -from webdriver_manager.chrome import ChromeDriverManager - - -def get_book_requests(book_url: str) -> list: - print("Getting book requests. Please wait...") - service = ChromeService(executable_path=ChromeDriverManager().install()) - options = webdriver.ChromeOptions() - options.add_argument('headless') - with webdriver.Chrome(service=service, options=options) as driver: - driver.get(book_url) - return driver.requests - - -def analyse_book_requests(book_requests: list) -> tuple: - print('Analysing book requests...') - try: - # find request with book json data - book_json_requests = [r for r in book_requests if r.method == 'POST' and r.path.startswith('/ajax/b/')] - # assert that we have only 1 request for book data found - assert len(book_json_requests) == 1, 'Error: Book data not found. Exiting.' - print('Book data found') - # find request with m3u8 file - m3u8_file_requests = [r for r in book_requests if 'm3u8' in r.url] - # assert that we have only 1 request for m3u8 file found - assert len(m3u8_file_requests) == 1, 'Error: m3u8 file request not found. Exiting.' - print('m3u8 file found') - book_json = json.loads(brotli.decompress(book_json_requests[0].response.body)) - return book_json, m3u8_file_requests[0].url - except AssertionError as message: - print(message) - exit() - - -def separate_into_chapters(book_json: dict, full_mp3_filepath: Path, book_folder: Path): - print('Separating chapters. Please wait...') - for chapter in book_json['chapters']: - chapter_path = book_folder / sanitize_filename(chapter['title']) - ffmpeg_command = ['ffmpeg', '-i', f'{full_mp3_filepath}.mp3', '-acodec', 'copy', '-ss', - str(chapter['time_from_start']), '-to', str(chapter['time_finish']), f'{chapter_path}.mp3'] - subprocess.run(ffmpeg_command) - - -def download_book(book_json: dict, target_folder: Path, single_chapter: bool = False): - print('Downloading book. Please wait...') - if single_chapter: - filepath = target_folder / book_json['chapters'][0]['title'] - else: - filepath = target_folder / book_json['title'] - ffmpeg_command = ['ffmpeg', '-i', book_json['m3u8_url'], f'{filepath}.mp3'] - subprocess.run(ffmpeg_command) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Download a book from akniga.org') - parser.add_argument('url', help='Book\'s url for downloading') - parser.add_argument('output', help='Absolute or relative path where book will be downloaded') - group = parser.add_mutually_exclusive_group() - group.add_argument('-d', '--delete', action='store_true', - help='Delete full book folder, after chapter separation is done') - group.add_argument('-f', '--full', action='store_true', - help='Do not separate the book into multiple chapters, if any') - args = parser.parse_args() - print(args) - - book_requests = get_book_requests(args.url) - book_json, m3u8_url = analyse_book_requests(book_requests) - book_json['m3u8_url'] = m3u8_url - book_json['title'] = sanitize_filename(book_json['title']) - book_json['chapters'] = json.loads(book_json['items']) - - # check if output folder is an absolute or relative - if Path(args.output).is_absolute(): - output_path = args.output - else: - output_path = Path(__file__).parent / args.output - - # create book folder - book_folder = Path(output_path) / book_json['title'] - Path(book_folder).mkdir(parents=True) - - if len(book_json['chapters']) == 1: - print('Only one chapter found') - # download book directly in book folder - download_book(book_json, book_folder, single_chapter=True) - elif len(book_json['chapters']) >= 2: - print('Multiple chapters found') - if args.f: - print("Downloading full book without chapters separation") - # download book directly in book folder - download_book(book_json, book_folder, single_chapter=True) - else: - # create full book folder - full_book_folder = book_folder / 'full_book' - Path(full_book_folder).mkdir() - if args.d: - print("Downloading full book with chapters separation, deleting full book folder afterwards") - # download book in full book folder, delete it afterward - download_book(book_json, full_book_folder, single_chapter=False) - separate_into_chapters(book_json, full_book_folder / book_json['title'], book_folder) - shutil.rmtree(full_book_folder, ignore_errors=True) - else: - print("Downloading full book with chapters separation and keeping full book folder afterwards") - download_book(book_json, full_book_folder, single_chapter=False) - separate_into_chapters(book_json, full_book_folder / book_json['title'], book_folder) diff --git a/main.py b/main.py new file mode 100644 index 0000000..e7ea581 --- /dev/null +++ b/main.py @@ -0,0 +1,19 @@ +import argparse +from AKnigaParser import AKnigaParser + + +def parse_from_console(): + parser = argparse.ArgumentParser(description='Download a book from akniga.org') + parser.add_argument('url', help='Book\'s url for downloading') + parser.add_argument('output', help='Absolute or relative path where book will be downloaded') + group = parser.add_mutually_exclusive_group() + group.add_argument('-d', '--delete', action='store_true', + help='Delete full book folder, after chapter separation is done') + group.add_argument('-f', '--full', action='store_true', + help='Do not separate the book into multiple chapters, if any') + args = parser.parse_args() + AKnigaParser(args.url, args.output).run(args.d, not args.f) + + +if __name__ == "__main__": + parse_from_console() From 2c65d615f1341ee81a23ac35e9d92f66dd55307c Mon Sep 17 00:00:00 2001 From: Zahar Yagodin Date: Fri, 22 Sep 2023 13:09:29 +0300 Subject: [PATCH 2/6] fixed some things --- .gitignore | 3 ++- AKnigaParser.py | 4 ++-- main.py | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index be17459..a7988f1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /.idea/ -/venv/ \ No newline at end of file +/venv/ +/__pycache__/ \ No newline at end of file diff --git a/AKnigaParser.py b/AKnigaParser.py index 58bb3f5..10386f6 100644 --- a/AKnigaParser.py +++ b/AKnigaParser.py @@ -44,8 +44,8 @@ def __init__(self, url, output_folder): self.book_requests = self.get_book_requests() book_json, m3u8_url = self.analyse_book_requests() book_json['m3u8_url'] = m3u8_url - book_json['title'] = sanitize_filename(book_json.title) - book_json['chapters'] = json.loads(book_json.items) + book_json['title'] = sanitize_filename(book_json['title']) + book_json['chapters'] = json.loads(book_json['items']) self.book_json = BookData(book_json) self.create_book_folder(output_folder) diff --git a/main.py b/main.py index e7ea581..c2a3af7 100644 --- a/main.py +++ b/main.py @@ -15,5 +15,7 @@ def parse_from_console(): AKnigaParser(args.url, args.output).run(args.d, not args.f) -if __name__ == "__main__": - parse_from_console() +# if __name__ == "__main__": +# parse_from_console() + +AKnigaParser("https://akniga.org/magonote-rifudzin-tekuschee-polozhenie-1", "./").run(False, True) From 24800d70dde1ade3ef69f64bf5c4d57bbf30b3f0 Mon Sep 17 00:00:00 2001 From: Zahar Yagodin Date: Fri, 22 Sep 2023 13:11:24 +0300 Subject: [PATCH 3/6] fixed main command --- main.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index c2a3af7..e7ea581 100644 --- a/main.py +++ b/main.py @@ -15,7 +15,5 @@ def parse_from_console(): AKnigaParser(args.url, args.output).run(args.d, not args.f) -# if __name__ == "__main__": -# parse_from_console() - -AKnigaParser("https://akniga.org/magonote-rifudzin-tekuschee-polozhenie-1", "./").run(False, True) +if __name__ == "__main__": + parse_from_console() From 8cb1cb38d2186bb621a57cbf97b45116f70f71bb Mon Sep 17 00:00:00 2001 From: Zahar Yagodin Date: Sat, 30 Sep 2023 00:28:50 +0300 Subject: [PATCH 4/6] edited code --- AKnigaParser.py => akniga_parser.py | 42 ++++++++++++++++------------- 1 file changed, 23 insertions(+), 19 deletions(-) rename AKnigaParser.py => akniga_parser.py (81%) diff --git a/AKnigaParser.py b/akniga_parser.py similarity index 81% rename from AKnigaParser.py rename to akniga_parser.py index 10386f6..b32347b 100644 --- a/AKnigaParser.py +++ b/akniga_parser.py @@ -1,5 +1,6 @@ import subprocess import brotli +import requests from selenium.webdriver.chrome.service import Service as ChromeService from seleniumwire import webdriver from webdriver_manager.chrome import ChromeDriverManager @@ -36,7 +37,7 @@ def __init__(self, items): class AKnigaParser: book_url: str book_requests: list - book_json: BookData + book_data: BookData book_folder: Path def __init__(self, url, output_folder): @@ -46,19 +47,14 @@ def __init__(self, url, output_folder): book_json['m3u8_url'] = m3u8_url book_json['title'] = sanitize_filename(book_json['title']) book_json['chapters'] = json.loads(book_json['items']) - self.book_json = BookData(book_json) + self.book_data = BookData(book_json) self.create_book_folder(output_folder) - def create_book_folder(self, output_folder): - output_path = output_folder if Path(output_folder).is_absolute() else Path(__file__).parent / output_folder - self.book_folder = Path(output_path) / self.book_json.title - Path(self.book_folder).mkdir(parents=True) - def get_book_requests(self) -> list: print("Getting book requests. Please wait...") service = ChromeService(executable_path=ChromeDriverManager().install()) options = webdriver.ChromeOptions() - options.add_argument('headless') + # options.add_argument('headless') with webdriver.Chrome(service=service, options=options) as driver: driver.get(self.book_url) return driver.requests @@ -82,9 +78,14 @@ def analyse_book_requests(self) -> tuple: print(message) exit() + def create_book_folder(self, output_folder: str): + output_path = output_folder if Path(output_folder).is_absolute() else Path(__file__).parent / output_folder + self.book_folder = Path(output_path) / self.book_data.title + Path(self.book_folder).mkdir(parents=True, exist_ok=True) + def separate_into_chapters(self, full_mp3_filepath: Path): print('Separating chapters. Please wait...') - for chapter in self.book_json.chapters: + for chapter in self.book_data.chapters: chapter_path = self.book_folder / sanitize_filename(chapter['title']) ffmpeg_command = ['ffmpeg', '-i', f'{full_mp3_filepath}.mp3', '-acodec', 'copy', '-ss', str(chapter['time_from_start']), '-to', str(chapter['time_finish']), @@ -94,17 +95,19 @@ def separate_into_chapters(self, full_mp3_filepath: Path): def download_book(self, single_chapter: bool = False): print('Downloading book. Please wait...') if single_chapter: - filepath = self.book_folder / self.book_json.chapters[0]['title'] + filepath = self.book_folder / self.book_data.chapters[0]['title'] else: - filepath = self.book_folder / self.book_json.title - ffmpeg_command = ['ffmpeg', '-i', self.book_json.m3u8_url, f'{filepath}.mp3'] + filepath = self.book_folder / self.book_data.title + + requests.get(self.book_data.m3u8_url) + ffmpeg_command = ['ffmpeg', '-i', self.book_data.m3u8_url, f'{filepath}.mp3'] subprocess.run(ffmpeg_command) def run(self, delete_full_book_folder: bool = False, separate_into_chapters: bool = True): - if len(self.book_json.chapters) < 1: + if len(self.book_data.chapters) < 1: return - if len(self.book_json.chapters) == 1 or not separate_into_chapters: - if len(self.book_json.chapters) == 1: + if len(self.book_data.chapters) == 1 or not separate_into_chapters: + if len(self.book_data.chapters) == 1: print("Only 1 chapter found") else: print("Multiple chapters found") @@ -114,13 +117,14 @@ def run(self, delete_full_book_folder: bool = False, separate_into_chapters: boo return print("Multiple chapters found") - full_book_folder = self.book_folder / 'full_book' - Path(full_book_folder).mkdir() + full_book_folder = self.book_folder + Path(full_book_folder).mkdir(exist_ok=True) - print(f"Downloading full book with chapters separation, {'deleting' if delete_full_book_folder else 'keeping'} full book folder afterwards") + print( + f"Downloading full book with chapters separation, {'deleting' if delete_full_book_folder else 'keeping'} full book folder afterwards") self.download_book(single_chapter=False) - self.separate_into_chapters(full_book_folder / self.book_json.title) + self.separate_into_chapters(full_book_folder / self.book_data.title) if delete_full_book_folder: shutil.rmtree(full_book_folder, ignore_errors=True) From c55ac2e48390b9047d83bdba6434a324e666f625 Mon Sep 17 00:00:00 2001 From: Zahar Yagodin Date: Mon, 2 Oct 2023 13:26:45 +0300 Subject: [PATCH 5/6] fixed code --- akniga_parser.py | 11 +++-------- main.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/akniga_parser.py b/akniga_parser.py index b32347b..b5a3af2 100644 --- a/akniga_parser.py +++ b/akniga_parser.py @@ -1,6 +1,5 @@ import subprocess import brotli -import requests from selenium.webdriver.chrome.service import Service as ChromeService from seleniumwire import webdriver from webdriver_manager.chrome import ChromeDriverManager @@ -54,7 +53,7 @@ def get_book_requests(self) -> list: print("Getting book requests. Please wait...") service = ChromeService(executable_path=ChromeDriverManager().install()) options = webdriver.ChromeOptions() - # options.add_argument('headless') + options.add_argument('headless') with webdriver.Chrome(service=service, options=options) as driver: driver.get(self.book_url) return driver.requests @@ -99,11 +98,10 @@ def download_book(self, single_chapter: bool = False): else: filepath = self.book_folder / self.book_data.title - requests.get(self.book_data.m3u8_url) ffmpeg_command = ['ffmpeg', '-i', self.book_data.m3u8_url, f'{filepath}.mp3'] subprocess.run(ffmpeg_command) - def run(self, delete_full_book_folder: bool = False, separate_into_chapters: bool = True): + def run(self, separate_into_chapters: bool = True): if len(self.book_data.chapters) < 1: return if len(self.book_data.chapters) == 1 or not separate_into_chapters: @@ -121,10 +119,7 @@ def run(self, delete_full_book_folder: bool = False, separate_into_chapters: boo Path(full_book_folder).mkdir(exist_ok=True) print( - f"Downloading full book with chapters separation, {'deleting' if delete_full_book_folder else 'keeping'} full book folder afterwards") + f"Downloading full book with chapters separation, keeping full book afterwards") self.download_book(single_chapter=False) self.separate_into_chapters(full_book_folder / self.book_data.title) - - if delete_full_book_folder: - shutil.rmtree(full_book_folder, ignore_errors=True) diff --git a/main.py b/main.py index e7ea581..def16ce 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,5 @@ import argparse -from AKnigaParser import AKnigaParser +from akniga_parser import AKnigaParser def parse_from_console(): @@ -7,13 +7,21 @@ def parse_from_console(): parser.add_argument('url', help='Book\'s url for downloading') parser.add_argument('output', help='Absolute or relative path where book will be downloaded') group = parser.add_mutually_exclusive_group() - group.add_argument('-d', '--delete', action='store_true', - help='Delete full book folder, after chapter separation is done') group.add_argument('-f', '--full', action='store_true', help='Do not separate the book into multiple chapters, if any') args = parser.parse_args() - AKnigaParser(args.url, args.output).run(args.d, not args.f) + AKnigaParser(args.url, args.output).run(not args.f) if __name__ == "__main__": parse_from_console() + + +# OR +# files = [ +# 'https://akniga.org/url1', +# 'https://akniga.org/url2', +# ] +# +# for url in files: +# AKnigaParser(url, '').run(True) From 4c6163162d63a7cb3b5631bef7e71afeba397208 Mon Sep 17 00:00:00 2001 From: Zahar Yagodin <112830043+HuTao1Love@users.noreply.github.com> Date: Mon, 2 Oct 2023 13:31:01 +0300 Subject: [PATCH 6/6] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f9e4ee..a6611da 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,9 @@ ffmpeg is also required * Make sure you've added ffmpeg.exe path to PATH environment variable as on the video # Usage: -usage: akniga_dl.py [-h] [-d | -f] url output +usage: akniga_dl.py [-h] [-f] url output +if you know what to do, you can do easier: +watch commented lines in main.py Download a book from akniga.org @@ -21,7 +23,6 @@ positional arguments: options: -h, --help show this help message and exit - -d, --delete Delete full book folder, after chapter separation is done -f, --full Do not separate the book into multiple chapters, if any Where: