zippy.py

import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup, NavigableString
import argparse
import warnings
import logging

__version__ = "1.0.1"

# Configure logging
logging.basicConfig(level=logging.ERROR)


def parse_algorithm(algorithm):
    try:
        res = {
            'exclude': True,
            'sizes': [],
            'restRatio': 0.4
        }
        parts = algorithm.split(" ")
        if parts[0] == "+":
            res['exclude'] = False
        res['restRatio'] = float(parts[-1])
        for i in range(1, len(parts) - 1):
            res['sizes'].append(int(parts[i]))
        return res
    except Exception:
        # Malformed algorithm string: fall back to the default settings.
        defaultRes = {
            'exclude': True,
            'sizes': [1, 1, 2],
            'restRatio': 0.4
        }
        return defaultRes

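
# Illustrative note, derived from parse_algorithm above and bionify_word below (not part of
# the original script): the algorithm string starts with "-" (skip short common words) or
# "+" (bold them too), continues with one bold-character count per word length, and ends
# with the ratio applied to words longer than the size table. The default string parses as:
#   parse_algorithm("- 0 1 1 2 0.4")
#   -> {'exclude': True, 'sizes': [0, 1, 1, 2], 'restRatio': 0.4}
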
def bionify_word(word, algorithm, common_words):
    def is_common(word):
        return word.lower() in common_words

    # Guard against empty tokens produced by consecutive spaces.
    if not word:
        return word
    index = len(word) - 1
    num_bold = 1
    # Optionally leave short, common words untouched.
    if len(word) <= 3 and algorithm['exclude']:
        if is_common(word):
            return word
    # Use the per-length size table, falling back to a ratio of the word length.
    if index < len(algorithm['sizes']):
        num_bold = algorithm['sizes'][index]
    else:
        num_bold = int(len(word) * algorithm['restRatio'])
    return f"<b>{word[:num_bold]}</b>{word[num_bold:]}"

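
# Worked example, derived from the rules above (not part of the original script), using the
# parsed default algorithm {'exclude': True, 'sizes': [0, 1, 1, 2], 'restRatio': 0.4}:
#   bionify_word("reading", algorithm, common_words) -> "<b>re</b>ading"
#     (seven letters fall outside the size table, so int(7 * 0.4) = 2 characters are bolded)
#   bionify_word("the", algorithm, common_words) -> "the"
#     (three letters or fewer, in common_words, and 'exclude' is set, so it is left as is)
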
def bionify_text(text, algorithm, common_words):
    res = ""
    for word in text.split(" "):
        res += bionify_word(word, algorithm, common_words) + " "
    return res.strip()

def bionify_node(node, algorithm, common_words):
    # Skip empty text nodes and content that must not be rewritten.
    if node is None or (isinstance(node, str) and node.strip() == "") or node.name in ["script", "style"]:
        return
    if isinstance(node, NavigableString):
        bionified_html = bionify_text(node, algorithm, common_words)
        new_soup = BeautifulSoup(bionified_html, 'html.parser')
        node.replace_with(new_soup)
    else:
        # Copy the child list first, since replace_with mutates the tree while we iterate.
        for child in list(node.children):
            bionify_node(child, algorithm, common_words)

def bionify_ebook(input_path, output_path, algorithm_str):
    # Suppress a known warning raised while reading the ePub.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='This search incorrectly ignores the root element, and will be fixed in a future version')
        book = epub.read_epub(input_path, options={'ignore_ncx': True})
    common_words = ["the", "be", "to", "of", "and", "a", "an", "it", "at", "on", "he", "she", "but", "is", "my"]
    algorithm = parse_algorithm(algorithm_str)
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        soup = BeautifulSoup(item.get_body_content(), 'html.parser')
        if soup.body is not None:
            bionify_node(soup.body, algorithm, common_words)
            item.set_content(str(soup))
    try:
        epub.write_epub(output_path, book)
    except Exception as e:
        logging.error(f"Could not write ePub: {e}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Bionify ePub eBooks.")
    parser.add_argument('input', help="Input ePub file path")
    parser.add_argument('output', help="Output ePub file path")
    parser.add_argument('--algorithm', default="- 0 1 1 2 0.4", help="Bionification algorithm")
    parser.add_argument('-v', '--version', action='version', version=f'%(prog)s {__version__}')
    args = parser.parse_args()
    bionify_ebook(args.input, args.output, args.algorithm)
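
# Example invocation (the file names are illustrative; the algorithm string shown is the
# argparse default above):
#   python zippy.py input.epub bionified.epub --algorithm "- 0 1 1 2 0.4"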