-
Notifications
You must be signed in to change notification settings - Fork 66
/
text_transform.py
101 lines (81 loc) · 2.9 KB
/
text_transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import re
import lxml.html
import lxml
def fix_html(text, remove_wrapper=False):
    """Fix invalid html, remove unnecessary attribs, tags and whitespace.

    The markup is parsed with lxml and the top-level children of the
    resulting ``<body>`` are serialized back, one per line.

    Args:
        text: HTML (or plain text) fragment to normalize.
        remove_wrapper: When true, a leading ``<p>`` and a trailing ``</p>``
            are stripped from the serialized result.

    Returns:
        The normalized markup, or ``""`` when the input contains nothing
        but whitespace.
    """
    text = text.strip()
    text = text.replace("\r", "")  # remove \r as lxml escapes it
    if not text:
        # lxml raises ParserError ("Document is empty") for an empty
        # document, so there is nothing to parse or return here.
        return ""

    doc = lxml.html.document_fromstring(text)

    # Drop anchor identifiers; only the link target itself is kept.
    for anchor in doc.body.findall(".//a"):
        anchor.attrib.pop("id", None)
        anchor.attrib.pop("name", None)

    # NOTE(review): ``elem.text`` is only the text before the first child,
    # so an element whose content starts with a child tag is skipped along
    # with genuinely empty elements -- confirm this is intended.
    children = [
        lxml.etree.tostring(elem, encoding="unicode")
        for elem in doc.body
        if elem.text
    ]
    text = "\n".join(children)

    if remove_wrapper:
        # No MULTILINE flag: only the very first <p> and very last </p> go.
        text = re.sub(r"^<p>", "", text)
        text = re.sub(r"</p>$", "", text)
    text = re.sub(r"</?c_q\d+>", "", text)  # remove <c_q10> like tags
    return text
def standardize_terms(text):
    """Normalize honorifics and titles in English text.

    Spelled-out or abbreviated salutations are replaced by the single
    character U+FDFA, "Apostle" is replaced by "Messenger", and
    "Holy Prophet" by "Prophet". Afterwards the honorific ``(U+FDFA)`` is
    appended to each title that is not already followed by one.

    Several patterns start at "he" rather than "The"/"the", so they match
    whatever the preceding letter is.

    Args:
        text: The text to normalize.

    Returns:
        The text with the replacements applied.
    """
    honorific = "\ufdfa"
    wrapped = "(" + honorific + ")"
    replacements = (
        ("PBUH", honorific),
        ("P.B.U.H.", honorific),
        ("peace_be_upon_him", honorific),
        ("(may peace be upon him)", wrapped),
        ("(saws)", wrapped),
        ("(SAW)", wrapped),
        ("(saw)", wrapped),
        ("he Apostle of Allah", "he Messenger of Allah"),
        ("he Apostle of Allaah", "he Messenger of Allah"),
        ("Allah's Apostle", "Allah's Messenger"),
        ("he Holy Prophet ", "he Prophet "),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Append the honorific after each title unless one is already there.
    # The lookahead rejects both "(" (parenthesized form) and the bare
    # character, which the plain replacements above can leave behind;
    # otherwise "the Prophet PBUH" would end up with two honorifics.
    titles = ("Allah's Messenger", "he Messenger of Allah", "he Prophet")
    for title in titles:
        text = re.sub(
            re.escape(title) + r" (?![(\ufdfa])",
            title + " " + wrapped + " ",
            text,
        )
    return text
def fix_hyperlinks(text):
    """Converts links to ayah and hadith to quran.com and sunnah.com respectively.

    Site-relative ``href="/..."`` targets are prefixed with
    ``https://sunnah.com/``. Every ``javascript:openquran(surah,begin,end)``
    target becomes ``https://quran.com/<surah + 1>/<begin>-<end>``.

    Args:
        text: Markup whose link targets should be rewritten.

    Returns:
        The markup with rewritten link targets.

    Raises:
        ValueError: If an ``openquran(...)`` call does not have exactly three
            comma-separated arguments, or its first argument is not an
            integer.
    """
    text = text.replace('href="/', 'href="https://sunnah.com/')

    def _to_quran_url(match):
        # Strip each argument so "1, 2, 3" does not leak spaces into the URL.
        surah, begin, end = (part.strip() for part in match.group(1).split(","))
        # The surah number in the source link is one less than quran.com's
        # (presumably zero-based -- verify against the data).
        return "https://quran.com/{}/{}-{}".format(int(surah) + 1, begin, end)

    return re.sub(r"javascript:openquran\((.+?)\)", _to_quran_url, text)
def cleanup_text(text):
    """Collapse whitespace runs, repair the HTML and rewrite hyperlinks.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if text:
        # Squeeze repeated newlines first, then repeated spaces.
        collapsed = re.sub(r" +", " ", re.sub(r"\n+", "\n", text))
        linked = fix_hyperlinks(fix_html(collapsed))
        return linked.strip()
    return text
def cleanup_en_text(text):
    """Run ``cleanup_text`` and then ``standardize_terms`` on English text.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if text:
        return standardize_terms(cleanup_text(text))
    return text
def cleanup_chapter_title(text):
    """Clean a chapter title: collapse whitespace, repair HTML, fix links.

    Same steps as the general text cleanup, except that ``fix_html`` is
    asked to remove the wrapping ``<p>`` element. Falsy input (``None`` or
    ``""``) is returned unchanged.
    """
    if text:
        # Squeeze repeated newlines first, then repeated spaces.
        collapsed = re.sub(r" +", " ", re.sub(r"\n+", "\n", text))
        unwrapped = fix_html(collapsed, remove_wrapper=True)
        return fix_hyperlinks(unwrapped).strip()
    return text
def cleanup_en_chapter_title(text):
    """Run ``cleanup_chapter_title`` and then ``standardize_terms``.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if text:
        return standardize_terms(cleanup_chapter_title(text))
    return text