-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
html_fix.py
71 lines (57 loc) · 2.15 KB
/
html_fix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pdfkit
import requests
import re
import bs4
def remove_tags(html, val, element_type, tag="div"):
    """
    Removes unwanted tags like sidebar, leaderboard, menus

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, the
    matching element(s) are removed from the tree, and the tree is
    serialised back to a string.

    Args:
        html: HTML markup to clean up.
        val: Value of the ``id`` or ``class`` attribute to look for.
        element_type: ``"id"`` removes the first ``tag`` element found with
            ``id=val``; ``"class"`` removes every ``tag`` element found
            with ``{"class": val}``.
        tag: Name of the tag to search for. Defaults to ``"div"``.

    Returns:
        The modified markup as a string.

    Raises:
        ValueError: If ``element_type`` is neither ``"id"`` nor ``"class"``.
    """

    def remove_by_id(soup, id_value, tag="div"):
        """
        Removes html tag by id
        """
        # Look the element up once and reuse the result.
        element = soup.find(tag, id=id_value)
        if element is not None:
            element.decompose()
        return soup

    def remove_by_class(soup, class_value, tag="div"):
        """
        Removes html tag by class
        """
        for element in soup.find_all(tag, {"class": class_value}):
            element.decompose()
        return soup

    # Validate before parsing: an unsupported selector kind used to fall
    # through and return None, which would silently replace the caller's
    # markup with None.
    if element_type == "id":
        remover = remove_by_id
    elif element_type == "class":
        remover = remove_by_class
    else:
        raise ValueError(
            "element_type must be 'id' or 'class', got {!r}".format(element_type)
        )

    soup = bs4.BeautifulSoup(html, "html.parser")
    return str(remover(soup, val, tag))
# Ordered (pattern, replacement) pairs applied to the raw page bytes so that
# css/js/images are loaded from the Codeforces server instead of through
# protocol-relative or site-relative URLs. Order matters: the `src="//`
# rewrite must run before the `src="/predownloaded/` one.
_URL_REWRITES = (
    (rb'href="//', rb'href="https://'),
    (rb'src="//', rb'src="https://'),
    # The pattern consumes the attribute's closing quote, so the replacement
    # must put it back; otherwise the attribute is left unterminated.
    (rb'/scripts/(\w+)/en/codeforces-options\.js"',
     rb'https://codeforces.com/scripts/\1/en/codeforces-options.js"'),
    (rb"href='//", rb"href='https://"),
    (rb'src = "//', rb'src = "https://'),
    (rb'src="/predownloaded/',
     rb'src="https://codeforces.com/predownloaded/'),
)


def _absolutize_urls(html):
    """Apply every rewrite in ``_URL_REWRITES``, in order, to ``html`` (bytes)."""
    for pattern, replacement in _URL_REWRITES:
        html = re.sub(pattern, replacement, html)
    return html


def fix_broken_links(LINK, timeout=30):
    """
    Downloads a page and returns its html with links fixed and clutter removed

    Protocol-relative and site-relative ``href``/``src`` URLs are rewritten
    to absolute ``https://`` URLs, then the sidebar, menu boxes, footer and
    second-level menu are stripped with ``remove_tags``.

    Args:
        LINK: URL of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the call forever. Defaults to 30.

    Returns:
        The cleaned-up html as a string.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    html = requests.get(LINK, timeout=timeout).content

    # Fix css and js
    # Take css and js from cf server
    html = _absolutize_urls(html)

    # Decode html bytes to string using UTF-8
    html = html.decode(encoding="utf-8")

    tags_to_remove = [
        ("id", "sidebar"),
        ("class", "roundbox menu-box"),
        ("id", "footer"),
        ("class", "second-level-menu"),
    ]
    for element_type, element in tags_to_remove:
        html = remove_tags(html, val=element, element_type=element_type)
    return html