-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathhtml_checker.py
executable file
·181 lines (148 loc) · 5.53 KB
/
html_checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/python3
"""
Checks html syntax.
"""
import argparse
import re
import sys
from html.parser import HTMLParser

try:
    from typing import List, Set, Dict # noqa F401
except ImportError:
    print("WARNING: Typing module is not found.")

from html_content_spec import content_spec, _ANY_CONTENT, _NO_CONTENT
# When True, start tags are additionally checked against content_spec
# (switched on by the -d command-line flag).
DEV_FEATURE_ON = False  # type: bool

# Process exit statuses.  PARSE_ERROR is used when any ERROR was reported;
# ARG_ERROR is not referenced by the code in this file.
ARG_ERROR = 1  # type: int
PARSE_ERROR = 2  # type: int

# Text nodes longer than this many characters trigger a "long line" warning.
MAX_LINE = 80  # type: int

# Key inside content_spec that holds tags exempt from content-model checks.
EXCEPS = "_EXCEPTIONS"

# Parser state shared between the callbacks and the main loop.
tag_stack = []  # type: List[str]   # currently open (non-void) tags
line_no = 0  # type: int            # number of the line being fed
saw_error = False  # type: bool     # set once any ERROR has been printed
tag_error = False  # type: bool
tag_check = False  # type: bool     # mirrors the -t command-line flag

# Void elements never take a closing tag, so they are never pushed onto
# tag_stack.  The list follows the HTML standard's void elements ("param"
# is kept for older documents); embed/source/track/wbr were missing, which
# made e.g. <source ...> inside <video> produce false mismatch errors.
void_tags = {"area", "base", "br", "col", "embed", "hr", "img", "input",
             "link", "meta", "param", "source", "track",
             "wbr"}  # type: Set[str]

# Tags whose content needs special treatment; each flag is True while the
# parser is inside that tag.
in_sig_tag = {"pre": False, "script": False, "a": False,
              "style": False}  # type: Dict[str, bool]
def line_msg():  # type: () -> str
    """
    Build the location suffix shared by all diagnostics: the text
    " at line number N", where N is the module-level line_no counter.
    """
    suffix = " at line number {}".format(line_no)
    return suffix
def is_tag_in_spec(tag):  # type: (str) -> bool
    """
    Report whether content_spec knows about the tag, either as a regular
    entry or as one of the entries listed under the exceptions key.

    Prints a warning and returns False for an unknown tag; returns True
    otherwise.
    """
    is_known = tag in content_spec or tag in content_spec[EXCEPS]
    if is_known:
        return True
    # An unknown tag is only worth a warning: saw_error is deliberately
    # left untouched here.
    print("WARNING: " + tag + " not found in content_spec")
    return False
def is_valid_content(tag, attrs):  # type: (str, object) -> bool
    """
    Checks if the given tag may be placed inside the currently open parent
    tag (the top of tag_stack), according to content_spec.

    tag: name of the start tag being opened.
    attrs: the tag's attributes; accepted for the caller's convenience but
        not used by the check.

    Returns True when the tag is allowed or when no judgement can be made
    (unknown tag, no open parent, tag or parent listed as an exception, or
    every open ancestor has a transparent content model).  Returns False
    when the governing parent's content model rejects the tag.
    """
    # If we don't know about the tag we do no checks; is_tag_in_spec has
    # already informed the user.
    if not is_tag_in_spec(tag):
        return True

    exceptions = content_spec[EXCEPS]
    if not tag_stack or tag in exceptions:
        return True

    # A transparent content model defers to the next older ancestor, so walk
    # the stack from the innermost open tag outwards until a model that is
    # not transparent is found.  Iterating over the stack (rather than
    # indexing with an ever-decreasing offset) cannot run past the outermost
    # tag.
    parent_model = []
    for ptag in reversed(tag_stack):
        if not is_tag_in_spec(ptag) or ptag in exceptions:
            # Parent tag not in spec or is part of exceptions: no verdict.
            return True
        parent_model = content_spec[ptag]["content_model"]
        if "transparent" not in parent_model:
            break
    else:
        # Every open ancestor is transparent, so nothing constrains the tag.
        return True

    tag_categories = content_spec[tag]["categories"]
    for model in parent_model:
        # These checks do not depend on the tag's categories, so they are
        # made once per model entry and still apply to a tag that has no
        # categories at all.
        if model == _NO_CONTENT:
            # Parent expects no child tags: the tag is illegal.
            return False
        if model == _ANY_CONTENT or model == tag:
            return True
        if any(model == category for category in tag_categories):
            return True
    return False
class OurHTMLParser(HTMLParser):
    """
    Our descendant of base HTMLParser class: we override just the methods we
    need to.  The callbacks keep their state in the module-level tag_stack,
    in_sig_tag and saw_error variables and print diagnostics as they go.
    """
    def __init__(self):  # type: () -> None
        # convert_charrefs=False keeps character references out of the text
        # passed to handle_data, so an escaped "&lt;" is not mistaken for a
        # raw "<".
        super(OurHTMLParser, self).__init__(convert_charrefs=False)

    def handle_starttag(self, tag, attrs):  # type: (str, object) -> None
        """
        Callback invoked by HTMLParser for every start tag: records entry
        into a significant tag and pushes non-void tags onto tag_stack.
        """
        if tag in in_sig_tag:
            in_sig_tag[tag] = True
        if tag in void_tags:
            # Void tags have no closing tag, so they never go on the stack.
            return
        if DEV_FEATURE_ON and is_valid_content(tag, attrs) is False:
            print("ERROR: illegal tag" + line_msg() + ". ")
        tag_stack.append(tag)

    def handle_endtag(self, tag):  # type: (str) -> None
        """
        Callback invoked by HTMLParser for every end tag: checks that it
        matches the most recently opened tag and flags an error otherwise.
        """
        global saw_error
        # Void tags are never pushed, so they must never be matched against
        # the stack either.  HTMLParser also calls this method for
        # self-closing tags such as <br/>; testing for void tags first keeps
        # those from being reported as unmatched when nothing is open.
        if tag not in void_tags:
            if not tag_stack:
                print("ERROR: unmatched close tag '" + tag + "'" + line_msg())
                saw_error = True
            else:
                open_tag = tag_stack.pop()
                if tag != open_tag:
                    print("ERROR: " +
                          "Close tag '" + tag +
                          "' does not match open tag '" + open_tag +
                          "'" + line_msg())
                    saw_error = True
        if tag in in_sig_tag:
            in_sig_tag[tag] = False

    def handle_data(self, data):  # type: (str) -> None
        """
        Callback invoked by HTMLParser for text between tags.  Here we look
        for long lines, tab characters and unescaped angle brackets.
        """
        global saw_error
        in_script = in_sig_tag["script"]
        # Long lines are tolerated inside <pre>, <a> and <script>.
        if not (in_sig_tag["pre"] or in_sig_tag["a"] or in_script):
            if len(data) > MAX_LINE:
                print("WARNING: long line found" + line_msg())
                print(data)
        if "\t" in data:
            print("WARNING: tab character found" + line_msg() +
                  "; please uses spaces instead of tabs.")
        # Script and style bodies are raw text in which "<" and ">" are
        # legitimate (comparison operators, CSS child combinators), so only
        # ordinary text is checked for unescaped angle brackets.
        in_raw_text = in_script or in_sig_tag["style"]
        if not in_raw_text and re.search('[<>]', data):
            print("ERROR: Use &gt; or &lt; instead of > or <" + line_msg())
            saw_error = True
# Script entry point: parse the command line, feed the file to the parser
# one line at a time, and exit with PARSE_ERROR if any error was reported.
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("html_filename")
    arg_parser.add_argument("-t", action="store_true")
    arg_parser.add_argument("-d", action="store_true",
                            help="turns on dev features")
    args = arg_parser.parse_args()

    parser = OurHTMLParser()
    file_nm = args.html_filename
    tag_check = args.t
    if args.d:
        DEV_FEATURE_ON = True

    # Feeding line by line lets line_no track the position used in the
    # diagnostics; the with-block guarantees the file is closed.
    with open(file_nm, "r") as html_file:
        for line in html_file:
            line_no += 1
            parser.feed(line)
    # Force processing of anything the parser is still buffering (e.g. an
    # incomplete construct at the end of the file).
    parser.close()

    sys.exit(PARSE_ERROR if saw_error else 0)