forked from html5lib/html5lib-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
executable file
·244 lines (199 loc) · 8.99 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/env python
"""usage: %prog [options] filename
Parse a document to a tree, with optional profiling
"""
import sys
import traceback
from optparse import OptionParser
from html5lib import html5parser
from html5lib import treebuilders, serializer, treewalkers
from html5lib import constants
from html5lib import _utils
def parse():
    """Command-line entry point: parse one document and report the result.

    Options come from ``getOptParser()``.  The last positional argument
    names the input and may be:

    * an ``http://`` or ``https://`` URL, fetched with ``urllib.request``;
      a charset declared in the ``Content-Type`` header overrides the
      default ``"utf8"`` encoding;
    * ``-`` for standard input (no encoding override on Python 3);
    * anything else, which is opened as a binary file.

    The input is handed to an ``html5parser.HTMLParser`` built on the tree
    builder named by ``--treebuilder`` and parsed as a document or, with
    ``--fragment``, as a fragment.  With ``--profile`` the run is profiled
    with cProfile and only the statistics are printed; with ``--time`` the
    run is timed and the timings go to stderr; otherwise the result is
    simply passed to ``printOutput()``.

    Exits with status 1 when no input is named or it cannot be opened.
    """
    optParser = getOptParser()
    opts, args = optParser.parse_args()
    encoding = "utf8"

    if not args:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)
    source = args[-1]

    if source.startswith(('http://', 'https://')):
        # Try opening from the internet
        try:
            # Imported lazily: only this branch needs it.
            import urllib.request
            f = urllib.request.urlopen(source)
        except Exception as e:
            # A failed fetch must not fall through: the URL text itself
            # would otherwise be handed to the parser as if it were markup.
            sys.stderr.write("Unable to open URL: %s\n" % e)
            sys.exit(1)
        if f.headers.get('content-type'):
            # The response headers parse their own Content-Type; this avoids
            # the ``cgi`` module, which was removed in Python 3.13.
            # Yields None when the header carries no charset parameter.
            encoding = f.headers.get_content_charset()
    elif source == '-':
        f = sys.stdin
        if sys.version_info[0] >= 3:
            encoding = None
    else:
        try:
            # Try opening from file system
            f = open(source, "rb")
        except IOError as e:
            sys.stderr.write("Unable to open file: %s\n" % e)
            sys.exit(1)

    try:
        treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)
        p = html5parser.HTMLParser(tree=treebuilder, debug=opts.log)
        parseMethod = p.parseFragment if opts.fragment else p.parse

        if opts.profile:
            import cProfile
            import pstats
            cProfile.runctx("run(parseMethod, f, encoding, scripting)", None,
                            {"run": run,
                             "parseMethod": parseMethod,
                             "f": f,
                             "encoding": encoding,
                             "scripting": opts.scripting},
                            "stats.prof")
            # XXX - We should use a temp file here
            stats = pstats.Stats('stats.prof')
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        elif opts.time:
            import time
            t0 = time.time()
            document = run(parseMethod, f, encoding, opts.scripting)
            t1 = time.time()
            # Compare against None rather than truth-testing: an ElementTree
            # element without child elements is falsy, which would silently
            # drop e.g. a text-only fragment.
            if document is not None:
                printOutput(p, document, opts)
                t2 = time.time()
                sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
            else:
                sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
        else:
            document = run(parseMethod, f, encoding, opts.scripting)
            if document is not None:
                printOutput(p, document, opts)
    finally:
        # Release the file / HTTP response we opened; stdin is not ours.
        if f is not sys.stdin:
            f.close()
def run(parseMethod, f, encoding, scripting):
    """Call *parseMethod* on *f*, reporting failures instead of raising.

    Args:
        parseMethod: callable invoked as
            ``parseMethod(f, override_encoding=encoding, scripting=scripting)``.
        f: the input passed through to *parseMethod* unchanged.
        encoding: value forwarded as ``override_encoding``.
        scripting: value forwarded as ``scripting``.

    Returns:
        Whatever *parseMethod* returns, or ``None`` if it raised; in that
        case the traceback is printed via ``traceback.print_exc()``.
    """
    try:
        return parseMethod(f, override_encoding=encoding, scripting=scripting)
    except Exception:
        # Only ordinary errors are reported and swallowed; KeyboardInterrupt
        # and SystemExit must still be able to stop the program.
        traceback.print_exc()
        return None
def printOutput(parser, document, opts):
    """Write the results of a parse run to standard output.

    Args:
        parser: the parser that produced *document*; its
            ``tokenizer.stream.charEncoding``, ``log``, ``tree`` and
            ``errors`` attributes are read here.
        document: the parse result, or ``None`` to skip tree output.
        opts: parsed command-line options.

    Output, in order: the detected encoding (``opts.encoding``), every
    entry of ``parser.log``, then one rendering of *document* -- XML
    (``opts.xml``), the debug tree (``opts.tree``) or serialized HTML
    (``opts.html``), first match wins -- and finally the list of parse
    errors (``opts.error``).
    """
    if opts.encoding:
        print("Encoding:", parser.tokenizer.stream.charEncoding)

    for item in parser.log:
        print(item)

    if document is not None:
        if opts.xml:
            tb = opts.treebuilder.lower()
            if tb == "dom":
                document.writexml(sys.stdout, encoding="utf-8")
            elif tb == "lxml":
                import lxml.etree
                sys.stdout.write(lxml.etree.tostring(document, encoding="unicode"))
            elif tb == "etree":
                sys.stdout.write(_utils.default_etree.tostring(document, encoding="unicode"))
        elif opts.tree:
            # A result that is not indexable is treated as a single node.
            if not hasattr(document, '__getitem__'):
                document = [document]
            for fragment in document:
                print(parser.tree.testSerializer(fragment))
        elif opts.html:
            # Forward every serializer option that exists on the command line.
            kwargs = {}
            for opt in serializer.HTMLSerializer.options:
                if hasattr(opts, opt):
                    kwargs[opt] = getattr(opts, opt)
            # An unset quote character must not be passed on at all.
            if not kwargs.get('quote_char'):
                kwargs.pop('quote_char', None)

            if opts.sanitize:
                kwargs["sanitize"] = True

            tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
            # On Python 3 ask for text (no encoding) so it can go to stdout.
            if sys.version_info[0] >= 3:
                encoding = None
            else:
                encoding = "utf-8"
            for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding=encoding):
                sys.stdout.write(text)
                # NOTE(review): assumes the newline check belongs inside the
                # loop (one serialized chunk per line) -- confirm.
                if not text.endswith('\n'):
                    sys.stdout.write('\n')

    if opts.error:
        errList = [
            "Line %i Col %i" % pos + " " + constants.E.get(errorcode, 'Unknown error "%s"' % errorcode) % datavars
            for pos, errorcode, datavars in parser.errors
        ]
        sys.stdout.write("\nParse errors:\n" + "\n".join(errList) + "\n")
def getOptParser():
    """Build the command-line option parser for this script.

    Returns:
        An ``optparse.OptionParser`` using the module docstring as its
        usage text.  It defines the run-mode switches (profile, time,
        fragment, scripting, log), the tree builder name, the output
        selectors (tree, xml, no-html, encoding, error) and one option per
        HTML serializer setting; each serializer option's ``dest`` is the
        option name with dashes replaced by underscores.
    """
    parser = OptionParser(usage=__doc__)

    # How to run the parser.
    parser.add_option("-p", "--profile", action="store_true", default=False,
                      dest="profile", help="Use the cProfile profiler to "
                      "produce a detailed log of the run")
    parser.add_option("-t", "--time",
                      action="store_true", default=False, dest="time",
                      help="Time the run using time.time (may not be accurate on all platforms, especially for short runs)")
    parser.add_option("-b", "--treebuilder", action="store", type="string",
                      dest="treebuilder", default="etree")
    parser.add_option("-e", "--error", action="store_true", default=False,
                      dest="error", help="Print a list of parse errors")
    parser.add_option("-f", "--fragment", action="store_true", default=False,
                      dest="fragment", help="Parse as a fragment")
    parser.add_option("-s", "--scripting", action="store_true", default=False,
                      dest="scripting", help="Handle noscript tags as if scripting was enabled")

    # What to print.
    parser.add_option("--tree", action="store_true", default=False,
                      dest="tree", help="Output as debug tree")
    parser.add_option("-x", "--xml", action="store_true", default=False,
                      dest="xml", help="Output as xml")
    parser.add_option("--no-html", action="store_false", default=True,
                      dest="html", help="Don't output html")
    parser.add_option("-c", "--encoding", action="store_true", default=False,
                      dest="encoding", help="Print character encoding used")

    # HTML serializer settings.
    parser.add_option("--inject-meta-charset", action="store_true",
                      default=False, dest="inject_meta_charset",
                      help="inject <meta charset>")
    parser.add_option("--strip-whitespace", action="store_true",
                      default=False, dest="strip_whitespace",
                      help="strip whitespace")
    parser.add_option("--omit-optional-tags", action="store_true",
                      default=False, dest="omit_optional_tags",
                      help="omit optional tags")
    parser.add_option("--quote-attr-values", action="store_true",
                      default=False, dest="quote_attr_values",
                      help="quote attribute values")
    parser.add_option("--use-best-quote-char", action="store_true",
                      default=False, dest="use_best_quote_char",
                      help="use best quote character")
    parser.add_option("--quote-char", action="store",
                      default=None, dest="quote_char",
                      help="quote character")
    # This flag switches minimization OFF (store_false, default True).
    parser.add_option("--no-minimize-boolean-attributes",
                      action="store_false", default=True,
                      dest="minimize_boolean_attributes",
                      help="don't minimize boolean attributes")
    parser.add_option("--use-trailing-solidus", action="store_true",
                      default=False, dest="use_trailing_solidus",
                      help="use trailing solidus")
    parser.add_option("--space-before-trailing-solidus",
                      action="store_true", default=False,
                      dest="space_before_trailing_solidus",
                      help="add space before trailing solidus")
    parser.add_option("--escape-lt-in-attrs", action="store_true",
                      default=False, dest="escape_lt_in_attrs",
                      help="escape less than signs in attribute values")
    parser.add_option("--escape-rcdata", action="store_true",
                      default=False, dest="escape_rcdata",
                      help="escape rcdata element values")
    parser.add_option("--sanitize", action="store_true", default=False,
                      dest="sanitize", help="sanitize")

    parser.add_option("-l", "--log", action="store_true", default=False,
                      dest="log", help="log state transitions")

    return parser
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    parse()