-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhtmlmatch.py
137 lines (124 loc) · 3.08 KB
/
htmlmatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python
import sys
import urllib2
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
import HTMLParser
def htmlmatch(page, pattern):
    """Find every occurrence of the pattern tree in the given HTML page.

    Both documents are parsed with BeautifulSoup and flattened into the
    depth-first sequence of their nodes, ignoring whitespace-only strings.
    The page sequence is then scanned for consecutive runs of nodes in
    which each node matches the corresponding pattern node according to
    nodematch().

    Args:
        page: markup of the page to search, in any form the BeautifulSoup
            constructor accepts.
        pattern: markup of the pattern tree; its text and attribute values
            may contain $name$ placeholders.

    Returns:
        A list with one dict per complete match, mapping placeholder names
        to the text they captured.  Complete matches that capture no
        variable are not reported.  The list is empty when nothing matches
        or when either document contains no significant node.
    """
    def significant(node):
        # Tags always count; strings count only if they are not pure whitespace.
        return isinstance(node, Tag) or len(node.strip()) > 0

    isoup = BeautifulSoup(page)
    psoup = BeautifulSoup(pattern)

    # The pattern is walked over and over, so flatten it once up front.
    pnodes = [node for node in psoup.recursiveChildGenerator()
              if significant(node)]
    if not pnodes:
        # An empty pattern cannot match anything (previously this raised
        # an uncaught StopIteration).
        return []

    matches = []
    captured = {}   # variables bound by the match currently in progress
    pos = 0         # index of the next pattern node to be matched

    for inode in isoup.recursiveChildGenerator():
        if not significant(inode):
            continue

        newvars = nodematch(inode, pnodes[pos])
        if newvars is None and pos > 0:
            # A partial match just failed: discard what it captured and
            # give this same node a chance to start a new match instead
            # of silently consuming it.
            captured = {}
            pos = 0
            newvars = nodematch(inode, pnodes[0])
        if newvars is None:
            continue

        captured.update(newvars)
        pos += 1
        if pos == len(pnodes):
            # Whole pattern matched: record the captures and start over.
            if captured:
                matches.append(captured)
            captured = {}
            pos = 0

    return matches
def nodematch(input, pattern):
    """Match one parsed node against one pattern node.

    Nodes of different classes never match.  Two strings are compared with
    strmatch().  Two tags match when they have the same name and the input
    tag carries AT LEAST every attribute of the pattern tag, each attribute
    value matching the pattern's value according to strmatch().

    Returns a dict with the variables captured while matching (possibly
    empty), or None if the nodes do not match.
    """
    if input.__class__ != pattern.__class__:
        return None

    # Text nodes: the whole comparison is delegated to the string matcher.
    if isinstance(input, NavigableString):
        return strmatch(input, pattern)

    if isinstance(input, Tag) and input.name != pattern.name:
        return None

    captured = {}
    for attr_name, attr_pattern in pattern._getAttrMap().iteritems():
        # Every attribute required by the pattern must exist on the input.
        if not input.has_key(attr_name):
            return None
        bound = strmatch(input.get(attr_name), attr_pattern)
        if bound is None:
            return None
        captured.update(bound)
    return captured
def strmatch(input, pattern):
    """Match the input string against a pattern with $name$ placeholders.

    For example:
        input:   "python and ocaml are great languages"
        pattern: "$lang1$ and $lang2$ are great languages"
    gives as output the map:
        {"lang1": "python", "lang2": "ocaml"}

    Every character of the pattern outside a placeholder must equal the
    corresponding input character.  A placeholder captures the input up to
    (not including) the first occurrence of the single literal character
    that follows it in the pattern, or the whole rest of the input when the
    placeholder is the last thing in the pattern.  A capture may be empty.

    Returns a dict mapping placeholder names to captured text, or None if
    the strings don't match (including when the pattern is malformed, i.e.
    has a placeholder without its closing '$').
    """
    bindings = {}
    input_len = len(input)
    pattern_len = len(pattern)
    i, j = 0, 0

    while j < pattern_len:
        if pattern[j] != '$':
            # Literal character: the input must supply the very same one.
            if i >= input_len or input[i] != pattern[j]:
                return None
            i += 1
            j += 1
            continue

        # Placeholder: the name runs up to the closing '$'.
        close = pattern.find('$', j + 1)
        if close == -1:
            return None
        name = pattern[j + 1:close]
        j = close + 1

        if j == pattern_len:
            # Trailing placeholder swallows whatever is left of the input.
            bindings[name] = input[i:]
            i = input_len
        else:
            # Capture up to the delimiter that follows the placeholder; if
            # the delimiter never shows up the strings cannot match.
            stop = input.find(pattern[j], i)
            if stop == -1:
                return None
            bindings[name] = input[i:stop]
            i = stop + 1
            j += 1

    # Leftover input means the pattern only matched a prefix.
    if i != input_len:
        return None
    return bindings
def main(argv):
if len(argv) < 2:
print "example: ./htmlmatch.py input.html pattern.html"
return
page = open(argv[0], "r")
pattern = open(argv[1], "r")
l = htmlmatch(page, pattern)
for m in l:
for k, v in m.iteritems():
print k, v
print
# Run as a script: hand main() the command-line arguments without the
# program name.
if __name__ == "__main__":
    main(sys.argv[1:])