-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsetree.py
169 lines (157 loc) · 5.52 KB
/
parsetree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import re
from numpy import prod
from binarylr import BinaryLR
topNodePtn = re.compile("^\((\S+) (.+)\)$")
class ParseTree(object):
def __init__(self, textRepresentation, tagList, classifier, splitProbPredictionThreshold=0.5, parent=None):
"""
Constructor for ParseTree class.
textRepresentation:
A string describing the parse tree's structure and content. Example from data set:
"(ROOT (S (NP (PRP He)) (VP (VBD fought) (PP (IN in) (NP (DT the) (NNP Vietnam) (NNP War)))) (. .)))"
tagList:
The set of all possible tags (e.g. ROOT, S, NP, PRP, VP, etc.) in the sentence parser's specifications.
classifier:
The classifier to be used for predicting content removal.
parent:
The parent node. Should be omitted in most cases when manually creating a ParseTree instance.
"""
self.splitProbPredictionThreshold = splitProbPredictionThreshold
self.tagList = tagList
self.classifier = classifier
self.parent = parent
match = topNodePtn.match(textRepresentation)
if match is None:
raise ValueError("The provided textRepresentation does not match the required format.")
self.tag = match.group(1)
contentStr = match.group(2)
self.children = []
if contentStr[0] == "(":
self.hasChild = True
parenthSliceEndpoints = [0, 0]
while True:
parenthSliceEndpoints = ParseTree.findFirstTopLevelParenthPair(contentStr, parenthSliceEndpoints[1])
if not parenthSliceEndpoints:
break
self.children.append(ParseTree(contentStr[parenthSliceEndpoints[0]:parenthSliceEndpoints[1]], self.tagList, self.classifier, self.splitProbPredictionThreshold, self))
self.word = ""
else:
self.hasChild = False
self.word = contentStr
if parent is None:
self.initFeatures()
def calcNumDescendants(self):
"""
Returns the number of descendants of self.
"""
if self.hasChild:
self.numDescendants = sum(child.calcNumDescendants() + 1 for child in self.children)
else:
self.numDescendants = 0
return self.numDescendants
def calcMaxDescendantDepth(self):
"""
Recursively finds, records, and returns the maximum depth of any descendant of self.
Assumes that node depths have been calculated.
"""
if self.hasChild:
self.maxDescendantDepth = max([child.calcMaxDescendantDepth() for child in self.children])
else:
self.maxDescendantDepth = self.depth
return self.maxDescendantDepth
def initFeatures(self):
"""
Initializes features used for content-removal prediction.
"""
tagArray = [0] * len(self.tagList)
tagArray[self.tagList.index(self.tag)] = 1
parentTagArray = [0] * len(self.tagList)
if self.parent is not None:
parentTagArray[self.tagList.index(self.parent.tag)] = 1
numSiblings = len(self.parent.children) - 1
self.depth = self.parent.depth + 1
else:
numSiblings = 0
self.depth = 0
numChildren = len(self.children)
self.calcNumDescendants()
for child in self.children:
child.initFeatures()
self.calcMaxDescendantDepth()
dropChildProb = 0
self.featureArray = tagArray + parentTagArray + [
numSiblings,
self.depth,
numChildren,
self.numDescendants,
self.maxDescendantDepth,
dropChildProb
]
def getFeatureVector(self):
"""
Returns a list of features for content-removal prediction.
"""
if(self.hasChild):
self.featureArray[-1] = 1 - prod([1 - child.predictDropProb() for child in self.children])
else:
self.featureArray[-1] = 0 # The above line results in errors in this case. Set to 0 explicitly instead.
return self.featureArray
def predictDropProb(self):
"""
Returns the classifier's prediction (as a probability in the range [0, 1]) for whether or not some part of the node should be dropped.
"""
return self.classifier.classify(self.getFeatureVector(), True)
def predictSplitProb(self):
"""
Returns the classifier's prediction (as a probability in the range [0, 1]) for whether or not the sentence should be split.
"""
return self.children[0].predictDropProb() # ROOT.children[0] should be the S node. (This should only be called on ROOT.)
def predictSplit(self):
"""
Returns the classifier's prediction (either 0 or 1) for whether or not the sentence should be split.
"""
return True if self.predictSplitProb() > self.splitProbPredictionThreshold else False
def learn(self, shouldHaveSplit):
"""
Calls the classifier's learn() method to teach it whether or not the most recently classified sentence should have been split.
"""
self.classifier.learn(1 if shouldHaveSplit else 0)
def printIndented(self, indentStr=""):
"""
Prints an indented textual representation of the parse tree.
"""
headIndentStr = indentStr[:-4] + (" +- " if indentStr else "")
mainIndentStr = indentStr + " | "
lastIndentStr = indentStr + " "
if self.hasChild:
print(headIndentStr + self.tag)
for child in self.children[:-1]:
child.printIndented(mainIndentStr)
for child in self.children[-1:]:
child.printIndented(lastIndentStr)
else:
print(headIndentStr + self.tag + " " + self.word)
def findFirstTopLevelParenthPair(string, startIdx=0):
"""
Returns endpoints [open, close] of a slice such that string[open:close] == "(...)"
"""
openParenth = -1
closeParenth = -1
for i in range(startIdx, len(string)):
if string[i] == "(":
openParenth = i
break
if openParenth == -1:
return None
nestingDepth = 1
for i in range(openParenth + 1, len(string)):
if string[i] == ")":
nestingDepth -= 1
if nestingDepth == 0:
closeParenth = i
break
elif string[i] == "(":
nestingDepth += 1
if closeParenth == -1:
return None
return [openParenth, closeParenth + 1]