-
Notifications
You must be signed in to change notification settings - Fork 7
/
featureTemplate.py
124 lines (88 loc) · 3.34 KB
/
featureTemplate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from collections import defaultdict
import operator
import json
import re
# Reserved feature strings shared by the feature templates in this module.
# All of them begin with '#'.
nlpFeatureConstants = {
    # Fallback domain entry for feature values that are not in a template's domain.
    "oov": "#UNK",
    # Presumably sentence-boundary padding markers; not referenced in this part of the file.
    "padleft": "#PadLeft",
    "padright": "#PadRight",
    # Feature emitted for '#'-prefixed tokens when a template does not keep them verbatim.
    "special": "#Special",
}
## NOTE: self.featureFunction is left abstract here; every concrete FeatureTemplate must implement it.
## It is also expected that you provide a string self.name.
class FeatureTemplate:
    """One token-level feature: extraction, frequency counting and a feature->index domain.

    Concrete templates must provide two things this base class does not define:
      * ``featureFunction(normalizedString)`` -- returns the feature value for a
        non-special token string;
      * ``name`` -- a string, used in domain files and in error messages.

    The code is written to run unchanged on both Python 2 and Python 3.
    """

    # Token strings starting with '#' are treated as special tokens.
    spec = re.compile('^#')
    # False (default): every special token yields nlpFeatureConstants["special"].
    # True: the special token string itself is used as the feature.
    useSpecialWords = False

    def __init__(self, allowOOV):
        """Create an empty template.

        allowOOV -- when true, values missing from the domain map to the OOV
                    index; when false, they are rejected by convertToInt.
        """
        self.buildCounts = True            # count features while extracting
        self.counts = defaultdict(int)     # feature value -> number of times seen
        self.domain = None                 # feature value -> int index, set by constructDomain/loadDomain
        self.assertInDomain = not allowOOV

    def isSpecial(self, tokStr):
        """Return a regex match (truthy) if tokStr starts with '#', else None."""
        return self.spec.match(tokStr)

    def extractFeature(self, normalizedString):
        """Return the feature for one token string, updating counts if buildCounts is set."""
        if self.isSpecial(normalizedString):
            if self.useSpecialWords:
                feat = normalizedString
            else:
                feat = nlpFeatureConstants["special"]
        else:
            feat = self.featureFunction(normalizedString)
        if self.buildCounts:
            self.counts[feat] += 1
        return feat

    def writeDomain(self, file):
        """Write {'name': ..., 'domain': ...} as JSON to the path `file`."""
        # Single-argument, parenthesised form prints identically on Python 2 and 3.
        print("writing: " + file)
        data = {
            'name': self.name,
            'domain': self.domain,
        }
        with open(file, 'w') as outfile:
            json.dump(data, outfile)

    def constructDomain(self, featureCountThreshold):
        """Build self.domain from the collected counts.

        Features seen strictly more than featureCountThreshold times are kept
        and numbered 0..n-1 in order of decreasing frequency. When OOV values
        are allowed, an OOV entry is added as well.
        """
        # .items() rather than the Python-2-only .iteritems().
        frequent = {k: v for k, v in self.counts.items() if v > featureCountThreshold}
        ranked = sorted(frequent.items(), key=operator.itemgetter(1), reverse=True)
        self.domain = {feat: idx for idx, (feat, _count) in enumerate(ranked)}
        if not self.assertInDomain:
            # setdefault: if the OOV string was itself counted as a feature it
            # keeps its rank; overwriting it with len(domain) would leave an
            # index gap and a maximum index equal to len(domain).
            self.domain.setdefault(nlpFeatureConstants["oov"], len(self.domain))

    def convertToInt(self, feat):
        """Return the domain index of feat, or the OOV index if OOV is allowed.

        Raises AssertionError when feat is not in the domain and OOV values
        are not allowed.
        """
        if feat in self.domain:
            return self.domain[feat]
        if self.assertInDomain:
            # Explicit raise (not `assert`) so the check survives `python -O`;
            # %-formatting so non-string feature values still produce a message.
            raise AssertionError(
                "input value %s not in domain name = %s" % (feat, self.name)
            )
        return self.domain[nlpFeatureConstants["oov"]]

    def loadDomain(self, file):
        """Load self.domain from a JSON file in the format written by writeDomain.

        Raises AssertionError if the stored template name differs from self.name.
        """
        with open(file, 'r') as datfile:
            data = json.load(datfile)
        if data['name'] != self.name:
            raise AssertionError(
                "domain file %s is for template %s, expected %s"
                % (file, data['name'], self.name)
            )
        self.domain = data['domain']
class FeatureTemplates:
    """An ordered group of feature templates that are applied together.

    Each template is expected to expose ``name``, ``buildCounts``,
    ``extractFeature``, ``convertToInt``, ``constructDomain``,
    ``loadDomain`` and ``writeDomain``.
    """

    def __init__(self, useTokenFeatures, featureTemplates, featureCountThreshold):
        """Store the templates and switch every one of them into counting mode.

        useTokenFeatures      -- if false, convertFeaturesForPrinting emits only
                                 the first feature of each token
        featureTemplates      -- sequence of template objects, in feature order
        featureCountThreshold -- passed to each template's constructDomain
        """
        self.candidateTemplates = []
        self.featureTemplates = featureTemplates
        self.useFeats = useTokenFeatures
        self.featureCountThreshold = featureCountThreshold
        for template in self.featureTemplates:
            template.buildCounts = True

    def convertToInt(self, tokStringFeats):
        """Return a list with the i-th feature converted by the i-th template."""
        # A real list (not map()) so the result is indexable on Python 3 too.
        return [
            template.convertToInt(feat)
            for feat, template in zip(tokStringFeats, self.featureTemplates)
        ]

    def loadDomains(self, domainFileBase):
        """Load each template's domain from '<domainFileBase>.<template.name>'."""
        for template in self.featureTemplates:
            template.loadDomain(domainFileBase + "." + template.name)

    def writeDomains(self, domainFileBase):
        """Write each template's domain to '<domainFileBase>.<template.name>'."""
        for template in self.featureTemplates:
            fn = domainFileBase + "." + template.name
            template.writeDomain(fn)

    def constructDomains(self):
        """Build every template's domain, then stop the templates from counting."""
        for template in self.featureTemplates:
            template.constructDomain(self.featureCountThreshold)
            template.buildCounts = False

    def extractFeatures(self, normalizedString):
        """Return a list holding each template's feature for the token string."""
        # Evaluated eagerly: a lazy map() on Python 3 would postpone the
        # templates' extractFeature calls until the result is consumed.
        return [
            template.extractFeature(normalizedString)
            for template in self.featureTemplates
        ]

    def convertFeaturesForPrinting(self, sentenceFeatures):
        """Render a sentence's per-token features as one space-separated string.

        sentenceFeatures is an iterable of per-token feature sequences. With
        useFeats false only each token's first feature is printed; otherwise
        a token's features are joined with ','.
        """
        if not self.useFeats:
            return " ".join(str(tokFeats[0]) for tokFeats in sentenceFeatures)
        return " ".join(
            ",".join(str(f) for f in tokFeats) for tokFeats in sentenceFeatures
        )