# averaged_perceptron.py
from collections import defaultdict

from nltk.data import load

# Flags for the optional export/import and self-test code paths further below.
EXPORT_TO_TEXT = False
IMPORT_FROM_TEXT = False
UNIT_TEST = False


class AveragedPerceptron:
    def __init__(self, weights=None):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = weights if weights else {}
        self.classes = set()

    def predict(self, features):
        """Dot-product the features and current weights and return the best label."""
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        best_label = max(self.classes, key=lambda label: (scores[label], label))
        return best_label
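

# A minimal sketch of how predict() scores labels (not part of the tagger flow
# below); the feature names, weights and the _demo instance are made up.
if UNIT_TEST:
    _demo = AveragedPerceptron(
        {"bias": {"NN": 0.5, "VB": -0.2}, "i suffix ing": {"NN": 0.1, "VB": 1.0}}
    )
    _demo.classes = {"NN", "VB"}
    # Scores: NN = 0.5 + 0.1 = 0.6, VB = -0.2 + 1.0 = 0.8, so "VB" wins.
    print(_demo.predict({"bias": 1, "i suffix ing": 1}))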


# Module-level tagger state; weights, tagdict and classes are filled in below
# from the pickled NLTK model (or from the exported text files).
model = AveragedPerceptron()


def export_to_text():
    """Dump the pickled tagger (weights, tag dictionary, classes) to text files."""
    weights, tagdict, classes = load("averaged_perceptron_tagger.pickle")
    f = open("averaged_perceptron_tagger_weights.txt", "w")
    for feat in weights.keys():
        feat_weights = weights[feat]
        for label, weight in feat_weights.items():
            f.write(feat + "\n" + label + "\n" + str(weight) + "\n")
    f.close()
    # tagdict maps frequent, unambiguous words to their single tag.
    f = open("averaged_perceptron_tagger_tagdict.txt", "w")
    for word in tagdict.keys():
        f.write(word + "\n" + tagdict[word] + "\n")
    f.close()
    f = open("averaged_perceptron_tagger_classes.txt", "w")
    for cls in classes:
        f.write(cls + "\n")
    f.close()
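

# Layout of the exported text files (the feature string, label and weight shown
# below are only illustrative):
#
#   averaged_perceptron_tagger_weights.txt -- three lines per entry:
#       i suffix ing    <- feature
#       VBG             <- label
#       3.57            <- weight
#   averaged_perceptron_tagger_tagdict.txt -- two lines per entry: word, tag
#   averaged_perceptron_tagger_classes.txt -- one tag per line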


def import_from_text():
    """Rebuild (weights, tagdict, classes) from the exported text files."""
    f = open("averaged_perceptron_tagger_weights.txt", "r")
    weights = {}
    lines = f.read().split("\n")[:-1]  # drop the empty string after the final newline
    i = 0
    while i < len(lines):
        feat = lines[i + 0]
        label = lines[i + 1]
        weight = lines[i + 2]
        i = i + 3
        if feat not in weights:
            weights[feat] = {}
        weights[feat][label] = float(weight)
    f.close()
    f = open("averaged_perceptron_tagger_tagdict.txt", "r")
    tagdict = {}
    lines = f.read().split("\n")[:-1]
    i = 0
    while i < len(lines):
        word = lines[i + 0]
        tag = lines[i + 1]
        i = i + 2
        tagdict[word] = tag
    f.close()
    f = open("averaged_perceptron_tagger_classes.txt", "r")
    classes = f.read().split("\n")[:-1]
    f.close()
    return weights, tagdict, classes


if EXPORT_TO_TEXT:
    export_to_text()

if IMPORT_FROM_TEXT:
    model.weights, tagdict, model.classes = import_from_text()
else:
    # nltk.data.load() resolves this relative path against nltk's data search path.
    model.weights, tagdict, model.classes = load("averaged_perceptron_tagger.pickle")

# Dummy context tokens padded around each sentence.
START = ["-START-", "-START2-"]
END = ["-END-", "-END2-"]


def _get_features(i, word, context, prev, prev2):
    """Map tokens into a feature representation, implemented as a
    {hashable: int} dict. If the features change, a new model must be
    trained.
    """

    def add(name, *args):
        features[" ".join((name,) + tuple(args))] += 1

    i += len(START)  # shift i so it indexes into the padded context
    features = defaultdict(int)
    # It's useful to have a constant feature, which acts sort of like a prior
    add("bias")
    add("i suffix", word[-3:])
    add("i pref1", word[0] if word else "")
    add("i-1 tag", prev)
    add("i-2 tag", prev2)
    add("i tag+i-2 tag", prev, prev2)
    add("i word", context[i])
    add("i-1 tag+i word", prev, context[i])
    add("i-1 word", context[i - 1])
    add("i-1 suffix", context[i - 1][-3:])
    add("i-2 word", context[i - 2])
    add("i+1 word", context[i + 1])
    add("i+1 suffix", context[i + 1][-3:])
    add("i+2 word", context[i + 2])
    return features
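

# A minimal sketch of what _get_features() produces; the tokens and the _ctx /
# _feats names are made up for illustration.
if UNIT_TEST:
    _ctx = START + ["dogs", "bark"] + END
    _feats = _get_features(0, "dogs", _ctx, START[0], START[1])
    print(sorted(_feats))  # includes "bias", "i suffix ogs", "i word dogs", ...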


def normalize(word):
    """
    Normalization used in pre-processing.
    - All words are lower cased
    - Hyphenated words (unless they start with "-") are represented as !HYPHEN
    - Groups of digits of length 4 are represented as !YEAR
    - Other words starting with a digit are represented as !DIGITS
    :rtype: str
    """
    if "-" in word and word[0] != "-":
        return "!HYPHEN"
    if word.isdigit() and len(word) == 4:
        return "!YEAR"
    if word and word[0].isdigit():
        return "!DIGITS"
    return word.lower()
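

# Spot-check the normalization rules above (values follow directly from the code).
if UNIT_TEST:
    assert normalize("mid-1990s") == "!HYPHEN"
    assert normalize("1984") == "!YEAR"
    assert normalize("3rd") == "!DIGITS"
    assert normalize("The") == "the"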


def tag(tokens):
    """
    Tag a tokenized sentence.
    :param tokens: list of words
    :type tokens: list(str)
    """
    prev, prev2 = START
    output = []
    context = START + [normalize(w) for w in tokens] + END
    for i, word in enumerate(tokens):
        # Frequent, unambiguous words come straight from tagdict; everything
        # else goes through the perceptron.
        tag = tagdict.get(word)
        if not tag:
            features = _get_features(i, word, context, prev, prev2)
            tag = model.predict(features)
        output.append((word, tag))
        prev2 = prev
        prev = tag
    return output


if UNIT_TEST:
    words = ["i'm", "an", "activationist", "."]
    output = tag(words)
    print(output)
    # [("i'm", 'VB'), ('an', 'DT'), ('activationist', 'NN'), ('.', '.')]
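
# To try the demos above, set UNIT_TEST = True near the top of the file and run
# the script directly, e.g.:
#
#   python averaged_perceptron.py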