# -*- coding: utf-8 -*-
"""
Created on Sat Jun 20 15:56:38 2020
Largely inspired by
https://www.analyticsvidhya.com/blog/2018/08/nlp-guide-conditional-random-fields-text-classification/
https://www.depends-on-the-definition.com/named-entity-recognition-conditional-random-fields-python/
https://github.com/AiswaryaSrinivas/DataScienceWithPython/blob/master/CRF%20POS%20Tagging.ipynb
@author: sbuer
"""
# data handling
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 20)
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
# NLP
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
# (first run may need: nltk.download('punkt') and
#  nltk.download('averaged_perceptron_tagger') for tokenizing and pos_tag)
# Misc
import os
from collections import Counter
import time
import random
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Use the tagged NYT ingredients dataset
path_dir = r"/mnt/d/data science/nutrition"
filename = 'nyt-ingredients-snapshot-2015.csv'
data = pd.read_csv(os.path.join(path_dir, filename),
                   encoding="utf-8", index_col=None)
data.tail(10)
# Tags: 'name', 'qty', 'range_end', 'unit', 'comment'
# Divide the tagged data into training and test sets. NOTE: tagged_sentence
# (one list of (word, tag) tuples per ingredient phrase) is only built
# further below from the raw rows.
train_set, test_set = train_test_split(tagged_sentence, test_size=0.2,
                                       random_state=1234)
print("Number of phrases in training set:", len(train_set))
print("Number of phrases in test set:", len(test_set))
# I have to make sure to convert the quantities in the input (strings) to
# floats to match the output (I can then convert back to strings)
# e.g. 1/4 --> 0.25
# Consider this function from ingredient-phrase-tagger (it depends on the
# repo's utils.unclump helper, so it will not run standalone here):
import decimal

def _parseNumbers(s):
    """
    Parses a string that represents a number into a decimal data type so that
    we can match the quantity field in the db with the quantity that appears
    in the display name. Rounds the result to 2 places.
    """
    ss = utils.unclump(s)
    m3 = re.match(r'^\d+$', ss)
    if m3 is not None:
        return decimal.Decimal(round(float(ss), 2))
    m1 = re.match(r'(\d+)\s+(\d)/(\d)', ss)
    if m1 is not None:
        num = int(m1.group(1)) + (float(m1.group(2)) / float(m1.group(3)))
        return decimal.Decimal(str(round(num, 2)))
    m2 = re.match(r'^(\d)/(\d)$', ss)
    if m2 is not None:
        num = float(m2.group(1)) / float(m2.group(2))
        return decimal.Decimal(str(round(num, 2)))
    return None
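# A self-contained variant I can run without the ingredient-phrase-tagger
# utils dependency (the name parse_qty is mine); handles "2", "0.5", "3/4"
# and "1 1/2" style quantity strings:
def parse_qty(s):
    s = str(s).strip()
    m = re.match(r'^(\d+)\s+(\d+)/(\d+)$', s)  # mixed number, e.g. "1 1/2"
    if m:
        return round(int(m.group(1)) + int(m.group(2)) / int(m.group(3)), 2)
    m = re.match(r'^(\d+)/(\d+)$', s)          # plain fraction, e.g. "3/4"
    if m:
        return round(int(m.group(1)) / int(m.group(2)), 2)
    m = re.match(r'^\d+(\.\d+)?$', s)          # integer or decimal
    if m:
        return round(float(s), 2)
    return None

# e.g. parse_qty('1/4') -> 0.25, parse_qty('1 1/2') -> 1.5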
# Try with one row
row = next(data.itertuples())  # not in the original: grab a sample row so this block runs
sentence = []
# Input words
input_words = WordPunctTokenizer().tokenize(str(row.input))
# Words in specific Tag
names = str(row.name)
qties = str(row.qty).strip().split()
range_ends = str(row.range_end).strip().split()
comments = str(row.comment).strip().split()
for word in input_words:
    print(word)
    if word in names:
        sentence.append((word, 'NAME'))
    elif word in qties:
        sentence.append((word, 'QTY'))
    elif word in range_ends:
        sentence.append((word, 'RANGE_END'))
    elif word in comments:
        sentence.append((word, 'COMMENT'))
    else:
        sentence.append((word, 'OTHER'))
# (abandoned stub for looping over all rows; kept for reference)
# doc = []
# for i, row in enumerate(data.itertuples()):
#     sentence = []
#     for word in input_words:
# Quick peek at the first few rows instead:
for i, row in enumerate(data.itertuples()):
    print(i, row)
    if i == 2:
        break
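# Generalizing the one-row trial above to the whole corpus. This is a sketch
# of my own (the helper name row2tagged is mine); it produces the
# tagged_sentence list that the train/test split near the top expects.
def row2tagged(row):
    tagged = []
    names = str(row.name)
    qties = str(row.qty).strip().split()
    range_ends = str(row.range_end).strip().split()
    comments = str(row.comment).strip().split()
    for word in WordPunctTokenizer().tokenize(str(row.input)):
        if word in names:
            tagged.append((word, 'NAME'))
        elif word in qties:
            tagged.append((word, 'QTY'))
        elif word in range_ends:
            tagged.append((word, 'RANGE_END'))
        elif word in comments:
            tagged.append((word, 'COMMENT'))
        else:
            tagged.append((word, 'OTHER'))
    return tagged

tagged_sentence = [row2tagged(row) for row in data.itertuples()]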
def formatData(input_col, tag_cols):
    '''
    DESCRIPTION:
    Transform Ingredient DataFrame into list of lists of tuples, with each
    tuple denoting a word and its tag and each inner list denoting a
    sentence.
    INPUT:
    input_col (str): Name of sentence column
    tag_cols (list of str): Names of the tag columns
    '''
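# SentenceGetter is not defined in this script; below is the class from the
# depends-on-the-definition tutorial linked above. NOTE: its column names
# ("Sentence #", "Word", "POS", "Tag") are the tutorial's, not this
# dataset's, so the DataFrame has to be reshaped to that layout first.
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in
                              zip(s["Word"].values.tolist(),
                                  s["POS"].values.tolist(),
                                  s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except KeyError:
            return None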
# Check how it looks for one sentence
getter = SentenceGetter(data)
sent = getter.get_next()
print(sent)
# Get all sentences with Tags
sentences = getter.sentences
# Define features
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]
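# Quick sanity check (assumes `sentences` from the SentenceGetter above):
# the feature dict for the first token of the first sentence should contain
# the keys defined in word2features, e.g. 'bias', 'word.lower()', 'BOS'.
print(sent2features(sentences[0])[0])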
# Craft features
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
# Fit the CRF
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)
crf.fit(X, y)
# Inspect the model
import eli5
eli5.show_weights(crf, top=30)
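# show_weights returns an HTML object meant for notebooks; in a plain script
# the same information can be dumped as text instead, using eli5's
# documented explain_weights/format_as_text helpers:
print(eli5.format_as_text(eli5.explain_weights(crf, top=30)))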
# Improve CRF with regularization
crf = CRF(algorithm='lbfgs',
          c1=10,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)
crf.fit(X, y)
eli5.show_weights(crf, top=30)
# Generate features. These are the default features that the NER algorithm in
# nltk uses; one can modify them for customization. NOTE: `docs` (a list of
# (token, label) sequences) is only built in the XML-parsing section further
# below, and this rebinds `data`, shadowing the DataFrame loaded above.
data = []
for i, doc in enumerate(docs):
    tokens = [t for t, label in doc]
    tagged = nltk.pos_tag(tokens)
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])
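# extract_features and get_labels are not defined in this script; sketching
# them here as thin wrappers over the helpers above (the pycrfsuite tutorial
# uses the same idea with its own feature function):
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

def get_labels(doc):
    return [label for (token, postag, label) in doc]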
# Now we'll build the features and create the train and test sets.
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
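# 'crf.model' is loaded below but never trained in this script; a minimal
# pycrfsuite training sketch (the parameters are my guesses, mirroring the
# sklearn-crfsuite settings above) would look like this:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({
    'c1': 0.1,                          # L1 regularization
    'c2': 0.1,                          # L2 regularization
    'max_iterations': 100,
    'feature.possible_transitions': True,
})
trainer.train('crf.model')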
# Let’s test our model.
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]
# You can inspect any predicted value by selecting the corresponding row
# number "i".
i = 0
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s (%s)" % (y, x))
# Check the performance of the model.
# Create a mapping of labels to indices. (These labels come from the email
# tutorial linked above, not from the ingredient tags used earlier; both
# claim fields are collapsed into one positive class.)
labels = {"claim_number": 1, "claimant": 1, "NA": 0}
# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])
# Print out the classification report. Based on the model performance, build
# better features to improve the performance.
print(classification_report(
    truths, predictions,
    target_names=["NA", "claim_field"]))  # 0 = NA, 1 = any claim field
# predict new data
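# remov_punct, codecs and bs are not defined/imported above; the tutorial's
# helper is roughly the following simple punctuation stripper:
import codecs
import string
from bs4 import BeautifulSoup as bs

def remov_punct(withpunct):
    # drop punctuation characters, keep everything else
    return ''.join(c for c in str(withpunct) if c not in string.punctuation)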
with codecs.open("D:/ SampleEmail6.xml", "r", "utf-8") as infile:
    soup_test = bs(infile, "html5lib")
docs = []
sents = []
for d in soup_test.find_all("document"):
    for wrd in d.contents:
        tags = []
        if wrd.name is None:  # plain text node -> untagged ('NA') tokens
            withoutpunct = remov_punct(wrd)
            temp = word_tokenize(withoutpunct)
            for token in temp:
                tags.append((token, 'NA'))
        else:  # element node -> its tag name is the label
            withoutpunct = remov_punct(wrd)
            temp = word_tokenize(withoutpunct)
            for token in temp:
                tags.append((token, wrd.name))
        # docs.append(tags)
        sents = sents + tags  # puts all the sentences of a document in one element of the list
    docs.append(sents)  # appends all the individual documents into one list
data_test = []
for i, doc in enumerate(docs):
    tokens = [t for t, label in doc]
    tagged = nltk.pos_tag(tokens)
    data_test.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])
data_test_feats = [extract_features(doc) for doc in data_test]
tagger.open('crf.model')
newdata_pred = [tagger.tag(xseq) for xseq in data_test_feats]
# Let's check predicted data
i = 0
for x, y in zip(newdata_pred[i], [x[1].split("=")[1] for x in data_test_feats[i]]):
    print("%s (%s)" % (y, x))
# eof