# token_classification_tutorial_utils.py
import numpy as np
import os


def create_folds(
    sentences: list,
    k: int = 10,
    path: str = 'folds/',
    seed: int = 0
) -> list:
    """
    Creates `k` folds from `sentences`.

    Parameters
    ----------
    sentences: list
        sentences of the original document(s) in nested list format, such that `sentences[i]` is
        the list of strings that makes up the i'th sentence. Each string is the corresponding line
        in the original document(s).
    k: int, default=10
        number of folds.
    path: str, default='folds/'
        path where the folds are created. For example, by default the first fold is created at
        'folds/fold0', and its corresponding train/test pair is located at 'folds/fold0/train.txt'
        and 'folds/fold0/test.txt'. If `path` already exists, writing the folds is skipped.
    seed: int, default=0
        random seed passed to `np.random.seed(seed)` for reproducibility.

    Returns
    -------
    indices: list
        indices in nested list format representing the partition, such that `indices[i]` is the
        list of indices included in the test set of the i'th fold (and in the training set of
        every other fold).
    """
    np.random.seed(seed)
    indices = [i for i in range(len(sentences))]
    np.random.shuffle(indices)
    # Round-robin partition: fold i receives every k'th shuffled sentence starting at offset i.
    partitions = [[sentences[index] for index in indices[i::k]] for i in range(k)]
    if os.path.exists(path):
        print("'%s' already exists, skipping..." % path)
    else:
        # Create the fold directories under `path` (rather than a hard-coded 'folds').
        for i in range(k):
            os.makedirs(os.path.join(path, 'fold%d' % i))

        def write_sentences_to_file(sentences, file):
            for sentence in sentences:
                for line in sentence:
                    file.write(line)
                file.write('\n')

        for i in range(k):
            with open(os.path.join(path, 'fold%d' % i, 'train.txt'), 'w') as train, \
                 open(os.path.join(path, 'fold%d' % i, 'test.txt'), 'w') as test:
                for j in range(k):
                    # Fold i's own partition goes to its test file; all others to train.
                    write_sentences_to_file(partitions[j], test if i == j else train)

    indices = [indices[i::k] for i in range(k)]
    return indices
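

# Hedged usage sketch for `create_folds` (the toy sentences below are
# illustrative, not from the original tutorial data). Each sentence is assumed
# to be a list of newline-terminated CoNLL-style lines.
def _example_create_folds():
    sentences = [
        ['John B-PER\n', 'lives O\n'],
        ['Mary B-PER\n', 'sleeps O\n'],
        ['Dogs O\n', 'bark O\n'],
        ['Cats O\n', 'purr O\n'],
    ]
    fold_indices = create_folds(sentences, k=2)
    # `fold_indices[i]` lists the sentence indices held out as fold i's test
    # set; the files live at 'folds/fold{i}/train.txt' and 'folds/fold{i}/test.txt'.
    return fold_indices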


def modified(given_words: list, sentence_tokens: list) -> bool:
    """
    Checks whether the given words have been modified by the tokenizer.

    Parameters
    ----------
    given_words: list
        given words in nested list format, such that `given_words[i]` is the list of given words
        for the i'th sentence in the original dataset.
    sentence_tokens: list
        tokens generated by the tokenizer in nested list format, such that `sentence_tokens[i]` is
        the list of tokens for the i'th sentence after tokenization.

    Returns
    -------
    is_modified: bool
        whether the given words have been modified.
    """
    for word, token in zip(given_words, sentence_tokens):
        # If the concatenated characters differ for any sentence, the tokenizer altered the text.
        if ''.join(word) != ''.join(token):
            return True
    return False
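

# Hedged example for `modified` with hand-constructed token lists: a tokenizer
# that only splits words into subwords preserves the characters, while one that
# also lowercases them does not.
def _example_modified():
    given_words = [['Hello', 'world']]
    subword_tokens = [['Hello', 'wor', 'ld']]     # same characters, just split
    lowercased_tokens = [['hello', 'wor', 'ld']]  # characters were altered
    assert not modified(given_words, subword_tokens)
    assert modified(given_words, lowercased_tokens)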


def get_pred_probs(scores: np.ndarray, tokens: list, given_token: list, weighted: bool = False) -> np.ndarray:
    """
    Obtains `pred_probs` for one particular sentence. Maps and reduces the subword-level tokens
    produced by the tokenizer back to the word-level tokens of the original dataset.

    Parameters
    ----------
    scores: np.ndarray
        array with shape `(N', K)`, where N' is the number of tokens of the sentence generated by
        the tokenizer and K is the number of classes of the model prediction. `scores[i][j]` is
        the model-predicted probability that the i'th token belongs to class j.
    tokens: list
        list of tokens with length N' generated by the tokenizer.
    given_token: list
        list of given tokens with length N, where N is the number of tokens of the sentence in
        the original dataset.
    weighted: bool, default=False
        whether to merge the probabilities using a weighted (rather than unweighted) average.
        Each weight is proportional to the length of the corresponding subword-level token.

    Returns
    -------
    pred_probs: np.ndarray
        array with shape `(N, K)`, where `pred_probs[i][j]` is the model-predicted probability
        that the i'th token belongs to class j after processing (reducing subwords to words, and
        splitting words merged by the tokenizer).
    """
    i, j = 0, 0  # current subword-token index and character offset within it
    pred_probs = []
    for token in given_token:
        i_new, j_new = i, j
        acc = 0  # number of characters of `token` matched so far
        weights = []
        while acc != len(token):
            token_len = len(tokens[i_new][j_new:])
            remain = len(token) - acc
            weights.append(min(remain, token_len))
            if token_len > remain:
                # The given token ends inside this subword token.
                acc += remain
                j_new += remain
            else:
                # Consume the rest of this subword token and move to the next one.
                acc += token_len
                i_new += 1
                j_new = 0
        if i != i_new:
            # The given token spans several subword tokens: average their rows.
            # Include the trailing partial token (j_new > 0) so that the number
            # of rows matches the number of accumulated weights.
            end = i_new + 1 if j_new > 0 else i_new
            probs = np.average(scores[i:end], axis=0, weights=weights if weighted else None)
        else:
            # The given token lies entirely inside one subword token.
            probs = scores[i]
        i, j = i_new, j_new
        pred_probs.append(probs)
    return np.array(pred_probs)
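

# Hedged worked example for `get_pred_probs` using hand-made scores (the
# numbers are illustrative, not real model outputs). The tokenizer split
# 'Hello' into 'Hel' + 'lo', so those two rows are merged back into one.
def _example_get_pred_probs():
    scores = np.array([
        [0.9, 0.1],  # 'Hel'
        [0.7, 0.3],  # 'lo'
        [0.2, 0.8],  # 'world'
    ])
    tokens = ['Hel', 'lo', 'world']
    given_token = ['Hello', 'world']
    pred_probs = get_pred_probs(scores, tokens, given_token, weighted=True)
    # With weights 3 (for 'Hel') and 2 (for 'lo'), the first row becomes
    # (3 * 0.9 + 2 * 0.7) / 5 = 0.82 for class 0, and 0.18 for class 1.
    return pred_probs  # shape (2, 2)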


def to_dict(nl: list) -> dict:
    """
    Converts a nested list to a dictionary for storage as an `.npz` file.

    Parameters
    ----------
    nl: list
        information in nested list structure.

    Returns
    -------
    d: dict
        dictionary whose keys are the indices of the nested list converted to strings, such that
        `d[str(i)] == nl[i]`.
    """
    return {str(i): l for i, l in enumerate(nl)}
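

# Hedged usage sketch for `to_dict`: ragged nested lists (inner lists of
# different lengths) cannot be saved as one rectangular array, so each inner
# list is stored as its own named array inside the `.npz` file.
def _example_to_dict():
    nested = [[0.1, 0.9], [0.5, 0.2, 0.3]]  # ragged: lengths 2 and 3
    np.savez('pred_probs.npz', **to_dict(nested))  # arrays keyed '0', '1', ...
    loaded = np.load('pred_probs.npz')
    return [loaded[str(i)] for i in range(len(nested))]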