import pandas as pd
import pickle
import ast
from opt import opt
import numpy as np
import torch
from utils import *

def pad_sequences(sequences: list, batch_first: bool = True, padding_value: int = 0, max_len: int = 0):
    """Pad a list of variable-length sequences to `max_len` and return an int64 numpy array.

    Assumes every sequence is no longer than `max_len`; a longer sequence would make
    the slice assignment below fail.
    """
    tmp = torch.Tensor(sequences[0])
    max_size = tmp.size()
    trailing_dims = max_size[1:]
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims
    # Allocate the output buffer and fill it with the padding value.
    out_tensor = tmp.data.new(*out_dims).fill_(padding_value)
    for i, seq in enumerate(sequences):  # renamed from `list` to avoid shadowing the builtin
        tensor = torch.Tensor(seq)
        length = tensor.size(0)
        if batch_first:
            out_tensor[i, :length, ...] = tensor
        else:
            out_tensor[:length, i, ...] = tensor
    return out_tensor.long().numpy()
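
# A quick, hypothetical illustration of pad_sequences (the input values below are
# invented for this sketch, not taken from the repository's data):
#   pad_sequences([[1, 2, 3], [4, 5]], max_len=5)
#   -> array([[1, 2, 3, 0, 0],
#             [4, 5, 0, 0, 0]])
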
def reg_encoding(cleaned: list, labels: list, hash_token, end_token) -> list:
    """Expand word-level labels to WordPiece-level labels.

    Sub-word pieces (tokens starting with '#') receive `hash_token`; every other
    piece inherits the label of the word it came from. `end_token` is kept in the
    signature for interface symmetry but is not used here.
    """
    label_l = []
    for oindex, x in enumerate(cleaned):
        tlist = []
        for index, j in enumerate(x):
            for s in j:
                if s[0] == '#':
                    tlist.append(hash_token)
                else:
                    tlist.append(labels[oindex][index])
        label_l.append(tlist)
    return label_l
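
# A minimal, hypothetical illustration of reg_encoding (values invented here):
# with hash_token = 17 and one sentence whose words tokenize to
# cleaned = [[["play", "##ing"], ["golf"]]] with word labels labels = [[1, 0]]:
#   reg_encoding(cleaned, labels, 17, end_token=0) -> [[1, 17, 0]]
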
def bio_encoding(cleaned: list, labels: list, hash_token) -> list:
    """Expand word-level labels to WordPiece-level BIO labels.

    A span-opening piece gets `label + offset` (the "B" id) and every other in-span
    piece keeps `label` (the "I" id); sub-word pieces ('#...') get `hash_token`.
    Note: `hash_token` is taken as an explicit parameter here, and the B/I branching
    is reconstructed as an if/elif/else chain, since the original relied on the name
    being in scope and on indentation that was lost.
    """
    offset = 1
    label_l = []
    for oindex, x in enumerate(cleaned):
        tlist = []
        prev = labels[oindex][0]
        for index, j in enumerate(x):
            for s in j:
                if s[0] == '#':
                    tlist.append(hash_token)
                elif index == 0 and labels[oindex][index] != 0:
                    # First word of the sentence opens a span: emit the B id.
                    tlist.append(labels[oindex][index] + offset)
                    prev = labels[oindex][index]
                elif prev != labels[oindex][index] and labels[oindex][index] != 0:
                    # Label changed to a new non-O class: emit the B id.
                    tlist.append(labels[oindex][index] + offset)
                    prev = labels[oindex][index]
                else:
                    # Inside a span (or O): keep the word's label as-is.
                    tlist.append(labels[oindex][index])
                    prev = labels[oindex][index]
        label_l.append(tlist)
    return label_l
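
# A minimal, hypothetical illustration of bio_encoding (values invented here):
# with hash_token = 17, one sentence ["play", "##ing", "golf"] tokenized as
# cleaned = [[["play", "##ing"], ["golf"]]] and word labels labels = [[2, 2]],
# the first word opens the span (B id = 2 + 1) and the second continues it:
#   bio_encoding(cleaned, labels, hash_token=17) -> [[3, 17, 2]]
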
def concatenate_list_data(cleaned: list) -> list:
    """Flatten a list of sub-word lists into one token list,
    e.g. [["play", "##ing"], ["golf"]] -> ["play", "##ing", "golf"]."""
    result = []
    for element in cleaned:
        result += element
    return result

def make_set(p2id, data_dir: str, tokenizer, single_class: str,
             hash_token, end_token, bio: bool = False) -> list:
    """Load a pickled corpus and turn it into padded id/label/mask arrays."""
    #dataset = pd.read_csv(data_dir, sep='\t', header=None, converters={1:ast.literal_eval, 2:ast.literal_eval})
    with open(data_dir, "rb") as f:
        data_dict = pickle.load(f)
    dataset = corpus2list(p2id, data_dict["ID"], data_dict["Text"],
                          data_dict["Label"], single_class, bio)
    # Shuffle samples
    #dataset = dataset.sample(frac=1)
    terms = list(dataset[1])
    labels = list(dataset[2])
    # Tokenize each word into WordPiece pieces, then flatten per sentence.
    cleaned = [[tokenizer.tokenize(words) for words in sent] for sent in terms]
    tokenized_texts = [concatenate_list_data(sent) for sent in cleaned]
    if bio:
        label_l = bio_encoding(cleaned, labels, hash_token)
    else:
        label_l = reg_encoding(cleaned, labels, hash_token, end_token)
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              padding_value=0, max_len=opt.maxLen)
    tags = pad_sequences(label_l, padding_value=end_token, max_len=opt.maxLen)
    # Mask out padding positions (token id 0) for the attention mask.
    attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
    return input_ids, tags, attention_masks, label_l
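
# A sketch (not from this repository) of how make_set's outputs are typically
# consumed downstream; "train.pkl", `tokenizer`, and the other arguments are
# placeholders for whatever the training script actually supplies:
#
#   input_ids, tags, masks, _ = make_set(p2id, "train.pkl", tokenizer,
#                                        single_class, hash_token, end_token)
#   dataset = torch.utils.data.TensorDataset(torch.tensor(input_ids),
#                                            torch.tensor(masks),
#                                            torch.tensor(tags))
#   loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)
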
def make_val_set(p2id, data_dir: str, tokenizer, single_class: str,
                 hash_token, end_token, bio: bool = False) -> list:
    """Like make_set, but also return the ids, raw terms, spacy info and
    un-padded labels needed for evaluation."""
    #dataset = pd.read_csv(data_dir, sep='\t', header=None, converters={1:ast.literal_eval, 2:ast.literal_eval})
    with open(data_dir, "rb") as f:
        data_dict = pickle.load(f)
    # Built unconditionally (mirroring make_set); the original guarded this with
    # `if not bio:`, which left `dataset` undefined in the BIO case.
    dataset = corpus2list(p2id, data_dict["ID"], data_dict["Text"],
                          data_dict["Label"], single_class, bio)
    # Shuffle samples
    #dataset = dataset.sample(frac=1)
    ids = dataset[0]
    terms = dataset[1]
    labels = dataset[2]
    spacy = dataset[3]
    cleaned = [[tokenizer.tokenize(words) for words in sent] for sent in terms]
    tokenized_texts = [concatenate_list_data(sent) for sent in cleaned]
    if bio:
        label_l = bio_encoding(cleaned, labels, hash_token)
    else:
        label_l = reg_encoding(cleaned, labels, hash_token, end_token)
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              padding_value=0, max_len=opt.maxLen)
    tags = pad_sequences(label_l, padding_value=end_token, max_len=opt.maxLen)
    attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]
    return input_ids, tags, attention_masks, cleaned, ids, terms, spacy, label_l
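

if __name__ == "__main__":
    # Small self-contained smoke test of the pure helpers above. It is added as
    # an illustration only, is not part of the original training pipeline, and
    # uses made-up token ids and labels.
    hash_token = 17
    cleaned = [[["play", "##ing"], ["golf"]]]   # one sentence, two words
    labels = [[1, 0]]                           # word-level labels

    flat = concatenate_list_data(cleaned[0])
    print("flattened tokens:", flat)            # ['play', '##ing', 'golf']

    reg = reg_encoding(cleaned, labels, hash_token, end_token=0)
    print("reg labels:      ", reg)             # [[1, 17, 0]]

    bio_labels = bio_encoding(cleaned, [[2, 2]], hash_token)
    print("bio labels:      ", bio_labels)      # [[3, 17, 2]]

    padded = pad_sequences(reg, padding_value=0, max_len=5)
    print("padded labels:\n", padded)           # [[ 1 17  0  0  0]]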