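"""dataset_Exp3.py

Data loading and preprocessing for a sequence-to-sequence dataset in which a
source word form plus source/target feature bundles is mapped to a target word
form. Each line of the data file is expected to be tab-separated:

    source form <TAB> source features (';'-separated) <TAB> target form <TAB> target features

The Dataset class turns each example into a pair of index sequences: the input
is the 'IN='/'OUT='-tagged feature bundles followed by the source form, and the
output is the target form split on spaces.
"""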
import numpy as np
import torch
from torch.utils.data import Dataset as TorchDataset, DataLoader
class Dataset(TorchDataset):
    """Wraps the preprocessed data as (source indices, target indices) pairs."""

    def __init__(self, file_name, train=True, in_vocab=None, out_vocab=None):
        if train:
            # Read at most the first 3000 lines and build the vocabularies
            # from the (possibly augmented) training data.
            train_threshold = 3000
            inputs, outputs = load_dataset(file_name, train_threshold)
            inputs, outputs, in_vocab, out_vocab = preprocess_data(
                inputs, outputs, train=True
            )
            self.inputs = inputs
            self.outputs = outputs
            self.in_vocab = in_vocab
            self.out_vocab = out_vocab
        else:
            # Evaluation data reuses the vocabularies built on the training
            # set, so they must be passed in via in_vocab / out_vocab.
            inputs, outputs = load_dataset(file_name, 500)
            inputs, outputs = preprocess_data(inputs, outputs, train=False)
            self.inputs = inputs
            self.outputs = outputs
            self.in_vocab = in_vocab
            self.out_vocab = out_vocab

    def __len__(self):
        return self.inputs.shape[0]

    def __getitem__(self, index):
        # vocab[1] is the token-to-index mapping returned by make_*_vocab.
        src = get_input_indices(self.inputs[index], self.in_vocab[1])
        trg = get_output_indices(self.outputs[index], self.out_vocab[1])
        return src, trg
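# Special vocabulary tokens; they always occupy indices 0-3 (see make_*_vocab).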
PAD_TOKEN = "<pad>"
SOS_TOKEN = "<w>"
EOS_TOKEN = "</w>"
UNK_TOKEN = "<unk>"
def load_dataset(file_name, train_threshold):
    """Read up to `train_threshold` tab-separated lines into inputs/outputs arrays."""
    inputs = []
    outputs = []
    counter = 0
    with open(file_name) as f:
        for line in f:
            counter += 1
            if counter > train_threshold:
                break
            try:
                l = line.strip().split("\t")
                l[0] = l[0].lower()     # source form
                l[2] = l[2].lower()     # target form
                l[1] = l[1].split(";")  # source features
                l[3] = l[3].split(";")  # target features
                inputs.append([l[1], l[0], l[3]])
                outputs.append(l[2])
            except IndexError:
                # The line is probably missing some fields; skip it.
                pass
    print("loaded examples: " + str(len(inputs)))
    # dtype=object because each row mixes feature lists and a plain string.
    return np.array(inputs, dtype=object), np.array(outputs)
def enhance_dataset(inputs, outputs):
    """Augment the data with reversed examples: target form -> source form,
    with the source and target feature bundles swapped accordingly."""
    inputs_cpy = inputs.copy()
    outputs_cpy = outputs.copy()
    # Swap the source-feature and target-feature columns.
    inputs_cpy[:, [0, 2]] = inputs_cpy[:, [2, 0]]
    # Swap the word forms: the old target becomes the new source form and the
    # old source form becomes the new target. Copy the source column first to
    # avoid the view aliasing that would otherwise leave the outputs unchanged.
    old_sources = inputs_cpy[:, 1].copy()
    inputs_cpy[:, 1] = outputs_cpy
    outputs_cpy = old_sources
    inputs = np.concatenate((inputs, inputs_cpy), axis=0)
    outputs = np.concatenate((outputs, outputs_cpy), axis=0)
    return inputs, outputs
def preprocess_data(inputs, outputs, train):
    if train:
        # Small training sets are augmented with reversed examples.
        if len(inputs) < 20000:
            inputs, outputs = enhance_dataset(inputs, outputs)
        inputs = edit_tags(inputs)
        # Reorder each row to [IN= tags, OUT= tags, source form].
        inputs[:, [1, 2]] = inputs[:, [2, 1]]
        inputs = transform_to_sequences(inputs)
        input_vocab = make_input_vocab(inputs)
        output_vocab = make_output_vocab(outputs)
        return inputs, outputs, input_vocab, output_vocab
    else:
        inputs = edit_tags(inputs)
        inputs[:, [1, 2]] = inputs[:, [2, 1]]
        inputs = transform_to_sequences(inputs)
        return inputs, outputs
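# Illustration (hypothetical line, not from the original file): for an entry
# such as "run\tV;PRS\tran\tV;PST", load_dataset yields the row
#     [["V", "PRS"], "run", ["V", "PST"]]   with output "ran",
# and after edit_tags, the column swap and transform_to_sequences the model
# input becomes
#     ["IN=V", "IN=PRS", "OUT=V", "OUT=PST", "run"]
# while the output string "ran" is left untouched until indexing.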
def edit_tags(inputs):
    """Prefix source features with 'IN=' and target features with 'OUT='."""
    for i in range(inputs.shape[0]):
        inputs[i, 0] = np.array(["IN=" + x for x in inputs[i, 0]])
        inputs[i, 2] = np.array(["OUT=" + x for x in inputs[i, 2]])
    return inputs
def transform_to_sequences(inputs):
    """Flatten each row into one token sequence:
    IN= tags, then OUT= tags, then the (space-split) source form."""
    input_seq = np.array(
        [
            np.concatenate((inputs[i, 0], inputs[i, 1], list(inputs[i, 2].split(" "))))
            for i in range(inputs.shape[0])
        ],
        dtype=object,
    )
    return input_seq
def make_input_vocab(data):
    """Build (index-to-token, token-to-index) mappings over all input tokens."""
    idx_to_char = {0: PAD_TOKEN, 1: SOS_TOKEN, 2: EOS_TOKEN, 3: UNK_TOKEN}
    char_to_idx = {PAD_TOKEN: 0, SOS_TOKEN: 1, EOS_TOKEN: 2, UNK_TOKEN: 3}
    char_set = set()
    for i in range(data.shape[0]):
        char_set.update(data[i])
    char_set = sorted(char_set)
    for i, ch in enumerate(char_set):
        idx_to_char[i + 4] = ch
        char_to_idx[ch] = i + 4
    return idx_to_char, char_to_idx
def make_output_vocab(data):
    """Build (index-to-token, token-to-index) mappings over all output tokens."""
    idx_to_char = {0: PAD_TOKEN, 1: SOS_TOKEN, 2: EOS_TOKEN, 3: UNK_TOKEN}
    char_to_idx = {PAD_TOKEN: 0, SOS_TOKEN: 1, EOS_TOKEN: 2, UNK_TOKEN: 3}
    char_set = set()
    for i in range(data.shape[0]):
        char_set.update(data[i].split(" "))
    char_set = sorted(char_set)
    for i, ch in enumerate(char_set):
        idx_to_char[i + 4] = ch
        char_to_idx[ch] = i + 4
    return idx_to_char, char_to_idx
def get_input_indices(inputs, vocab):
    """Map an input token sequence to vocabulary indices, appending EOS."""
    v = []
    for ch in inputs:
        try:
            v.append(vocab[ch])
        except KeyError:
            # Unseen tokens fall back to the UNK index.
            v.append(vocab[UNK_TOKEN])
    return v + [vocab[EOS_TOKEN]]
def get_output_indices(output, vocab):
    """Map an output string (split on spaces) to vocabulary indices, appending EOS."""
    v = []
    for ch in output.split(" "):
        try:
            v.append(vocab[ch])
        except KeyError:
            v.append(vocab[UNK_TOKEN])
    return v + [vocab[EOS_TOKEN]]
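# Minimal usage sketch (not part of the original module): the file name and
# batch size below are illustrative assumptions. Because __getitem__ returns
# variable-length index lists, a padding collate function is needed before
# batching with a DataLoader.
if __name__ == "__main__":
    def pad_collate(batch):
        # Pad source/target index lists with the PAD index (0) to equal length.
        srcs, trgs = zip(*batch)
        max_src = max(len(s) for s in srcs)
        max_trg = max(len(t) for t in trgs)
        src_batch = torch.tensor([s + [0] * (max_src - len(s)) for s in srcs])
        trg_batch = torch.tensor([t + [0] * (max_trg - len(t)) for t in trgs])
        return src_batch, trg_batch

    train_set = Dataset("train.tsv", train=True)  # hypothetical file name
    loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=pad_collate)
    for src_batch, trg_batch in loader:
        print(src_batch.shape, trg_batch.shape)
        break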