-
Notifications
You must be signed in to change notification settings - Fork 0
/
lstmmodel_20180419.py
337 lines (310 loc) · 14.2 KB
/
lstmmodel_20180419.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
import nltk
import sklearn
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
import numpy as np
import os
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
# Seed PyTorch's random number generator with a fixed value so that runs are repeatable.
torch.manual_seed(1)
# convert it to LSTMText2Word
class LSTMText2Word(nn.Module):
    """LSTM classifier mapping a packed batch of word vectors to label log-probabilities.

    The network is a (possibly multi-layer) unidirectional LSTM followed by a
    linear layer applied to the LSTM output at each sequence's last valid
    timestep, followed by a log-softmax over the labels.
    """

    def __init__(self, embedding_dim, hidden_dim, labelset_size, lstm_layers, training_epochs, batch_size, pack_dim):
        """Build the layers and record the hyper-parameters.

        Args:
            embedding_dim: size of each input word vector.
            hidden_dim: size of the LSTM hidden state.
            labelset_size: number of output labels.
            lstm_layers: number of stacked LSTM layers.
            training_epochs: number of epochs; only stored here for the caller.
            batch_size: nominal minibatch size; used by init_hidden().
            pack_dim: stored as-is; not used inside this class.
        """
        super(LSTMText2Word, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = labelset_size
        self.lstm_layers = lstm_layers
        self.training_epochs = training_epochs
        self.batch_size = batch_size
        # The LSTM takes word embeddings as inputs and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, self.lstm_layers,
                            bias=False, batch_first=False, dropout=0, bidirectional=False)
        self.pack_dim = pack_dim
        # The linear layer that maps from hidden state space to label space.
        self.hidden2label = nn.Linear(self.hidden_dim, self.output_dim)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero (h_0, c_0) pair.

        Each tensor has axes (num_layers, batch_size, hidden_dim). Note that
        forward() does not feed self.hidden into the LSTM (it relies on the
        LSTM's default initial state) and overwrites self.hidden with the
        final state of every call.
        """
        return (autograd.Variable(torch.zeros(self.lstm_layers, self.batch_size, self.hidden_dim)),
                autograd.Variable(torch.zeros(self.lstm_layers, self.batch_size, self.hidden_dim)))

    def forward(self, sequence):
        """Score one packed batch.

        Args:
            sequence: a PackedSequence of word vectors, as produced by
                torch.nn.utils.rnn.pack_padded_sequence.

        Returns:
            A tensor with one row of log-probabilities per sequence in the
            batch, of width output_dim.
        """
        lstm_out, self.hidden = self.lstm(sequence)
        unpacked, lengths = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        # Use the size of the batch that was actually passed in rather than
        # self.batch_size, so a short (e.g. final) batch is handled too.
        actual_batch = unpacked.size(0)
        # Index of the last real (non-padding) timestep of every sequence.
        last_step = torch.as_tensor(lengths, dtype=torch.long, device=unpacked.device) - 1
        idx = last_step.view(-1, 1, 1).expand(actual_batch, 1, unpacked.size(2))
        # gather() yields (batch, 1, hidden); drop only the time axis so that a
        # batch of one keeps its batch dimension.
        decoded = unpacked.gather(1, idx).squeeze(1)
        label_space = self.hidden2label(decoded)
        predicted_labels = F.log_softmax(label_space, dim=1)
        return predicted_labels
def load_json(label_to_index, path):
    '''
    Convert the annotated JSON file at `path` into (body, label index) pairs.

    The file is expected to hold a mapping from an id to a record with the keys
    'body' and 'majority_type'. Records in which either value is null are
    skipped. The body is stored exactly as found in the file (tokenization is
    not applied here) and the label name is translated through `label_to_index`.

    The resulting list is written as JSON to 'training_tuples.txt' in the
    current working directory. Nothing is returned.

    Raises KeyError if a record lacks one of the two keys or if its label is
    not present in `label_to_index`.
    '''
    # Close the input file deterministically instead of leaking the handle.
    with open(path) as source:
        data = json.load(source)
    data_tuple = []
    for record in data.values():
        # `or` short-circuits, so 'majority_type' is only read when the body is present.
        if record['body'] is None or record['majority_type'] is None:
            continue
        data_tuple.append((record['body'], label_to_index[record['majority_type']]))
    print("Data file is loaded.")
    with open('training_tuples.txt', 'w') as result:
        json.dump(data_tuple, result)
    print("The data tuples are saved as 'training_tuples.txt'.")
def text_to_word_list(text):
    '''
    Turn a raw comment string into a list of normalized tokens.

    The text is split with nltk.word_tokenize; every token is lower-cased and
    lemmatized with the module-level `lemmatizer`, except the padding
    placeholder 'ni_zhao_bu_dao_de', which is passed through unchanged.
    '''
    placeholder = 'ni_zhao_bu_dao_de'
    cleaned = []
    for token in nltk.word_tokenize(text):
        if token == placeholder:
            cleaned.append(token)
        else:
            cleaned.append(lemmatizer.lemmatize(token.lower()))
    return cleaned
def load_glove(dir_path='/home/zsong/working/data/glove', source_name='glove.6B.300d.txt',
               output_name='glove_wiki.txt'):
    '''
    Convert a pretrained GloVe text file into a JSON word -> vector mapping.

    Args:
        dir_path: directory holding the GloVe text file.
        source_name: name of the GloVe file inside `dir_path`. The default is
            the Wikipedia-trained file; 'glove.840B.300d.txt' is the
            CommonCrawl alternative.
        output_name: path of the JSON file to write.

    Each line is split on whitespace into a word followed by its vector
    components. A line is dropped when its vector part contains 'com' or '@'
    anywhere, or has a component that is exactly '.'; such lines do not hold a
    purely numeric vector. Nothing is returned.

    Raises ValueError if a component of a kept line cannot be parsed as a float.
    '''
    embeddings = {}
    # Decode explicitly as UTF-8 rather than relying on the platform default encoding.
    with open(os.path.join(dir_path, source_name), encoding='utf-8') as glove:
        for line in glove:
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            word, vector = values[0], values[1:]
            raw_string = ''.join(vector)
            if 'com' in raw_string or '@' in raw_string:
                continue
            if '.' in vector:
                continue
            embeddings[word] = [float(component) for component in vector]
    print("Gloved is loaded.")
    with open(output_name, 'w') as result:
        json.dump(embeddings, result)
    print("Glove word vectors are saved as {}.".format(output_name))
def pad_nonwords(longest_len, sorted_tokenized_sequence):
    '''
    Right-pad a token list to `longest_len` entries with the non-word
    placeholder 'ni_zhao_bu_dao_de'.

    A new list is returned; a list that is already long enough comes back as
    an unpadded copy.
    '''
    missing = longest_len - len(sorted_tokenized_sequence)
    filler = ['ni_zhao_bu_dao_de'] * missing
    return sorted_tokenized_sequence + filler
def prepare_sequence(comment, glove):
    '''
    Look up a vector for every item of `comment` and stack the results.

    `comment` is iterated item by item and each item is resolved through
    get_word_vectors(item, glove). The collected rows are returned as a
    torch.FloatTensor (not wrapped in a Variable).
    '''
    rows = []
    for token in comment:
        rows.append(get_word_vectors(token, glove))
    return torch.FloatTensor(rows)
def prepare_label_vector(label, num_labels=9):
    '''
    Encode a label index as a one-hot torch.LongTensor.

    Args:
        label: integer index of the label.
        num_labels: length of the vector. Defaults to 9, the width that was
            previously hard-coded here.

    Returns:
        A LongTensor of length `num_labels` holding 1 at position `label` and
        0 elsewhere.

    Raises IndexError if `label` is out of range for `num_labels`.
    '''
    one_hot = [0] * num_labels
    one_hot[label] = 1
    return torch.LongTensor(one_hot)
def get_word_vectors(word, glove):
    '''
    Return the stored vector for `word`, or a zero vector when it is unknown.

    The lookup result is returned as stored in `glove`. For a word that is not
    contained in `glove`, a fresh list of 300 floats equal to 0.0 is returned.
    '''
    # A membership test (rather than catching KeyError) also copes with an
    # empty-list placeholder being passed as `glove`.
    return glove[word] if word in glove else [0.0] * 300
def batch_processing(chunk_tuple, batch_size):
    '''
    Turn a chunk of (text, label) pairs into a packed input batch and its label matrix.

    The texts are tokenized with text_to_word_list, the pairs are ordered from
    the longest to the shortest token list, every list is padded to the longest
    length, and the tokens are embedded through prepare_sequence using the
    module-level `vectors` mapping (it is not a parameter of this function).

    Returns a tuple (pack, labels): `pack` is the result of
    pack_padded_sequence(..., batch_first=True) over the stacked embeddings,
    and `labels` stacks one prepare_label_vector row per example in the same
    (length-sorted) order.

    `batch_size` is accepted but not used by this function.
    '''
    texts, targets = zip(*chunk_tuple)
    tokenized = list(map(text_to_word_list, texts))
    # Longest first, which is the ordering handed to pack_padded_sequence below.
    ordered = sorted(zip(tokenized, targets), key=lambda pair: len(pair[0]), reverse=True)
    tokenized, targets = zip(*ordered)
    lengths = [len(words) for words in tokenized]
    longest = len(tokenized[0])
    padded = [pad_nonwords(longest, words) for words in tokenized]
    # Stacked embeddings: one entry per example, each padded to `longest` tokens.
    embedded = torch.stack([prepare_sequence(words, vectors) for words in padded], dim=0)
    embedded = autograd.Variable(embedded)
    pack = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)
    label_rows = [prepare_label_vector(target) for target in targets]
    labels = autograd.Variable(torch.stack(label_rows, dim=0))
    return pack, labels
def training(train_data, vectors):
    '''
    Train an LSTMText2Word model on `train_data` and save it to disk.

    Args:
        train_data: sliceable sequence of (text, label index) pairs.
        vectors: word-vector mapping. It is not read here; batch_processing()
            takes the embeddings from the module-level `vectors` name.

    The model is built from the module-level hyper-parameters, optimized with
    SGD (lr=0.1) against an averaged negative log-likelihood loss, and saved
    whole with torch.save() to 'model_batch_10_glove_5000.pkl'.
    There is no return value.
    '''
    model = LSTMText2Word(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, LAYER_NUM, EPOCHS, BATCH_SIZE, PACK_DIM)
    print("Print LSTM architecture:")
    print(model)
    # The default NLLLoss already averages over the batch; the deprecated
    # `reduce=True` argument is not needed for that.
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    training_epochs = model.training_epochs
    model_path = 'model_batch_10_glove_5000.pkl'
    # Report the loss every `log_every` batches.
    log_every = 10
    print("Start training:")
    for epoch in range(training_epochs):
        batch_starts = range(0, len(train_data), model.batch_size)
        for batch_number, idx in enumerate(batch_starts, 1):
            print("Training example {}".format(idx + 1))
            # Step 1. PyTorch accumulates gradients, so clear them before each batch.
            model.zero_grad()
            # Also reset the stored hidden state of the LSTM.
            model.hidden = model.init_hidden()
            # Step 2. Get our inputs ready for the network.
            chunks = train_data[idx : idx + model.batch_size]
            sequence_pack, labels = batch_processing(chunks, model.batch_size)
            # Step 3. Run our forward pass.
            predicted_labels = model(sequence_pack)
            # Step 4. Compute the loss, gradients, and update the parameters.
            # NLLLoss expects one class index per example, while
            # batch_processing() returns one-hot rows, so reduce those rows to
            # the position of their 1.
            if labels.dim() > 1:
                labels = labels.max(1)[1]
            loss = loss_function(predicted_labels, labels)
            loss.backward()
            optimizer.step()
            if batch_number % log_every == 0:
                # Tensor.item() is the current scalar accessor; fall back to
                # data[0] on releases that predate it.
                loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
                print('\nEpoch [%d / %d], Loss: %.4f' % (epoch + 1, training_epochs, loss_value))
    print("Training is done.")
    torch.save(model, model_path)
    # Report the file that was actually written.
    print("The model is saved as '{}'.".format(model_path))
def separate_data(data_tuple):
    '''
    Split (data, label) pairs 90/10 into a training and a test set and save both.

    The split is done by train_test_split with random_state=56. The two sets
    are re-paired and dumped as JSON to 'train.txt' and 'test.txt' in the
    current working directory. Nothing is returned.
    '''
    samples, targets = zip(*data_tuple)
    parts = train_test_split(samples, targets, train_size=0.9, test_size=0.1, random_state=56)
    train_x, test_x, train_y, test_y = parts
    train_pairs = list(zip(train_x, train_y))
    test_pairs = list(zip(test_x, test_y))
    print("The data is split.")
    for filename, pairs in (('train.txt', train_pairs), ('test.txt', test_pairs)):
        with open(filename, 'w') as out_file:
            json.dump(pairs, out_file)
    print("The training and test sets are saved as 'train.txt' and 'test.txt'.")
def testing(test_data, vectors, model_path='model_glove_5000.pkl'):
    '''
    Load a saved model and print its accuracy on `test_data`.

    Args:
        test_data: iterable of (text, label index) pairs, where text is a raw
            comment string.
        vectors: word-vector mapping used to embed the tokens.
        model_path: file to load the model from. The default is the name this
            function has always used; note that training() in this file saves
            to 'model_batch_10_glove_5000.pkl', so pass that path to evaluate
            a freshly trained model.

    Examples are scored one at a time. An example whose text yields no tokens
    cannot be packed and is skipped. There is no return value.
    '''
    # SECURITY: torch.load unpickles the file, which can execute arbitrary
    # code. Only load model files from a trusted source.
    model = torch.load(model_path)
    model.eval()
    # Examples are fed individually, so align the model's nominal batch size
    # with the batches it is about to receive.
    model.batch_size = 1
    correct = 0
    total = 0
    skipped = 0
    for text, label in test_data:
        # Tokenize exactly as batch_processing() does for training; passing the
        # raw string on would embed single characters instead of words.
        words = text_to_word_list(text)
        if not words:
            skipped += 1
            continue
        # Add a batch axis in front and pack, since the model's forward pass
        # unpacks its LSTM output with pad_packed_sequence.
        sequence_input = autograd.Variable(prepare_sequence(words, vectors).unsqueeze(0))
        pack = nn.utils.rnn.pack_padded_sequence(sequence_input, [len(words)], batch_first=True)
        output = model(pack)
        _, prediction = torch.max(output.data, 1)
        # Compare class indices directly and count one per example.
        if int(prediction[0]) == label:
            correct += 1
        total += 1
    if skipped:
        print('Skipped %d test examples without tokens.' % skipped)
    if total == 0:
        print('No test examples could be scored.')
        return
    print('Accuracy of the model on %d test examples: %d %%' % (total, 100 * correct / total))
# Module-level placeholders; the script entry point rebinds both with loaded data.
# batch_processing() reads `vectors` from module scope.
data_tuple = []
vectors = []

# Location of the annotated JSON file consumed by load_json().
DATA_PATH = "/home/zsong/working/my_trial_DIR/post_df_parent_text_unified.json"

# Label name -> class index.
label_to_index = {
    "agreement": 0,
    "announcement": 1,
    "answer": 2,
    "appreciation": 3,
    "disagreement": 4,
    "elaboration": 5,
    "humor": 6,
    "negativereaction": 7,
    "question": 8,
}

# Text-processing helpers. `lemmatizer` is used by text_to_word_list();
# `tokenizer` is currently only referenced from commented-out code.
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

# Hyper-parameters handed to LSTMText2Word by training().
EMBEDDING_DIM = 300
HIDDEN_DIM = 300
OUTPUT_DIM = len(label_to_index)
LAYER_NUM = 2
EPOCHS = 2
BATCH_SIZE = 10
PACK_DIM = 1
if __name__ == "__main__":
    # One-off preprocessing steps; uncomment to regenerate the cached files
    # that are read below.
    # load_json(label_to_index, DATA_PATH)
    # load_glove()
    # Use context managers so the input files are closed after reading.
    with open('training_tuples.txt') as tuples_file:
        data_tuple = json.load(tuples_file)
    # separate_data(data_tuple)
    # train = json.load(open('train.txt'))
    # test = json.load(open('test.txt'))
    train = data_tuple[: 1000]
    test = data_tuple[1000 : 1100]
    ########################################## Loading different glove embeddings
    # vectors = json.load(open('glove.txt'))
    # `vectors` must stay a module-level name: batch_processing() reads it
    # from module scope rather than receiving it as an argument.
    with open('glove_wiki.txt') as glove_file:
        vectors = json.load(glove_file) # test with short corpus
    # vectors = [] # test
    ##########################################
    training(train, vectors)
    # testing(test, vectors)