diagnosis_rnn.py
'''
Trains an RNN on a medical diagnosis dataset.
The data is obtained from various online sources.
The network has to predict the disease from many symptoms listed as
natural language sentences.
'''
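# Assumed data layout (not verified against the original pickles): each entry of
# training_set.dat / test_set.dat is a (fact, answer) pair, where `fact` is an
# iterable of tokenised symptom sentences and `answer` is the disease name, e.g.
#   ([['high', 'fever'], ['dry', 'cough']], 'influenza')
# The list comprehensions further below flatten every story into one token list.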
from __future__ import print_function
from functools import reduce
from itertools import izip_longest
import pickle
import random
import numpy as np
np.random.seed(1337)  # for reproducibility
random.seed(1337)
from keras.layers.core import Dense, Dropout
from keras.layers import recurrent
from keras.layers.recurrent import GRU
from keras.models import Sequential
from utils import get_spacy_vectors
from glove import Glove  # only needed for the (commented-out) GloVe loading below
from spacy.en import English
RNN = recurrent.GRU  # the model below is built from GRU layers
NUM_HIDDEN_UNITS = 128
BATCH_SIZE = 32
EPOCHS = 10
DROPOUT_FACTOR = 0.5
print('RNN / HIDDENS = {}, {}'.format(RNN, NUM_HIDDEN_UNITS))
max_len = 500       # maximum number of tokens per story
word_vec_dim = 300  # dimensionality of the spaCy word vectors
vocab_size = 2350   # size of the output (answer) vocabulary
training_set_file = 'data/training_set.dat'
test_set_file = 'data/test_set.dat'
train_stories = pickle.load(open(training_set_file, 'rb'))
test_stories = pickle.load(open(test_set_file, 'rb'))
# Flatten each story's sentences into a single list of tokens, keeping the answer
train_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in train_stories]
test_stories = [(reduce(lambda x, y: x + y, map(list, fact)), q) for fact, q in test_stories]
# Map every answer (disease) to an integer class index
answer_vocab = sorted(reduce(lambda x, y: x | y, (set([answer]) for _, answer in train_stories + test_stories)))
answer_dict = dict((word, i) for i, word in enumerate(answer_vocab))
print('Answers dict len: {0}'.format(len(answer_dict)))
# TODO: check whether these word-vector files exist before loading them
#word_vectors_dir = 'word_vectors/glove.42B.300d.txt'
#word_vectors_model = Glove.load_stanford(word_vectors_dir)
nlp = English()  # spaCy English model, used to obtain word vectors for the stories
print('Build model...')
model = Sequential()
# Two stacked GRUs over the sequence of word vectors, followed by a softmax over answers
model.add(GRU(output_dim=NUM_HIDDEN_UNITS, activation='tanh',
              return_sequences=True, input_shape=(max_len, word_vec_dim)))
model.add(Dropout(DROPOUT_FACTOR))
model.add(GRU(NUM_HIDDEN_UNITS, return_sequences=False))
model.add(Dense(vocab_size, init='uniform', activation='softmax'))
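# Shape flow through the model (batch dimension omitted), as implied by the layer
# arguments above:
#   input (max_len, word_vec_dim) word vectors
#   -> GRU(return_sequences=True)  -> (max_len, NUM_HIDDEN_UNITS)
#   -> Dropout                     -> (max_len, NUM_HIDDEN_UNITS)
#   -> GRU(return_sequences=False) -> (NUM_HIDDEN_UNITS,)
#   -> Dense + softmax             -> (vocab_size,) probabilities over answers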
#json_string = model.to_json()
#model_file_name = 'models/lstm_num_hidden_units_' + str(NUM_HIDDEN_UNITS) + '_num_lstm_layers_' + str(2) + '_dropout_' + str(0.3)
#open(model_file_name + '.json', 'w').write(json_string)
print('Compiling model...')
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Compilation done...')
print('Training')
base_dir = '.'
# Hold out the last 5% of the (shuffled) training stories for validation
random.shuffle(train_stories)
valid_stories = train_stories[int(len(train_stories) * 0.95):]
train_stories = train_stories[:int(len(train_stories) * 0.95)]
print('Validation size: {0}'.format(len(valid_stories)))
print('Training size: {0}'.format(len(train_stories)))
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks; the last chunk is padded with fillvalue."""
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)
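# Illustrative example of grouper (hypothetical values): with a chunk size of 2,
#   list(grouper(['s1', 's2', 's3'], 2, fillvalue='s3'))
#   -> [('s1', 's2'), ('s3', 's3')]
# i.e. the last, partial chunk is padded by repeating the final story, which is
# exactly how grouper is called in the training and test loops below.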
acc_hist = [0.]  # validation accuracy per epoch, seeded with 0
show_batch_interval = 50
for k in xrange(EPOCHS):
    for b, train_batch in enumerate(grouper(train_stories, BATCH_SIZE, fillvalue=train_stories[-1])):
        X, Y = get_spacy_vectors(train_batch, answer_dict, max_len, nlp)
        loss = model.train_on_batch(X, Y)
        if b % show_batch_interval == 0:
            print('Epoch: {0}, Batch: {1}, loss: {2}'.format(k, b, loss))
    # Evaluate on the held-out validation stories after every epoch
    X, Y = get_spacy_vectors(valid_stories, answer_dict, max_len, nlp)
    loss, acc = model.evaluate(X, Y, batch_size=BATCH_SIZE)
    print('Epoch {0}, Valid loss / valid accuracy = {1:.4f} / {2:.4f}'.format(k, loss, acc))
    # Logging results
    with open(base_dir + '/logs/log_{0}_{1}_drop_{2}.txt'.format(
            'GRU', str(NUM_HIDDEN_UNITS), str(DROPOUT_FACTOR)), 'a') as fil:
        fil.write(str(loss) + ' ' + str(acc) + '\n')
    # Save the weights whenever the validation accuracy improves on the best so far
    if max(acc_hist) < acc:
        model.save_weights(base_dir + '/models/weights_{0}_{1}_drop_{2}.hdf5'.format(
            'GRU', str(NUM_HIDDEN_UNITS), str(DROPOUT_FACTOR)), overwrite=True)
    acc_hist.append(acc)
# Obtaining test results
# Evaluating top-1 accuracy and top-5 accuracy
SAVE_ERRORS = False  # set to True to dump per-example cross-entropy errors to logs/error.dat
acc_5 = 0.
acc = 0.
for b, test_batch in enumerate(grouper(test_stories, BATCH_SIZE, fillvalue=test_stories[-1])):
    X, Y = get_spacy_vectors(test_batch, answer_dict, max_len, nlp)
    answers_test = Y if b == 0 else np.vstack((answers_test, Y))
    preds = model.predict(X)
    # Saving in order to make some more visualizations
    all_predictions = preds if b == 0 else np.vstack((all_predictions, preds))
    if b % 50 == 0:
        print('Batch: {0}'.format(b))
# Drop the rows coming from the padding of the last batch
all_predictions = all_predictions[:len(test_stories)]
answers_test = answers_test[:len(test_stories)]
# Invert answer_dict once so predicted class indices map back to disease names
inv_answer_dict = dict((i, word) for word, i in answer_dict.items())
for k, (pred, answer) in enumerate(zip(all_predictions, answers_test)):
    # Indices of the five most probable answers, most probable first
    prediction = np.argsort(pred)[-5:][::-1]
    pred_words = [inv_answer_dict[p] for p in prediction]
    answer_word = inv_answer_dict[answer.argmax()]
    if answer_word in pred_words:
        acc_5 += 1.
    if pred_words[0] == answer_word:
        acc += 1.
if SAVE_ERRORS:
    # Cross-entropy of the true class for every test example
    all_err = -np.log(all_predictions[range(all_predictions.shape[0]), answers_test.argmax(axis=1)])
    np.savetxt('logs/error.dat', all_err)
acc /= len(test_stories)
acc_5 /= len(test_stories)
print('Accuracy: {0}'.format(acc))
print('Top-5 accuracy: {0}'.format(acc_5))