-
Notifications
You must be signed in to change notification settings - Fork 1
/
train_spliceAI2k.py
119 lines (87 loc) · 3.49 KB
/
train_spliceAI2k.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from model import *
from utils import *
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler
import math
import numpy as np
import time
# Diagnostic only: report whether TensorFlow is executing eagerly.
print('eagerly?', tf.executing_eagerly())

# TRAINING PARAMETERS
# NOTE(review): in the visible script only `batch_size` is read again further
# down; `num_classes` and `epochs` are assigned but never referenced.
batch_size, num_classes, epochs = 128, 3, 1
class DataGenerator(keras.utils.Sequence):
    """Serve aligned mini-batches of inputs and labels to Keras.

    Args:
        x_set: Input samples; must support ``len()`` and slicing.
        y_set: Labels, sliced with the same indices as ``x_set``.
        batch_size: Number of samples per batch; the last batch may be
            smaller when ``len(x_set)`` is not a multiple of it.
        **kwargs: Forwarded unchanged to the ``keras.utils.Sequence``
            constructor.
    """

    def __init__(self, x_set, y_set, batch_size, **kwargs):
        # Initialise the base class first so whatever state it sets up exists
        # before Keras starts pulling batches from this object.
        super().__init__(**kwargs)
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx`` as an ``(inputs, labels)`` pair of NumPy arrays."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        # Slicing past the end is safe: the final batch is simply shorter.
        return np.array(self.x[start:stop]), np.array(self.y[start:stop])
# Start the clock for the data-preparation phase.
start_time = time.time()

# importing the data: both tables are tab-delimited and read as strings.
transcripts, labels = (
    np.loadtxt(path, dtype='str', delimiter='\t')
    for path in ('./data/transcripts_chr1_3', './data/labels_chr1_3')
)

# one-hot-encoding (performed by the imported `transform_input` helper)
transcripts, labels = transform_input(transcripts, labels)
transcripts, labels = np.array(transcripts), np.array(labels)

# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(
    transcripts, labels, test_size=0.2
)

# Per-sample shape: everything after the leading sample axis.
input_shape = x_train.shape[1:]

n_train, n_test = len(x_train), len(x_test)
print('x_train shape:', x_train.shape)
print(n_train, 'train samples')
print(n_test, 'test samples')
print('y_train shape:', y_train.shape)

prep_seconds = time.time() - start_time
print("Data prep: {} seconds".format(prep_seconds))
# Callback that asks `lr_schedule` for the learning rate as each epoch begins.
lr_scheduler = LearningRateScheduler(lr_schedule)

model = spliceAI_model(input_shape=input_shape)
model.compile(
    loss=custom_crossentropy_loss,
    optimizer=Adam(learning_rate=lr_schedule(0)),
    metrics=['accuracy'],
)
# `summary()` prints the layer table itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
model.summary()

start_time = time.time()
training_generator = DataGenerator(x_train, y_train, batch_size)

# Train one epoch per `fit` call so the held-out top-k accuracy can be checked
# (and the model saved) after every epoch, stopping early once it is good
# enough.
first_epoch = 1
max_epochs = 10
target_accuracy = 0.90
# NOTE(review): Keras epoch indices are 0-based; starting at 1 means the
# scheduler callback is never invoked for epoch 0 — confirm this is intended.
for epoch in range(first_epoch, first_epoch + max_epochs):
    model.fit(
        training_generator,
        epochs=epoch + 1,
        initial_epoch=epoch,
        callbacks=[lr_scheduler],
        shuffle=True,
    )
    y_pred = model.predict(x_test)
    acc = topk_accuracy_(y_test, y_pred)
    print('Current top-k accuracy: {:.2f}'.format(acc))
    # Save after every epoch so the latest weights survive an interruption.
    model.save('./data/model_spliceAI2k_chr1_3')
    if acc > target_accuracy:
        break
print("Fitting: {} seconds".format(time.time() - start_time))
# Final loss and metric values on the held-out split, in the order returned
# by `evaluate`.
scores = model.evaluate(x_test, y_test, verbose=1)
test_loss, test_accuracy = scores[0], scores[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# Predict once more, then let the imported `transform_output` helper convert
# the true and predicted labels into the form compared below.
y_pred = model.predict(x_test)
y_test, y_pred = transform_output(y_test, y_pred)
def _count_label_hits(true_rows, pred_rows, classes=('d', 'a', 'b')):
    """Count occurrences and exact matches of each label in ``classes``.

    Args:
        true_rows: Iterable of per-sample sequences of true labels.
        pred_rows: Iterable of per-sample sequences of predicted labels.
        classes: Labels to tally; any other true label is ignored.

    Returns:
        A ``(totals, hits)`` pair of dicts keyed by label. ``totals[c]`` is
        how often ``c`` appears as a true label; ``hits[c]`` is how often the
        prediction at that same position is also ``c``.

    Rows are walked pairwise with ``zip``, so each row is scanned over its
    own length (up to the shorter of the two sequences) instead of assuming
    every row is as long as the first one.
    """
    totals = dict.fromkeys(classes, 0)
    hits = dict.fromkeys(classes, 0)
    for true_row, pred_row in zip(true_rows, pred_rows):
        for true_label, pred_label in zip(true_row, pred_row):
            if true_label not in totals:
                continue
            totals[true_label] += 1
            if true_label == pred_label:
                hits[true_label] += 1
    return totals, hits


# Per-class totals and true positives: 'd' = donor, 'a' = acceptor, 'b' = blank.
_label_totals, _label_hits = _count_label_hits(y_test, y_pred)
donor, acceptor, blank = (_label_totals[c] for c in ('d', 'a', 'b'))
donor_t_p, acceptor_t_p, blank_t_p = (_label_hits[c] for c in ('d', 'a', 'b'))
print(
    "Out of {} blank {} TP, out of {} donor {} TP, out of {} acceptor {} TP".format(
        blank, blank_t_p, donor, donor_t_p, acceptor, acceptor_t_p))