-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_train_test.py
87 lines (68 loc) · 2.69 KB
/
make_train_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import numpy as np
from numpy.random import seed
import sys
import os
from sklearn.externals import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
seed(913824)
def encode_categories(data, class_dict):
    """Map MIC labels to the position of their class in ``class_dict``.

    Labels and class names are compared by the integer formed from their
    decimal digits only; every other character (``<``, ``>``, ``=``, ``.``,
    quotes, ...) is discarded. ``">=32"`` and ``"32"`` therefore compare
    equal, and ``"0.5"`` is compared as ``5``.

    Args:
        data: Iterable of labels. Each one is converted with ``str()``
            before its digits are extracted.
        class_dict: Class-name strings addressable as ``class_dict[0]`` ..
            ``class_dict[len(class_dict) - 1]``; the position of a class is
            its encoded value.

    Returns:
        A 1-D NumPy integer array with one entry per label. A label that
        matches no class is kept as its digit-derived integer rather than
        being dropped.

    Raises:
        ValueError: If a label or a class name contains no digits at all.
    """
    def digits_to_int(text):
        # Keep only the decimal digits, e.g. ">=32" -> 32 and "0.5" -> 5.
        return int(''.join(filter(str.isdigit, text)))

    # Digit key -> index of the FIRST class carrying that key.  Resolving each
    # label with a single lookup fixes the old scan, which kept looping after
    # a hit and could re-map the freshly assigned index when that index
    # happened to equal the digit key of a later class.
    # Positional indexing (not iteration) is kept on purpose so that any
    # container supporting len() and integer keys 0..n-1 still works.
    index_by_key = {}
    for index in range(len(class_dict)):
        index_by_key.setdefault(digits_to_int(class_dict[index]), index)

    codes = []
    for item in data:
        key = digits_to_int(str(item))
        # Unmatched labels fall back to their raw digit value.
        codes.append(index_by_key.get(key, key))

    # One append instead of one per element: same resulting array (and the
    # same dtype the element-wise appends produced) without re-copying the
    # whole array for every label.  Empty input stays an empty 'i4' array.
    encoded = np.array([], dtype='i4')
    if codes:
        encoded = np.append(encoded, codes)
    return encoded
if __name__ == "__main__":
    #######################################################
    # Creates a total of 5 sets of training/testing data
    # for each drug. Does feature selection using the given
    # number of features.
    #
    # Call with "time python make_train_test.py <numfeats>"
    # (<numfeats> defaults to 270 when omitted).
    #
    # For every drug column of the MIC dataframe this writes
    #   ./amr_data/<drug>/<numfeats>feats/fold<1..5>/
    #       x_train.npy, x_test.npy, y_train.npy, y_test.npy,
    #       genome_train.npy, genome_test.npy
    #######################################################
    # Kept as the raw string because it is embedded verbatim in the output
    # directory name; the integer form is what SelectKBest needs.
    NUM_FEATS = sys.argv[1] if len(sys.argv) > 1 else '270'  # default = 270
    num_feats = int(NUM_FEATS)  # fail on a non-numeric argument before loading any data

    # NOTE(review): joblib.load unpickles its input - only point it at trusted files.
    df = joblib.load("amr_data/mic_class_dataframe.pkl")  # Matrix of experimental MIC values
    mic_class_dict = joblib.load("amr_data/mic_class_order_dict.pkl")  # Matrix of classes for each drug

    for drug in df.columns:
        print("\n********************",drug,"*******************")

        drug_dir = os.path.join('.', 'amr_data', drug)
        matrix = np.load(os.path.join(drug_dir, 'kmer_matrix.npy'))
        rows_mic = np.load(os.path.join(drug_dir, 'kmer_rows_mic.npy'))
        rows_gen = np.load(os.path.join(drug_dir, 'kmer_rows_genomes.npy'))

        # NOTE(review): the features are selected on the full data set (test
        # rows included) before it is split - confirm this is intended.
        X = SelectKBest(f_classif, k=num_feats).fit_transform(matrix, rows_mic)
        Z = rows_gen

        # Encode every label exactly once.  Encoding inside the fold loop
        # rewrote the whole label array on every fold, so folds 2-5 encoded
        # labels that were already class indices.  The result is written into
        # a copy of rows_mic so the saved label arrays keep the dtype they had
        # before, and rows_mic itself stays untouched for stratification.
        # Labels are saved as class indices, not one-hot vectors.
        Y = rows_mic.copy()
        Y[:] = encode_categories(rows_mic, mic_class_dict[drug])

        # No random_state: without shuffle=True it has no effect, and newer
        # scikit-learn releases reject the combination with a ValueError.
        # The folds are the same deterministic, unshuffled ones as before.
        cv = StratifiedKFold(n_splits=5)

        feats_dir = os.path.join(drug_dir, str(NUM_FEATS) + 'feats')
        os.makedirs(feats_dir, exist_ok=True)

        # Stratify on the original labels.  The genome ids are not passed as
        # `groups` because StratifiedKFold ignores that argument.
        for fold, (train, test) in enumerate(cv.split(X, rows_mic), start=1):
            fold_dir = os.path.join(feats_dir, 'fold' + str(fold))
            os.makedirs(fold_dir, exist_ok=True)

            np.save(os.path.join(fold_dir, 'x_train.npy'), X[train])
            np.save(os.path.join(fold_dir, 'x_test.npy'), X[test])
            np.save(os.path.join(fold_dir, 'y_train.npy'), Y[train])
            np.save(os.path.join(fold_dir, 'y_test.npy'), Y[test])
            np.save(os.path.join(fold_dir, 'genome_train.npy'), Z[train])
            np.save(os.path.join(fold_dir, 'genome_test.npy'), Z[test])