forked from ratschlab/RGAN
-
Notifications
You must be signed in to change notification settings - Fork 0
/
eICU_task.py
218 lines (211 loc) · 10.6 KB
/
eICU_task.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env ipython
#
# Scripts for helping us to identify doable tasks in the eICU.
#
#
import pandas as pd
import numpy as np
import pdb
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
import itertools
def get_train_data(df, n_hours, seq_length, resample_time,
future_window_size=0,
sao2_low=96, heartrate_low=75, respiration_low=15, systemicmean_low=75,
heartrate_high=100, respiration_high=20, systemicmean_high=100):
"""
seq_length is how many measurements we use for training
"""
patients = set(df.pid)
window_size = int(n_hours*60/resample_time) # this is how many rows in the window
X = np.empty(shape=(len(patients), 4*seq_length)) # we have 4*seq_length features
Y = np.empty(shape=(len(patients), 7)) # we have 7 labels
i = 0
kept_patients = [-1337]*len(patients)
for pat in patients:
df_pat_withlabels = get_labels(df, pat, seq_length, window_size,
future_window_size, sao2_low, heartrate_low, respiration_low,
systemicmean_low, heartrate_high, respiration_high, systemicmean_high)
if df_pat_withlabels is None:
print('Skipping patient', pat, 'for having too little data')
continue
# subset to train period
df_pat_train = df_pat_withlabels.head(seq_length)
if df_pat_train.shape[0] < seq_length:
print('Skipping patient', pat, 'for having too little data')
continue
X_pat = df_pat_train[['sao2', 'heartrate', 'respiration', 'systemicmean']].values.reshape(4*seq_length)
if np.isnan(X_pat).any():
# this should not happen any more btw
print('Dropping patient', pat, 'for having NAs')
# just ignore this row
else:
X[i, :] = X_pat
Y[i, :] = df_pat_train.tail(1)[['low_sao2', 'low_heartrate', 'low_respiration', \
'low_systemicmean', 'high_heartrate', 'high_respiration', 'high_systemicmean']].values.reshape(7)*1
kept_patients[i] = pat
i += 1
print('Kept data on', i, 'patients (started with', len(patients), ')')
# delete the remaining rows
X = X[:i]
Y = Y[:i]
return X, Y, kept_patients
def train_model(X, Y, X_test=None, Y_test=None):
"""
simple classification task
"""
results = []
n_labels = Y.shape[1]
if X_test is None and Y_test is None:
n_samples = X.shape[0]
n_train = int(0.8*n_samples)
perm = np.random.permutation(n_samples)
train_indices = perm[:n_train]
test_indices = perm[n_train:]
X_test = X[test_indices]
X_train = X[train_indices]
Y_test = Y[test_indices]
Y_train = Y[train_indices]
else:
X_train = X
Y_train = Y
for label in range(n_labels):
y = Y[:, label]
model = RandomForestClassifier(n_estimators=50).fit(X_train, Y_train[:, label])
predict = model.predict(X_test)
accuracy = sklearn.metrics.accuracy_score(Y_test[:, label], predict)
precision = sklearn.metrics.precision_score(Y_test[:, label], predict)
recall = sklearn.metrics.recall_score(Y_test[:, label], predict)
results.append([accuracy, precision, recall])
return results
#sao2_high = 105 # does not exist
heartrate_low = 75
heartrate_high = 100
respiration_low = 15
respiration_high = 20
systemicmean_low = 65
systemicmean_high = 100
def get_df(resample_time=15, n_hours=1, seq_length=16):
derived_dir = 'REDACTED'
print('getting patients')
patients = list(map(int, np.loadtxt(derived_dir + 'cohort_complete_resampled_pats_' + str(resample_time) + 'min.csv')))
data = derived_dir + 'resampled_pats_' + str(resample_time) + 'min.csv'
print('getting data')
df = pd.read_csv(data)
print('subsetting to "complete" patients')
max_offset = 1.5*(seq_length*resample_time + 60*n_hours) # for good measure
print('subsetting by time')
df = df[df.offset < max_offset]
# drop patients missing any data in this region (this is slow but it's worth it)
df = df.groupby('pid').filter(lambda x: np.all(np.isfinite(x.values)) and x.shape[0] > seq_length)
return df
def run_grid(seq_length=16, future_window_size=0):
sao2_low_vals = [95, 90, 80]
heartrate_low_vals = [50, 60, 75]
respiration_low_vals = [5, 10, 15]
systemicmean_low_vals = [65, 75]
heartrate_high_vals = [90, 100, 110]
respiration_high_vals = [20, 30]
systemicmean_high_vals = [110, 130, 150]
derived_dir = 'REDACTED'
grid_file = open('grid_out.txt', 'w')
grid_file.write('resample_time n_hours seq_length future_window_size sao2_low heartrate_low \
respiration_low systemicmean_low heartrate_high respiration_high \
systemicmean_high \
sao2_low_accuracy sao2_low_precision sao2_low_recall \
heartrate_low_accuracy heartrate_low_precision heartrate_low_recall \
respiration_low_accuracy respiration_low_precision respiration_low_recall \
systemicmean_low_accuracy systemicmean_low_precision systemicmean_low_recall \
heartrate_high_accuracy heartrate_high_precision heartrate_high_recall \
respiration_high_accuracy respiration_high_precision respiration_high_recall \
systemicmean_high_accuracy systemicmean_high_precision systemicmean_high_recall\n')
for resample_time in [15, 30]:
print('getting patients')
patients = list(map(int, np.loadtxt(derived_dir + 'cohort_complete_resampled_pats_' + str(resample_time) + 'min.csv')))
data = derived_dir + 'resampled_pats_' + str(resample_time) + 'min.csv'
print('getting data')
df = pd.read_csv(data)
print('subsetting to "complete" patients')
df = df[df.pid.isin(patients)]
for n_hours in [1, 2, 3, 4, 5, 10]:
max_offset = 1.5*(seq_length*resample_time + 60*n_hours) # for good measure
print('subsetting by time')
df = df[df.offset < max_offset]
# drop patients missing any data in this region (this is slow but it's worth it)
print('deleting patients missing data and with too little data')
df = df.groupby('pid').filter(lambda x: np.all(np.isfinite(x.values)) and x.shape[0] > seq_length)
for (sao2_low, heartrate_low, respiration_low, systemicmean_low, heartrate_high, respiration_high, systemicmean_high) in itertools.product(sao2_low_vals, heartrate_low_vals, respiration_low_vals, systemicmean_low_vals, heartrate_high_vals, respiration_high_vals, systemicmean_high_vals):
# restrict
settings = [resample_time, n_hours, seq_length, future_window_size, sao2_low,
heartrate_low, respiration_low, systemicmean_low,
heartrate_high, respiration_high, systemicmean_high]
print(settings)
grid_file.write(' '.join(map(str, settings)) + ' ')
X, Y = get_train_data(df, n_hours, seq_length, resample_time, future_window_size,
sao2_low, heartrate_low, respiration_low, systemicmean_low,
heartrate_high, respiration_high, systemicmean_high)
results = train_model(X, Y)
for accuracy, precision, recall in results:
grid_file.write(str(accuracy) + ' ' + str(precision) + ' ' + str(recall))
grid_file.write('\n')
grid_file.flush()
return True
def get_labels(df, patient, seq_length, window_size, future_window_size,
sao2_low, heartrate_low, respiration_low, systemicmean_low,
heartrate_high, respiration_high, systemicmean_high):
df_pat = df[df.pid == patient]
if df_pat.shape[0] < seq_length + window_size:
return None
df_pat.set_index('offset', inplace=True)
df_pat.sort_index(inplace=True)
df_pat_rollmins = df_pat.fillna(1337).rolling(window_size).min()
df_pat_rollmaxs = df_pat.fillna(-1337).rolling(window_size).max()
# get thresholds
low_sao2 = df_pat_rollmins.sao2 < sao2_low
low_heartrate = df_pat_rollmins.heartrate < heartrate_low
low_respiration = df_pat_rollmins.respiration < respiration_low
low_systemicmean = df_pat_rollmins.systemicmean < systemicmean_low
high_heartrate = df_pat_rollmaxs.heartrate > heartrate_high
high_respiration = df_pat_rollmaxs.respiration > respiration_high
high_systemicmean = df_pat_rollmaxs.systemicmean > systemicmean_high
# extremes
df_pat_labels = pd.DataFrame({'low_sao2': low_sao2.values,
'low_heartrate': low_heartrate.values,
'low_respiration': low_respiration.values,
'low_systemicmean': low_systemicmean.values,
'high_heartrate': high_heartrate.values,
'high_respiration': high_respiration.values,
'high_systemicmean': high_systemicmean.values})
# now we need to align it - first move it back to 0 (subtract window_size),
# then shift it forward by future_window_size (when we want to make
# predictions about)
df_pat_labels_aligned = df_pat_labels.shift(-window_size + 1 + future_window_size)
df_pat_labels_aligned.index = df_pat.index
df_pat_withlabels = pd.concat([df_pat, df_pat_labels_aligned], axis=1)
return df_pat_withlabels
def gen_data():
""" just run the whole thing """
df = get_df(resample_time=15, n_hours=1, seq_length=16)
X, Y, pids = get_train_data(df, 1, 16, 15, sao2_low=95, respiration_low=13, respiration_high=25, heartrate_low=70, heartrate_high=100, systemicmean_low=70, systemicmean_high=110)
extreme_heartrate = Y[:, 1] + Y[:, 4]
extreme_respiration = Y[:, 2] + Y[:, 5]
extreme_MAP = Y[:, 3] + Y[:, 6]
Y_OR = np.vstack((extreme_heartrate, extreme_respiration, extreme_MAP)).T
Y_OR = (Y_OR>0)*1
pp = [p for p in pids if not p == -1337]
X_train, X_test, Y_train, Y_test, pids_train, pids_test = sklearn.model_selection.train_test_split(X, Y, pp, test_size=0.2, stratify=Y)
X_train, X_vali, Y_train, Y_vali, pids_train, pids_vali = sklearn.model_selection.train_test_split(X_train, Y_train, pids_train, test_size=0.25, stratify=Y_train)
Y_columns = ['low_sao2', 'low_heartrate', 'low_respiration', 'low_systemicmean', 'high_heartrate', 'high_respiration', 'high_systemicmean']
data = dict()
data['X_train'] = X_train
data['X_vali'] = X_vali
data['X_test'] = X_test
data['Y_train'] = Y_train
data['Y_vali'] = Y_vali
data['Y_test'] = Y_test
data['pids_train'] = pids_train
data['pids_vali'] = pids_vali
data['pids_test'] = pids_test
data['Y_columns'] = Y_columns
data['Y_ORs'] = Y_OR
np.save('eICU_task_data2.npy', data)