-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProjectivePrediction_Eval.R
226 lines (184 loc) · 9.88 KB
/
ProjectivePrediction_Eval.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#Run this script after BRM fit and running cv_varsel to find reduced variable ranking
#------------------------------------------------------------------------
#Now we evaluate the PROJECTIVE PREDICTION model - we first build a projection
#1.The model is evaluated first using ALL 69 dummy variables in cv_varsel with LOO.
#The reduced version is done at end of script
#2.We then examine these variables and create list of reduced variable candidates where
#we remove any variables associated with a biomarker if the '_NA' category has the highest ranking
#for improving AUC
#3 We fit a second BRM model using this list of reduced variables
#with all conisdered variables, then we
#------------------------------------------------------------------------
#helper functions for understanding var sel results
#summary.vsel(), print.vsel(), plot.vsel(), suggest_size.vsel(), solution_terms.vsel()
cv_freq=vs$pct_solution_terms_cv
if (!is.null(cv_freq)) {
cv_freq = cv_freq[, 2:dim(cv_freq)[2] ] #remove size column
}
if (!exists('b_style')) { b_style = 'NA' }
sink(paste(save_path, b_style, '_MaxTerms', nt_max, '_K', N_k, 'Bayes-varSelect-Summary.txt',sep = ''))
#examine variable selection results
print('LOG ODDS')
print(refmodel$fit)
print('ODDS')
print(exp(fixef(refmodel$fit)))
print('VARIABLE SELECTION RESULT')
print(vs)
out=summary(vs, stats = 'elpd', type = c("mean", "se", "lower", "upper", "diff", "diff.se"))
print('ELPD SUMMARY')
print(out)
print('ELPD DETAILED SELECTION')
print(out$selection)
out=summary(vs, stats = 'auc', type = c("mean", "se", "lower", "upper", "diff", "diff.se"))
print('AUC SUMMARY')
print(out)
print('AUC DETAILED SELECTION')
print(out$selection)
#should have frequencies of variables selected
print('VARIABLE SELECTION LOO FREQUENCY')
print(cv_freq)
sink()
if (!is.null(cv_freq)) {
#flips it upside down so urea is at the top
cv_freq2 = apply(cv_freq,2,rev)
colnames(cv_freq2) = rev(colnames(cv_freq))
dmelt = melt(cv_freq2)
colnames(dmelt) = c('Rank','Biomarker','value')
ggplot(dmelt, aes(x = Rank,
y = Biomarker,
fill = value)) + geom_tile()
ggsave(paste(save_path, b_style, '_MaxTerms', nt_max, '_K', N_k, 'Bayes_VarSelect_FrequencyHeatmap.pdf',sep = ''), device = 'pdf',
width = 20, height = 20, units = 'cm', dpi = 300)
}
plot(vs, stats = c('auc', 'elpd'))
ggsave(paste(save_path, b_style, '_MaxTerms', nt_max, '_K', N_k, 'Bayes-varSelect-AUC-ELPD.pdf',sep = ''), device = 'pdf',
width = 20, height = 20, units = 'cm', dpi = 300)
#number of terms to use #default alpha is 0.32 (68%) so variables in 1SD
#nv <- dim(cv_freq)[2] # suggest_size(vs,alpha = 0.2) #0.2 maps to 80
#Note sometimes suggest_size returns NA even when everything else goes fine, bug?
# Visualise the most relevant variables in the full model -->
mcmc_areas(as.matrix(refmodel$fit),
pars = c("b_Intercept", paste0("b_", solution_terms(vs)))) +
coord_cartesian(xlim = c(-3, 3))
ggsave(paste(save_path, b_style, '_MaxTerms', nt_max, '_K', N_k, 'Bayes-MCMC_ReducedVar.pdf',sep = ''), device = 'pdf',
width = 20, height = 20, units = 'cm', dpi = 300)
#------------------------------------------------------------------------------
#We make predictions with the projected submodels. For point estimates we can use method
#proj_predict. Test inputs can be provided using the keyword newdata.
#It also the test targets ynew are provided in newdata, then the function evaluates
#the log predictive density at these points. For instance, the following computes
#the mean of the predictive distribution and evaluates the log density at the training
#points using the most relevant variables.
nv = length(vs$solution_terms)
pp_str = paste('ProjPred_V',nv, '_', sep = '')
proj <- project(vs, nterms = nv ,seed = 123456,ns = 2000)
#---------------------------------------------
#eval internal training data performance
SelectedData_str ='Train_FullTrainData_Test_TrainData'
pp_data = tr_backup
source(paste(work_path, 'ProjectivePrediction_Perf.R', sep = ''))
glm_traindata_batch_df9[m_ctr,] <- c('ProjPred', readingwanted_str, dateRange, outcome_str,
out_acc,out_auc, out_brier, out_sen, out_spec)
write.table(glm_traindata_batch_df9, file = paste(output_path, pp_str, SelectedData_str,
'Batch_ProjPred_Summary_Table.csv',
sep = ''),
row.names = FALSE, sep = ',')
#---------------------------------------------
#Now eval external validation data performance
SelectedData_str ='Train_FullTrainData_Test_Generalise'
pp_data = test_backup
source(paste(work_path, 'ProjectivePrediction_Perf.R', sep = ''))
glm_traindata_batch_df10[m_ctr,] <- c('ProjPred', readingwanted_str, dateRange, outcome_str,
out_acc,out_auc, out_brier, out_sen, out_spec)
write.table(glm_traindata_batch_df10, file = paste(output_path, pp_str, SelectedData_str,
'Batch_ProjPred_Summary_Table.csv',
sep = ''),
row.names = FALSE, sep = ',')
#---------------------------------------------
#----Clinician advised variables to select
#To be able to select these we need to run variable selection with all the variables (~70)
# several NAs as the top contributor in improving AUC - how to handle?
#AUD=G 0.71 / I AUC=0.84 with all variables
solterms = c( "UreaAbnormal", "UreaNA", ##AUC=0.62
#"poctLACNA" #NA first ranked for biomarker
#"O2NA" #NA first ranked for biomarker
#"CO2NA" #NA first ranked for biomarker
"Age", #AUC=0.64
"PTAbnormal","PTNA", ##AUC=0.68
"NLRMild","NLRModerate","NLRSevere","NLRNA" #AUC = G 0.70 // I 0.75 **
#"LDHNA" #NA first ranked for biomarker
#"poctpHAbnormal","poctpHNA", #AUC=0.70**
#"LymphocytesMild","LymphocytesModerate","LymphocytesSevere","LymphocytesNA", #AUC= I 0.76/ G 0.70**
#"APTTMild", "APTTModerate", "APTTNA", #AUC=0.68
#"eGFRAbnormal", "eGFRNA", #.68
#"NeutrophilsMild", "NeutrophilsModerate", "NeutrophilsSevere", "NeutrophilsNA", #AUC= I 0.77 / G 0.69
#"FERNA" #NA first ranked for biomarker
#"fibNA" #NA first ranked for biomarker
#"CRPAbnormal", "CRPNA", #AUC=0.69
#"DDMAbnormal", "DDMNA", #AUC=0.69
#"HBSevere", "HBModerate", "HBMild", "HBNA", #AUC=0.69
#'WCCSevere', "WCCModerate", "WCCMild", "WCCNA", #AUC=0.69
#'PLTSevere', "PLTModerate", "PLTMild", "PLTCNA" #AUC G 0.70 // AUC I 0.80 **
)
#We make predictions with the projected submodels. For point estimates we can use method
#proj_predict. Test inputs can be provided using the keyword newdata.
#It also the test targets ynew are provided in newdata, then the function evaluates
#the log predictive density at these points. For instance, the following computes
#the mean of the predictive distribution and evaluates the log density at the training
#points using the most relevant variables.
nv <- length(solterms)
pp_str = paste('ProjPred_V',nv, '_', sep = '')
proj <- project(vs, solution_terms = solterms,ns = 2000)
#---------------------------------------------
#eval internal training data performance
SelectedData_str ='Train_FullTrainData_Test_TrainData'
pp_data = tr_backup
source(paste(work_path, 'ProjectivePrediction_Perf.R', sep = ''))
glm_traindata_batch_df11[m_ctr,] <- c('ProjPred', readingwanted_str, dateRange, outcome_str,
out_acc,out_auc, out_brier, out_sen, out_spec)
write.table(glm_traindata_batch_df11, file = paste(output_path, pp_str, SelectedData_str,
'Batch_ProjPred_Summary_Table.csv',
sep = ''),
row.names = FALSE, sep = ',')
#---------------------------------------------
#Now eval external validation data performance
SelectedData_str ='Train_FullTrainData_Test_Generalise'
pp_data = test_backup
source(paste(work_path, 'ProjectivePrediction_Perf.R', sep = ''))
glm_traindata_batch_df12[m_ctr,] <- c('ProjPred', readingwanted_str, dateRange, outcome_str,
out_acc,out_auc, out_brier, out_sen, out_spec)
write.table(glm_traindata_batch_df12, file = paste(output_path, pp_str, SelectedData_str,
'Batch_ProjPred_Summary_Table.csv',
sep = ''),
row.names = FALSE, sep = ',')
#Get the model coefficients
beta_store <- matrix(data = NA, nrow = nv, ncol = length(proj$submodl))
alpha_store <- matrix(data = NA, nrow = 1, ncol = length(proj$submodl))
for (xcx in seq(1,length(proj$submodl))) {
beta_store[,xcx] = exp(proj$submodl[[xcx]]$beta)
alpha_store[,xcx] = exp(proj$submodl[[xcx]]$alpha)
}
rownames(beta_store) <- rownames(proj$submodl[[1]]$beta)
#get coefficient and upper/lower bound
c2_5 = round(dim(beta_store)[2]*0.025)
c50 = round(dim(beta_store)[2]*0.5)
c97_5 = round(dim(beta_store)[2]*0.975)
beta_coef_store <- matrix(nrow = nv, ncol = 3)
rownames(beta_coef_store) <- rownames(proj$submodl[[1]]$beta)
for (xcx in seq(1,dim(beta_store)[1])) {
tmp = sort(beta_store[xcx,])
beta_coef_store[xcx,1] = tmp[c2_5]
beta_coef_store[xcx,2] = tmp[c50]
beta_coef_store[xcx,3] = tmp[c97_5]
}
tmp = sort(alpha_store)
alpha_coef_store = matrix()
alpha_coef_store[1] = tmp[c2_5]
alpha_coef_store[2] = tmp[c50]
alpha_coef_store[3] = tmp[c97_5]
sink(paste(save_path, '_PP_Projection_Coefficients.csv',sep = ''))
print('Predictor Variables')
print(beta_coef_store)
print('Intercept')
print(alpha_coef_store)
sink()