-
Notifications
You must be signed in to change notification settings - Fork 0
/
ProjectivePrediction_Compute.R
106 lines (83 loc) · 4.14 KB
/
ProjectivePrediction_Compute.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#After getting confidence intervals for cross validated Bayesian models. We
#want to train our final model on all of the training data. Once we have this
#model we can then apply projective prediction to rank variables according to
#their contribution. This final set of variables can then be used to
#run a final GLM/Bayes model using only those variables
#k-fold variable selection can't deal with 1/0 needs to be T/F
#for (j in seq(1,dim(train.data)[2])){
# train.data[,j] = as.logical(train.data[,j])
# test.data[,j] = as.logical(test.data[,j])
#}
#above doesn't fix... error is Error: New factor levels are not allowed.
#Levels allowed: 'FALSE', 'TRUE' , Levels found: '0', '1'
if (use_precomputed) {
#Note LOO for the full set of variables is memory intense and needs >16GB RAM
#it's easier to run on BioSpi save the RData file and then load here
load(paste(work_path, fn, sep = ''))
#vs = cv_vs
#note if you use another compute make sure the strings in work, output & save_path
#match else you'll get errors, so you may need to update here after loading e.g.
#work_path <- 'C:/Users/bs16044/OneDrive - University of Bristol/NewPathName/'
#output_path <- paste(work_path, 'output/', sep = '')
#save_path = paste(output_path, 'Day-',dateRange,'_Reading-',readingwanted_str,
# '_Outcome-',outcome_str, '_Imputation-',imputation_str, '_',sep = '')
} else {
names(tr_backup) = str_replace_all(names(tr_backup),"_","")
names(test_backup) = str_replace_all(names(test_backup),"_","")
# Horseshoe prior from BRMS vignette
#n <- nrow(train.data) # 100
#D <- ncol(train.data[, -1]) # 20
#p0 <- 10 # prior guess for the number of relevant variables
#tau0 <- p0/(D-p0) * 1/sqrt(n) # scale for tau (notice that stan_glm will automatically scale this by sigma)
#prior=prior(horseshoe(scale_global = tau0, scale_slab = 1), class=b),
# In practice, at least 4 chains should be
# used and 2000 iterations might be required for reliable inference.
# seed=1, chains=4, iter=2000) #try larger value than 2000 to get r_hat close to <1.05
#Train final bayesian model using all of the training data
#recast as numeric?
for (ii in c(1:dim(tr_backup)[2])) { #skip #2 age
tr_backup[,ii] <- as.numeric(tr_backup[,ii])
}
for (ii in c(1:dim(test_backup)[2])) { #skip #2 age
test_backup[,ii] <- as.numeric(test_backup[,ii])
}
#no need to recompute as it was done during bayesian CV section
#fit hs prior version - params set to get convergence - reference model
#brmfit <- brm(outcome ~ .,
# data = tr_backup, #train.data,
# family = bernoulli(),
# prior = prior(horseshoe(), class = b),
# cores = parallel::detectCores(),
# backend = 'cmdstanr',
# iter = 2000,
# control = list(adapt_delta = 0.999,
# step_size = 0.01,
# max_treedepth = 15),
# silent = 2)
#
#refmodel <- get_refmodel(brmfit)
refmodel <- get_refmodel(StatModel)
#validate_search = FALSE variable selection only once on full data set
#TRUE is default and does var selection every time (but otder presented in out put is
#for all data)
if (cv_style == 'Naive') {#NO cross validation - takes 6-7 minutes
print('naive variable selection...')
N_k = 0
vs_simple = varsel(refmodel, nterms_max = nt_max, validate_search = vs_flag, search_terms = solterms_ls)
vs = vs_simple
} else if (cv_style == 'KFold') {
#nterms_max uses too much memory if >50? takes 15? minutes
print('kfold variable selection...')
N_k = 5
kf_vs = cv_varsel(refmodel, cv_method = 'kfold', nterms_max = nt_max,
K = N_k, validate_search = vs_flag, search_terms = solterms_ls)
vs = kf_vs
} else if (cv_style == 'LOO') {
#NOTE!! cv_varsel is computationally intense can take 8+ hours on ~500 patients data
print('LOO variable selection...')
N_k = 0
cv_vs = cv_varsel(refmodel, cv_method = 'loo', nterms_max = nt_max,
validate_search = vs_flag, search_terms = solterms_ls)
vs = cv_vs
}
}