-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathworkflow.R
96 lines (85 loc) · 3.09 KB
/
workflow.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
library(rstan)
library(ggplot2)
util = new.env()
source("stan_utility.R", local = util)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
#############
## Scope out your problem
#############
df = read.csv("data.csv")
head(df)
#############
## Model!
#############
## 1. Create a new Stan file with your proposed model. We'll start with pooled.stan
#############
## Check the model with fake data! (skip if short on time)
## 1. Create a new Stan program (or use R, or some other language) and
## generate fake data according to the data-generating process described by
## your model.
## 2. Follow the rest of this workflow with your fake data!
#############
# To use Stan to generate fake data, we use the `generated quantities` block
# and the 'Fixed_param' algorithm. The following snippet will generate
# 200 fake data points.
# gen.model = stan_model("gen.stan")
# gen.data = sampling(data=d, algorithm='Fixed_param', warmup=0, iter=200)
# gendf = as.data.frame(gen.data)
# dataset.1 = gendf[1,]
#############
## Fit the model
#############
model = stan_model("../pooled.stan")
fit = sampling(model, list(log_radon=df$log_radon,
basement = df$basement,
N=nrow(df)))
#############
## Check diagnostics
#############
util$check_all_diagnostics(fit)
#############
## Check & graph fit
#############
print(fit)
sample = extract(fit)
# This line just plots our data by basement indicator
ggplot(df, aes(basement, log_radon)) + geom_count() +
# This next snippet will select every 10th posterior sample and plot the
# fit lines corresponding to that sample. It's a good way to get a sense of
# the uncertainty / distribution of the posterior
sapply(seq(1, nrow(sample$alpha), 10), function(i) {
geom_abline(intercept = sample$alpha[i],
slope = sample$basement_effect[i], alpha=0.03)
})
## This snippet makes the unpooled and hierarchical fit checking graph
## This will fail if you try to use it on the pooled model with a single alpha,
## so I'm starting with it commented out.
# estimates = apply(sample$a, 2, mean)
# sd = apply(sample$a, 2, sd)
# fit.df = data.frame(estimates, 1:85,
# estimates + 2*sd,
# estimates - 2*sd,
# sd)
# names(fit.df) = c("a", "county", "upper", "lower", "sd")
# fit.df = fit.df[order(fit.df$a),]
#
# ggplot(fit.df, aes(x=1:85, y=a)) + geom_pointrange(aes(ymin = lower, ymax = upper)) +
# ylim(-1.1, 3.5)
#############
## Check PPCs
#############
# Plot a single house's posterior predictive distribution (house 919 here)
## Commented out for now!
# ppcs = as.data.frame(sample$yppc)
# ggplot(ppcs) + geom_density(aes(V919)) + geom_vline(xintercept=df$log_radon[919])
#
# # Plot an entire replication's distribution against the actual data's distribution
# rep = 2000
# ppcs = data.frame(log_radon = df$log_radon,
# model = sample$yppc[rep,])
# library(reshape2)
# ppcs = reshape2::melt(ppcs)
# ggplot(ppcs, aes(x=value, linetype=variable)) + geom_density(alpha=0.2)
# Shinystan!
#launch_shinystan(fit)