Skip to content

Commit

Permalink
bug fix to deal with NAs for binomial
Browse files Browse the repository at this point in the history
  • Loading branch information
Nicholas Clark committed Mar 19, 2024
1 parent 1ebdff1 commit 2cbd0de
Show file tree
Hide file tree
Showing 27 changed files with 91 additions and 55 deletions.
2 changes: 1 addition & 1 deletion R/get_mvgam_priors.R
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ get_mvgam_priors = function(formula,
# in the model
ss_gam <- try(mvgam_setup(formula = formula,
family = family_to_mgcvfam(family),
data = data_train,
dat = data_train,
drop.unused.levels = FALSE,
maxit = 5),
silent = TRUE)
Expand Down
16 changes: 14 additions & 2 deletions R/gp.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,19 @@ make_gp_additions = function(gp_details, data,
term_k <- mgcv_model$smooth[[min(which(smooth_bys %in% by[x] &
smooth_terms == gp_covariates[x]))]]$df

# Check that response terms use the cbind() syntax
resp_terms <- rlang::f_lhs(formula(mgcv_model))
if(length(resp_terms) == 1){
response <- resp_terms
} else {
if(any(grepl('cbind', resp_terms))){
resp_terms <- resp_terms[-grepl('cbind', resp_terms)]
response <- resp_terms[1]
}
}

prep_gp_covariate(data = data,
response = rlang::f_lhs(formula(mgcv_model)),
response = response,
covariate = gp_covariates[x],
by = by[x],
level = level[x],
Expand Down Expand Up @@ -517,7 +528,8 @@ prep_gp_covariate = function(data,
k = 20){

# Get default gp param priors from a call to brms::get_prior()
def_gp_prior <- suppressWarnings(brms::get_prior(formula(paste0(response, ' ~ gp(', covariate,
def_gp_prior <- suppressWarnings(brms::get_prior(formula(paste0(response,
' ~ gp(', covariate,
ifelse(is.na(by), ', ',
paste0(', by = ', by, ', ')),
'k = ', k,
Expand Down
8 changes: 5 additions & 3 deletions R/interpret_mvgam.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ interpret_mvgam = function(formula, N, family){
if(is.function(family))
family <- family()

if(family$family == 'binomial'){
if(family$family %in% c('binomial', 'beta_binomial')){
# Check that response terms use the cbind() syntax
resp_terms <- as.character(terms(formula(formula))[[2]])
if(length(resp_terms) == 1){
Expand Down Expand Up @@ -49,7 +49,8 @@ interpret_mvgam = function(formula, N, family){

# Preserve offset if included
if(!is.null(attr(terms(formula(formula)), 'offset'))){
newformula <- as.formula(paste(terms.formula(formula)[[2]], '~',
newformula <- as.formula(paste(dimnames(attr(terms(formula), 'factors'))[[1]][1],
'~',
grep('offset', rownames(attr(terms.formula(formula), 'factors')),
value = TRUE),
'+',
Expand All @@ -60,7 +61,8 @@ interpret_mvgam = function(formula, N, family){
ifelse(int == 0, ' - 1', '')))

} else {
newformula <- as.formula(paste(terms.formula(formula)[[2]], '~',
newformula <- as.formula(paste(dimnames(attr(terms(formula), 'factors'))[[1]][1],
'~',
paste(paste(newfacs, collapse = '+'),
'+',
paste(refacs, collapse = '+'),
Expand Down
24 changes: 18 additions & 6 deletions R/mvgam.R
Original file line number Diff line number Diff line change
Expand Up @@ -756,8 +756,18 @@ mvgam = function(formula,
validate_family_resrictions(response = orig_y, family = family)
}

# Replace any NAs in the response for initial setup
data_train$y <- replace_nas(data_train$y)
# Fill in missing observations in data_train so the size of the dataset is correct when
# building the initial JAGS model
resp_terms <- as.character(terms(formula(formula))[[2]])
if(length(resp_terms) == 1){
out_name <- as.character(terms(formula(formula))[[2]])
} else {
if(any(grepl('cbind', resp_terms))){
resp_terms <- resp_terms[-grepl('cbind', resp_terms)]
out_name <- resp_terms[1]
}
}
data_train[[out_name]] <- replace_nas(data_train[[out_name]])

# Compute default priors
def_priors <- adapt_brms_priors(c(make_default_scales(orig_y,
Expand All @@ -783,7 +793,7 @@ mvgam = function(formula,
ss_gam <- try(mvgam_setup(formula = formula,
knots = knots,
family = family_to_mgcvfam(family),
data = data_train),
dat = data_train),
silent = TRUE)
if(inherits(ss_gam, 'try-error')){
if(grepl('missing values', ss_gam[1])){
Expand Down Expand Up @@ -827,7 +837,8 @@ mvgam = function(formula,
}
if(length(ss_gam$smooth) == 0) ss_jagam$jags.ini$lambda <- NULL

# Fill y with NAs if this is a simulation from the priors
# Fill y with NAs if this is a simulation from the priors;
# otherwise replace with the original supplied values
data_train <- check_priorsim(prior_simulation,
data_train, orig_y,
formula)
Expand Down Expand Up @@ -1338,7 +1349,8 @@ mvgam = function(formula,
n_lv = n_lv,
jags_data = ss_jagam$jags.data,
family = ifelse(family_char %in% c('binomial',
'bernoulli'),
'bernoulli',
'beta_binomial'),
'poisson',
family_char),
upper_bounds = upper_bounds)
Expand Down Expand Up @@ -1585,7 +1597,7 @@ mvgam = function(formula,
}

# Updates for Binomial and Bernoulli families
if(family_char %in% c('binomial', 'bernoulli')){
if(family_char %in% c('binomial', 'bernoulli', 'beta_binomial')){
bin_additions <- add_binomial(formula,
vectorised$model_file,
vectorised$model_data,
Expand Down
6 changes: 3 additions & 3 deletions R/mvgam_setup.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
mvgam_setup <- function(formula,
knots,
family = gaussian(),
data = list(),
dat = list(),
na.action,
drop.unused.levels = FALSE,
maxit = 5) {

if(missing(knots)){
# Initialise the GAM for a few iterations to ensure it all works without error
suppressWarnings(mgcv::gam(formula(formula),
data = data,
data = dat,
method = 'GCV.Cp',
family = family,
control = list(maxit = maxit),
Expand All @@ -22,7 +22,7 @@ mvgam_setup <- function(formula,
select = TRUE))
} else {
suppressWarnings(mgcv::gam(formula(formula),
data = data,
data = dat,
method = 'GCV.Cp',
family = family,
knots = knots,
Expand Down
2 changes: 1 addition & 1 deletion R/validations.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ validate_family_resrictions = function(response, family){

# negatives not allowed for several families
if(family$family %in% c('poisson', 'negative binomial',
'tweedie', 'binomial')){
'tweedie', 'binomial', 'beta_binomial')){
if(any(response < 0)){
stop(paste0('Values < 0 not allowed for count family responses'),
call. = FALSE)
Expand Down
5 changes: 2 additions & 3 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,8 @@ The goal of `mvgam` is to fit Bayesian Dynamic Generalized Additive Models to ti
A series of [vignettes cover data formatting, forecasting and several extended case studies of DGAMs](https://nicholasjclark.github.io/mvgam/){target="_blank"}. A number of other examples have also been compiled:

* [Ecological Forecasting with Dynamic Generalized Additive Models](https://www.youtube.com/watch?v=0zZopLlomsQ){target="_blank"}
* [mvgam case study 1: model comparison and data assimilation](https://rpubs.com/NickClark47/mvgam){target="_blank"}
* [mvgam case study 2: multivariate models](https://rpubs.com/NickClark47/mvgam2){target="_blank"}
* [mvgam case study 3: distributed lag models](https://rpubs.com/NickClark47/mvgam3){target="_blank"}
* [Distributed lags (and hierarchical distributed lags) using `mgcv` and `mvgam`](https://ecogambler.netlify.app/blog/distributed-lags-mgcv/){target="_blank"}
* [Ecological Forecasting with Dynamic GAMs; a tutorial and detailed case study](https://www.youtube.com/watch?v=RwllLjgPUmM){target="_blank"}

## Installation
Install from `GitHub` using:
Expand Down
52 changes: 26 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ been compiled:
- <a href="https://www.youtube.com/watch?v=0zZopLlomsQ"
target="_blank">Ecological Forecasting with Dynamic Generalized Additive
Models</a>
- <a href="https://rpubs.com/NickClark47/mvgam" target="_blank">mvgam case
study 1: model comparison and data assimilation</a>
- <a href="https://rpubs.com/NickClark47/mvgam2" target="_blank">mvgam
case study 2: multivariate models</a>
- <a href="https://rpubs.com/NickClark47/mvgam3" target="_blank">mvgam
case study 3: distributed lag models</a>
- <a href="https://ecogambler.netlify.app/blog/distributed-lags-mgcv/"
target="_blank">Distributed lags (and hierarchical distributed lags)
using <code>mgcv</code> and <code>mvgam</code></a>
- <a href="https://www.youtube.com/watch?v=RwllLjgPUmM"
target="_blank">Ecological Forecasting with Dynamic GAMs; a tutorial and
detailed case study</a>

## Installation

Expand Down Expand Up @@ -319,29 +319,29 @@ summary(lynx_mvgam)
#>
#>
#> GAM coefficient (beta) estimates:
#> 2.5% 50% 97.5% Rhat n_eff
#> (Intercept) 6.10 6.6000 7.0000 1.00 771
#> s(season).1 -0.59 0.0480 0.7200 1.00 973
#> s(season).2 -0.25 0.8000 1.8000 1.01 389
#> s(season).3 -0.12 1.2000 2.5000 1.01 321
#> s(season).4 -0.55 0.4100 1.3000 1.00 892
#> s(season).5 -1.20 -0.1000 0.9300 1.00 479
#> s(season).6 -1.00 -0.0089 1.1000 1.00 577
#> s(season).7 -0.72 0.3400 1.4000 1.01 659
#> s(season).8 -0.95 0.2200 1.8000 1.01 328
#> s(season).9 -1.10 -0.3100 0.6700 1.00 398
#> s(season).10 -1.30 -0.6700 -0.0054 1.01 595
#> 2.5% 50% 97.5% Rhat n_eff
#> (Intercept) 6.0000 6.600 7.0000 1.00 653
#> s(season).1 -0.6200 0.034 0.7100 1.00 788
#> s(season).2 -0.2300 0.840 1.8000 1.01 351
#> s(season).3 0.0081 1.300 2.5000 1.01 289
#> s(season).4 -0.4700 0.440 1.4000 1.00 739
#> s(season).5 -1.2000 -0.150 0.9200 1.01 485
#> s(season).6 -1.2000 -0.027 1.1000 1.00 612
#> s(season).7 -0.8000 0.380 1.5000 1.01 545
#> s(season).8 -1.0000 0.250 1.8000 1.01 282
#> s(season).9 -1.1000 -0.260 0.7100 1.01 369
#> s(season).10 -1.4000 -0.680 0.0082 1.01 459
#>
#> Approximate significance of GAM observation smooths:
#> edf Chi.sq p-value
#> s(season) 4.44 17999 0.27
#> s(season) 4.11 20324 0.22
#>
#> Latent trend AR parameter estimates:
#> 2.5% 50% 97.5% Rhat n_eff
#> ar1[1] 0.73 1.10 1.400 1.00 609
#> ar2[1] -0.82 -0.41 0.043 1.00 1562
#> ar3[1] -0.46 -0.12 0.300 1.01 510
#> sigma[1] 0.40 0.50 0.630 1.00 1051
#> ar1[1] 0.72 1.10 1.400 1.01 668
#> ar2[1] -0.83 -0.41 0.059 1.00 1648
#> ar3[1] -0.47 -0.11 0.330 1.01 466
#> sigma[1] 0.40 0.50 0.640 1.00 1103
#>
#> Stan MCMC diagnostics:
#> n_eff / iter looks reasonable for all parameters
Expand All @@ -350,7 +350,7 @@ summary(lynx_mvgam)
#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
#> E-FMI indicated no pathological behavior
#>
#> Samples were drawn using NUTS(diag_e) at Tue Mar 12 1:50:01 PM 2024.
#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 3:04:49 PM 2024.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split MCMC chains
#> (at convergence, Rhat = 1)
Expand Down Expand Up @@ -472,7 +472,7 @@ plot(lynx_mvgam, type = 'forecast', newdata = lynx_test)
<img src="man/figures/README-unnamed-chunk-21-1.png" width="60%" style="display: block; margin: auto;" />

#> Out of sample CRPS:
#> [1] 2852.181
#> [1] 2903.294

And the estimated latent trend component, again using the more flexible
`plot_mvgam_...()` option to show first derivatives of the estimated
Expand Down Expand Up @@ -628,7 +628,7 @@ summary(mod, include_betas = FALSE)
#> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
#> E-FMI indicated no pathological behavior
#>
#> Samples were drawn using NUTS(diag_e) at Tue Mar 12 1:51:32 PM 2024.
#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 3:06:31 PM 2024.
#> For each parameter, n_eff is a crude measure of effective sample size,
#> and Rhat is the potential scale reduction factor on split MCMC chains
#> (at convergence, Rhat = 1)
Expand Down
4 changes: 3 additions & 1 deletion index.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ vembedr::embed_url("https://www.youtube.com/watch?v=0zZopLlomsQ")
* `lognormal()` for non-negative real-valued data
* `Gamma()` for non-negative real-valued data
* `betar()` for proportional data on `(0,1)`
* `bernoulli()` for binary data
* `poisson()` for count data
* `nb()` for overdispersed count data
* `nmix()` for count data with imperfect detection
* `binomial()` for count data with known number of trials
* `nmix()` for count data with imperfect detection (unknown number of trials)
* `tweedie()` for overdispersed count data

Note that only `poisson()`, `nb()`, and `tweedie()` are available if using `JAGS`. All families, apart from `tweedie()`, are supported if using `Stan`. See `??mvgam_families` for more information. Below is a simple example for simulating and modelling proportional data with `Beta` observations over a set of seasonal series with independent Gaussian Process dynamic trends:
Expand Down
5 changes: 4 additions & 1 deletion index.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,12 @@ package can handle data for the following families:
- `lognormal()` for non-negative real-valued data
- `Gamma()` for non-negative real-valued data
- `betar()` for proportional data on `(0,1)`
- `bernoulli()` for binary data
- `poisson()` for count data
- `nb()` for overdispersed count data
- `nmix()` for count data with imperfect detection
- `binomial()` for count data with known number of trials
- `nmix()` for count data with imperfect detection (unknown number of
trials)
- `tweedie()` for overdispersed count data

Note that only `poisson()`, `nb()`, and `tweedie()` are available if
Expand Down
Binary file modified man/figures/README-unnamed-chunk-13-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-14-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-15-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-16-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-17-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-18-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-19-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-20-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-21-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-22-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-23-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified man/figures/README-unnamed-chunk-24-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified src/mvgam.dll
Binary file not shown.
Binary file modified tests/testthat/Rplots.pdf
Binary file not shown.
14 changes: 11 additions & 3 deletions tests/testthat/test-binomial.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ dat <- rbind(data.frame(y = rbinom(n = 50, size = trials, prob = detprob1),
dplyr::mutate(series = as.factor(series)) %>%
dplyr::arrange(time, series)

# Throw in some NAs
dat$y[c(1,5,9)] <- NA

# Training and testing splits
dat_train <- dat %>%
dplyr::filter(time <= 40)
Expand All @@ -32,7 +35,8 @@ test_that("cbind() syntax required for binomial()", {
'Binomial family requires cbind() syntax in the formula left-hand side',
fixed = TRUE)

mod <- mvgam(cbind(y, ntrials) ~ series + s(x, by = series),
mod <- mvgam(cbind(y, ntrials) ~ s(series, bs = 're') +
gp(x, by = series, c = 5/4, k = 5),
family = binomial(),
data = dat_train,
run_model = FALSE)
Expand Down Expand Up @@ -78,6 +82,9 @@ dat <- rbind(data.frame(y = rbinom(n = 50, size = 1, prob = detprob1),
dplyr::mutate(series = as.factor(series)) %>%
dplyr::arrange(time, series)

# Throw in some NAs
dat$y[c(1,5,9)] <- NA

# Training and testing splits
dat_train <- dat %>%
dplyr::filter(time <= 40)
Expand All @@ -92,7 +99,8 @@ test_that("bernoulli() behaves appropriately", {
'y values must be 0 <= y <= 1',
fixed = TRUE)

mod <- mvgam(y ~ series + s(x, by = series),
mod <- mvgam(y ~ s(series, bs = 're') +
gp(x, by = series, c = 5/4, k = 5),
family = bernoulli(),
data = dat_train,
run_model = FALSE)
Expand All @@ -102,7 +110,7 @@ test_that("bernoulli() behaves appropriately", {

# Also with a trend_formula
mod <- mvgam(y ~ series,
trend_formula = ~ s(x, by = trend),
trend_formula = ~ gp(x, by = trend, c = 5/4),
trend_model = AR(),
family = bernoulli(),
data = dat_train,
Expand Down
4 changes: 0 additions & 4 deletions text.txt

This file was deleted.

4 changes: 3 additions & 1 deletion vignettes/mvgam_overview.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@ Here $\alpha$ are the unknown intercepts, the $\boldsymbol{s}$'s are unknown smo
|LogNormal (identity link) | `lognormal()` | Positive real values in $[0, \infty)$ | $\sigma$ |
|Gamma (log link) | `Gamma()` | Positive real values in $[0, \infty)$ | $\alpha$ |
|Beta (logit link) | `betar()` | Real values (proportional) in $[0,1]$ | $\phi$ |
|Bernoulli (logit link) | `bernoulli()` | Binary data in ${0,1}$ | - |
|Poisson (log link) | `poisson()` | Non-negative integers in $(0,1,2,...)$ | - |
|Negative Binomial2 (log link)| `nb()` | Non-negative integers in $(0,1,2,...)$ | $\phi$ |
|Binomial (logit link) | `binomial()` | Non-negative integers in $(0,1,2,...)$ | - |
|Poisson Binomial N-mixture (log link)| `nmix()` | Non-negative integers in $(0,1,2,...)$ | - |

For all supported observation families, any extra parameters that need to be estimated (i.e. the $\sigma$ in a Gaussian model or the $\phi$ in a Negative Binomial model) are estimated independently for each series. Note that default link functions cannot currently be changed in `mvgam`.
For all supported observation families, any extra parameters that need to be estimated (i.e. the $\sigma$ in a Gaussian model or the $\phi$ in a Negative Binomial model) are by default estimated independently for each series. However, users can opt to force all series to share extra observation parameters using `share_obs_params = TRUE` in `mvgam()`. Note that default link functions cannot currently be changed.

## Supported temporal dynamic processes
The dynamic processes can take a wide variety of forms, some of which can be multivariate to allow the different time series to interact or be correlated. When using the `mvgam()` function, the user chooses between different process models with the `trend_model` argument. Available process models are described in detail below.
Expand Down

0 comments on commit 2cbd0de

Please sign in to comment.