bug fix to deal with NAs for binomial

nicholasjclark · Mar 19, 2024 · 2cbd0de · 2cbd0de
1 parent 1ebdff1
commit 2cbd0de
Show file tree

Hide file tree

Showing 27 changed files with 91 additions and 55 deletions.
diff --git a/R/get_mvgam_priors.R b/R/get_mvgam_priors.R
@@ -422,7 +422,7 @@ get_mvgam_priors = function(formula,
     # in the model
     ss_gam <- try(mvgam_setup(formula = formula,
                               family = family_to_mgcvfam(family),
-                              data = data_train,
+                              dat = data_train,
                               drop.unused.levels = FALSE,
                               maxit = 5),
                   silent = TRUE)

diff --git a/R/gp.R b/R/gp.R
@@ -80,8 +80,19 @@ make_gp_additions = function(gp_details, data,
     term_k <- mgcv_model$smooth[[min(which(smooth_bys %in% by[x] &
                                smooth_terms == gp_covariates[x]))]]$df
 
+    # Check that response terms use the cbind() syntax
+    resp_terms <- rlang::f_lhs(formula(mgcv_model))
+    if(length(resp_terms) == 1){
+      response <- resp_terms
+    } else {
+      if(any(grepl('cbind', resp_terms))){
+        resp_terms <- resp_terms[-grepl('cbind', resp_terms)]
+        response <- resp_terms[1]
+      }
+    }
+
     prep_gp_covariate(data = data,
-                      response = rlang::f_lhs(formula(mgcv_model)),
+                      response = response,
                       covariate = gp_covariates[x],
                       by = by[x],
                       level = level[x],
@@ -517,7 +528,8 @@ prep_gp_covariate = function(data,
                              k = 20){
 
   # Get default gp param priors from a call to brms::get_prior()
-  def_gp_prior <- suppressWarnings(brms::get_prior(formula(paste0(response, ' ~ gp(', covariate,
+  def_gp_prior <- suppressWarnings(brms::get_prior(formula(paste0(response,
+                                                                  ' ~ gp(', covariate,
                                                  ifelse(is.na(by), ', ',
                                                         paste0(', by = ', by, ', ')),
                                                  'k = ', k,

diff --git a/R/interpret_mvgam.R b/R/interpret_mvgam.R
@@ -20,7 +20,7 @@ interpret_mvgam = function(formula, N, family){
     if(is.function(family))
       family <- family()
 
-    if(family$family == 'binomial'){
+    if(family$family %in% c('binomial', 'beta_binomial')){
       # Check that response terms use the cbind() syntax
       resp_terms <- as.character(terms(formula(formula))[[2]])
       if(length(resp_terms) == 1){
@@ -49,7 +49,8 @@ interpret_mvgam = function(formula, N, family){
 
     # Preserve offset if included
     if(!is.null(attr(terms(formula(formula)), 'offset'))){
-      newformula <- as.formula(paste(terms.formula(formula)[[2]], '~',
+      newformula <- as.formula(paste(dimnames(attr(terms(formula), 'factors'))[[1]][1],
+                                     '~',
                                      grep('offset', rownames(attr(terms.formula(formula), 'factors')),
                                           value = TRUE),
                                      '+',
@@ -60,7 +61,8 @@ interpret_mvgam = function(formula, N, family){
                                      ifelse(int == 0, ' - 1', '')))
 
     } else {
-      newformula <- as.formula(paste(terms.formula(formula)[[2]], '~',
+      newformula <- as.formula(paste(dimnames(attr(terms(formula), 'factors'))[[1]][1],
+                                     '~',
                                      paste(paste(newfacs, collapse = '+'),
                                            '+',
                                            paste(refacs, collapse = '+'),

diff --git a/R/mvgam.R b/R/mvgam.R
@@ -756,8 +756,18 @@ mvgam = function(formula,
     validate_family_resrictions(response = orig_y, family = family)
   }
 
-  # Replace any NAs in the response for initial setup
-  data_train$y <- replace_nas(data_train$y)
+  # Fill in missing observations in data_train so the size of the dataset is correct when
+  # building the initial JAGS model
+  resp_terms <- as.character(terms(formula(formula))[[2]])
+  if(length(resp_terms) == 1){
+    out_name <- as.character(terms(formula(formula))[[2]])
+  } else {
+    if(any(grepl('cbind', resp_terms))){
+      resp_terms <- resp_terms[-grepl('cbind', resp_terms)]
+      out_name <- resp_terms[1]
+    }
+  }
+  data_train[[out_name]] <- replace_nas(data_train[[out_name]])
 
   # Compute default priors
   def_priors <- adapt_brms_priors(c(make_default_scales(orig_y,
@@ -783,7 +793,7 @@ mvgam = function(formula,
   ss_gam <- try(mvgam_setup(formula = formula,
                             knots = knots,
                             family = family_to_mgcvfam(family),
-                            data = data_train),
+                            dat = data_train),
                 silent = TRUE)
   if(inherits(ss_gam, 'try-error')){
     if(grepl('missing values', ss_gam[1])){
@@ -827,7 +837,8 @@ mvgam = function(formula,
   }
   if(length(ss_gam$smooth) == 0) ss_jagam$jags.ini$lambda <- NULL
 
-  # Fill y with NAs if this is a simulation from the priors
+  # Fill y with NAs if this is a simulation from the priors;
+  # otherwise replace with the original supplied values
   data_train <- check_priorsim(prior_simulation,
                                data_train, orig_y,
                                formula)
@@ -1338,7 +1349,8 @@ mvgam = function(formula,
                                   n_lv = n_lv,
                                   jags_data = ss_jagam$jags.data,
                                   family = ifelse(family_char %in% c('binomial',
-                                                                     'bernoulli'),
+                                                                     'bernoulli',
+                                                                     'beta_binomial'),
                                                   'poisson',
                                                   family_char),
                                   upper_bounds = upper_bounds)
@@ -1585,7 +1597,7 @@ mvgam = function(formula,
     }
 
     # Updates for Binomial and Bernoulli families
-    if(family_char %in% c('binomial', 'bernoulli')){
+    if(family_char %in% c('binomial', 'bernoulli', 'beta_binomial')){
       bin_additions <- add_binomial(formula,
                                     vectorised$model_file,
                                     vectorised$model_data,

diff --git a/R/mvgam_setup.R b/R/mvgam_setup.R
@@ -5,15 +5,15 @@
 mvgam_setup <- function(formula,
                         knots,
                         family = gaussian(),
-                        data = list(),
+                        dat = list(),
                         na.action,
                         drop.unused.levels = FALSE,
                         maxit = 5) {
 
   if(missing(knots)){
     # Initialise the GAM for a few iterations to ensure it all works without error
     suppressWarnings(mgcv::gam(formula(formula),
-                               data = data,
+                               data = dat,
                                method = 'GCV.Cp',
                                family = family,
                                control = list(maxit = maxit),
@@ -22,7 +22,7 @@ mvgam_setup <- function(formula,
                                select = TRUE))
   } else {
     suppressWarnings(mgcv::gam(formula(formula),
-                               data = data,
+                               data = dat,
                                method = 'GCV.Cp',
                                family = family,
                                knots = knots,

diff --git a/R/validations.R b/R/validations.R
@@ -133,7 +133,7 @@ validate_family_resrictions = function(response, family){
 
   # negatives not allowed for several families
   if(family$family %in%  c('poisson', 'negative binomial',
-                           'tweedie', 'binomial')){
+                           'tweedie', 'binomial', 'beta_binomial')){
     if(any(response < 0)){
       stop(paste0('Values < 0 not allowed for count family responses'),
            call. = FALSE)

diff --git a/README.Rmd b/README.Rmd
@@ -29,9 +29,8 @@ The goal of `mvgam` is to fit Bayesian Dynamic Generalized Additive Models to ti
 A series of [vignettes cover data formatting, forecasting and several extended case studies of DGAMs](https://nicholasjclark.github.io/mvgam/){target="_blank"}. A number of other examples have also been compiled:
 
 * [Ecological Forecasting with Dynamic Generalized Additive Models](https://www.youtube.com/watch?v=0zZopLlomsQ){target="_blank"}
-* [mvgam case study 1: model comparison and data assimilation](https://rpubs.com/NickClark47/mvgam){target="_blank"}
-* [mvgam case study 2: multivariate models](https://rpubs.com/NickClark47/mvgam2){target="_blank"}
-* [mvgam case study 3: distributed lag models](https://rpubs.com/NickClark47/mvgam3){target="_blank"}
+* [Distributed lags (and hierarchical distributed lags) using `mgcv` and `mvgam`](https://ecogambler.netlify.app/blog/distributed-lags-mgcv/){target="_blank"}
+* [Ecological Forecasting with Dynamic GAMs; a tutorial and detailed case study](https://www.youtube.com/watch?v=RwllLjgPUmM){target="_blank"}
 
 ## Installation
 Install from `GitHub` using:

diff --git a/README.md b/README.md
@@ -29,12 +29,12 @@ been compiled:
 - <a href="https://www.youtube.com/watch?v=0zZopLlomsQ"
   target="_blank">Ecological Forecasting with Dynamic Generalized Additive
   Models</a>
-- <a href="https://rpubs.com/NickClark47/mvgam" target="_blank">mvgam case
-  study 1: model comparison and data assimilation</a>
-- <a href="https://rpubs.com/NickClark47/mvgam2" target="_blank">mvgam
-  case study 2: multivariate models</a>
-- <a href="https://rpubs.com/NickClark47/mvgam3" target="_blank">mvgam
-  case study 3: distributed lag models</a>
+- <a href="https://ecogambler.netlify.app/blog/distributed-lags-mgcv/"
+  target="_blank">Distributed lags (and hierarchical distributed lags)
+  using <code>mgcv</code> and <code>mvgam</code></a>
+- <a href="https://www.youtube.com/watch?v=RwllLjgPUmM"
+  target="_blank">Ecological Forecasting with Dynamic GAMs; a tutorial and
+  detailed case study</a>
 
 ## Installation
 
@@ -319,29 +319,29 @@ summary(lynx_mvgam)
 #> 
 #> 
 #> GAM coefficient (beta) estimates:
-#>               2.5%     50%   97.5% Rhat n_eff
-#> (Intercept)   6.10  6.6000  7.0000 1.00   771
-#> s(season).1  -0.59  0.0480  0.7200 1.00   973
-#> s(season).2  -0.25  0.8000  1.8000 1.01   389
-#> s(season).3  -0.12  1.2000  2.5000 1.01   321
-#> s(season).4  -0.55  0.4100  1.3000 1.00   892
-#> s(season).5  -1.20 -0.1000  0.9300 1.00   479
-#> s(season).6  -1.00 -0.0089  1.1000 1.00   577
-#> s(season).7  -0.72  0.3400  1.4000 1.01   659
-#> s(season).8  -0.95  0.2200  1.8000 1.01   328
-#> s(season).9  -1.10 -0.3100  0.6700 1.00   398
-#> s(season).10 -1.30 -0.6700 -0.0054 1.01   595
+#>                 2.5%    50%  97.5% Rhat n_eff
+#> (Intercept)   6.0000  6.600 7.0000 1.00   653
+#> s(season).1  -0.6200  0.034 0.7100 1.00   788
+#> s(season).2  -0.2300  0.840 1.8000 1.01   351
+#> s(season).3   0.0081  1.300 2.5000 1.01   289
+#> s(season).4  -0.4700  0.440 1.4000 1.00   739
+#> s(season).5  -1.2000 -0.150 0.9200 1.01   485
+#> s(season).6  -1.2000 -0.027 1.1000 1.00   612
+#> s(season).7  -0.8000  0.380 1.5000 1.01   545
+#> s(season).8  -1.0000  0.250 1.8000 1.01   282
+#> s(season).9  -1.1000 -0.260 0.7100 1.01   369
+#> s(season).10 -1.4000 -0.680 0.0082 1.01   459
 #> 
 #> Approximate significance of GAM observation smooths:
 #>            edf Chi.sq p-value
-#> s(season) 4.44  17999    0.27
+#> s(season) 4.11  20324    0.22
 #> 
 #> Latent trend AR parameter estimates:
 #>           2.5%   50% 97.5% Rhat n_eff
-#> ar1[1]    0.73  1.10 1.400 1.00   609
-#> ar2[1]   -0.82 -0.41 0.043 1.00  1562
-#> ar3[1]   -0.46 -0.12 0.300 1.01   510
-#> sigma[1]  0.40  0.50 0.630 1.00  1051
+#> ar1[1]    0.72  1.10 1.400 1.01   668
+#> ar2[1]   -0.83 -0.41 0.059 1.00  1648
+#> ar3[1]   -0.47 -0.11 0.330 1.01   466
+#> sigma[1]  0.40  0.50 0.640 1.00  1103
 #> 
 #> Stan MCMC diagnostics:
 #> n_eff / iter looks reasonable for all parameters
@@ -350,7 +350,7 @@ summary(lynx_mvgam)
 #> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
 #> E-FMI indicated no pathological behavior
 #> 
-#> Samples were drawn using NUTS(diag_e) at Tue Mar 12 1:50:01 PM 2024.
+#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 3:04:49 PM 2024.
 #> For each parameter, n_eff is a crude measure of effective sample size,
 #> and Rhat is the potential scale reduction factor on split MCMC chains
 #> (at convergence, Rhat = 1)
@@ -472,7 +472,7 @@ plot(lynx_mvgam, type = 'forecast', newdata = lynx_test)
 <img src="man/figures/README-unnamed-chunk-21-1.png" width="60%" style="display: block; margin: auto;" />
 
     #> Out of sample CRPS:
-    #> [1] 2852.181
+    #> [1] 2903.294
 
 And the estimated latent trend component, again using the more flexible
 `plot_mvgam_...()` option to show first derivatives of the estimated
@@ -628,7 +628,7 @@ summary(mod, include_betas = FALSE)
 #> 0 of 2000 iterations saturated the maximum tree depth of 12 (0%)
 #> E-FMI indicated no pathological behavior
 #> 
-#> Samples were drawn using NUTS(diag_e) at Tue Mar 12 1:51:32 PM 2024.
+#> Samples were drawn using NUTS(diag_e) at Tue Mar 19 3:06:31 PM 2024.
 #> For each parameter, n_eff is a crude measure of effective sample size,
 #> and Rhat is the potential scale reduction factor on split MCMC chains
 #> (at convergence, Rhat = 1)

diff --git a/index.Rmd b/index.Rmd
@@ -32,9 +32,11 @@ vembedr::embed_url("https://www.youtube.com/watch?v=0zZopLlomsQ")
 * `lognormal()` for non-negative real-valued data
 * `Gamma()` for non-negative real-valued data
 * `betar()` for proportional data on `(0,1)`
+* `bernoulli()` for binary data
 * `poisson()` for count data
 * `nb()` for overdispersed count data
-* `nmix()` for count data with imperfect detection
+* `binomial()` for count data with known number of trials
+* `nmix()` for count data with imperfect detection (unknown number of trials)
 * `tweedie()` for overdispersed count data
 
 Note that only `poisson()`, `nb()`, and `tweedie()` are available if using `JAGS`. All families, apart from `tweedie()`, are supported if using `Stan`. See `??mvgam_families` for more information. Below is a simple example for simulating and modelling proportional data with `Beta` observations over a set of seasonal series with independent Gaussian Process dynamic trends:

diff --git a/index.md b/index.md
@@ -72,9 +72,12 @@ package can handle data for the following families:
 - `lognormal()` for non-negative real-valued data
 - `Gamma()` for non-negative real-valued data
 - `betar()` for proportional data on `(0,1)`
+- `bernoulli()` for binary data
 - `poisson()` for count data
 - `nb()` for overdispersed count data
-- `nmix()` for count data with imperfect detection
+- `binomial()` for count data with known number of trials
+- `nmix()` for count data with imperfect detection (unknown number of
+  trials)
 - `tweedie()` for overdispersed count data
 
 Note that only `poisson()`, `nb()`, and `tweedie()` are available if

diff --git a/man/figures/README-unnamed-chunk-13-1.png b/man/figures/README-unnamed-chunk-13-1.png
diff --git a/man/figures/README-unnamed-chunk-14-1.png b/man/figures/README-unnamed-chunk-14-1.png
diff --git a/man/figures/README-unnamed-chunk-15-1.png b/man/figures/README-unnamed-chunk-15-1.png
diff --git a/man/figures/README-unnamed-chunk-16-1.png b/man/figures/README-unnamed-chunk-16-1.png
diff --git a/man/figures/README-unnamed-chunk-17-1.png b/man/figures/README-unnamed-chunk-17-1.png
diff --git a/man/figures/README-unnamed-chunk-18-1.png b/man/figures/README-unnamed-chunk-18-1.png
diff --git a/man/figures/README-unnamed-chunk-19-1.png b/man/figures/README-unnamed-chunk-19-1.png
diff --git a/man/figures/README-unnamed-chunk-20-1.png b/man/figures/README-unnamed-chunk-20-1.png
diff --git a/man/figures/README-unnamed-chunk-21-1.png b/man/figures/README-unnamed-chunk-21-1.png
diff --git a/man/figures/README-unnamed-chunk-22-1.png b/man/figures/README-unnamed-chunk-22-1.png
diff --git a/man/figures/README-unnamed-chunk-23-1.png b/man/figures/README-unnamed-chunk-23-1.png
diff --git a/man/figures/README-unnamed-chunk-24-1.png b/man/figures/README-unnamed-chunk-24-1.png
diff --git a/src/mvgam.dll b/src/mvgam.dll
diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf
diff --git a/tests/testthat/test-binomial.R b/tests/testthat/test-binomial.R
@@ -18,6 +18,9 @@ dat <- rbind(data.frame(y = rbinom(n = 50, size = trials, prob = detprob1),
   dplyr::mutate(series = as.factor(series)) %>%
   dplyr::arrange(time, series)
 
+# Throw in some NAs
+dat$y[c(1,5,9)] <- NA
+
 # Training and testing splits
 dat_train <- dat %>%
   dplyr::filter(time <= 40)
@@ -32,7 +35,8 @@ test_that("cbind() syntax required for binomial()", {
                'Binomial family requires cbind() syntax in the formula left-hand side',
                fixed = TRUE)
 
-  mod <- mvgam(cbind(y, ntrials) ~ series + s(x, by = series),
+  mod <- mvgam(cbind(y, ntrials) ~ s(series, bs = 're') +
+                 gp(x, by = series, c = 5/4, k = 5),
                family = binomial(),
                data = dat_train,
                run_model = FALSE)
@@ -78,6 +82,9 @@ dat <- rbind(data.frame(y = rbinom(n = 50, size = 1, prob = detprob1),
   dplyr::mutate(series = as.factor(series)) %>%
   dplyr::arrange(time, series)
 
+# Throw in some NAs
+dat$y[c(1,5,9)] <- NA
+
 # Training and testing splits
 dat_train <- dat %>%
   dplyr::filter(time <= 40)
@@ -92,7 +99,8 @@ test_that("bernoulli() behaves appropriately", {
                'y values must be 0 <= y <= 1',
                fixed = TRUE)
 
-  mod <- mvgam(y ~ series + s(x, by = series),
+  mod <- mvgam(y ~ s(series, bs = 're') +
+                 gp(x, by = series, c = 5/4, k = 5),
                family = bernoulli(),
                data = dat_train,
                run_model = FALSE)
@@ -102,7 +110,7 @@ test_that("bernoulli() behaves appropriately", {
 
   # Also with a trend_formula
   mod <- mvgam(y ~ series,
-               trend_formula = ~ s(x, by = trend),
+               trend_formula = ~ gp(x, by = trend, c = 5/4),
                trend_model = AR(),
                family = bernoulli(),
                data = dat_train,

diff --git a/text.txt b/text.txt
diff --git a/vignettes/mvgam_overview.Rmd b/vignettes/mvgam_overview.Rmd
@@ -44,11 +44,13 @@ Here $\alpha$ are the unknown intercepts, the $\boldsymbol{s}$'s are unknown smo
 |LogNormal (identity link)        | `lognormal()`   | Positive real values in $[0, \infty)$             | $\sigma$             |
 |Gamma (log link)             | `Gamma()`       | Positive real values in $[0, \infty)$             | $\alpha$             |
 |Beta (logit link)              | `betar()`       | Real values (proportional) in $[0,1]$             | $\phi$               |
+|Bernoulli (logit link)         | `bernoulli()`   | Binary data in ${0,1}$                          | -                    |
 |Poisson (log link)           | `poisson()`     | Non-negative integers in $(0,1,2,...)$            | -                    |
 |Negative Binomial2 (log link)| `nb()`          | Non-negative integers in $(0,1,2,...)$            | $\phi$               |
+|Binomial (logit link)           | `binomial()` | Non-negative integers in $(0,1,2,...)$            | -                    |
 |Poisson Binomial N-mixture (log link)| `nmix()`  | Non-negative integers in $(0,1,2,...)$            | -               |
 
-For all supported observation families, any extra parameters that need to be estimated (i.e. the $\sigma$ in a Gaussian model or the $\phi$ in a Negative Binomial model) are estimated independently for each series. Note that default link functions cannot currently be changed in `mvgam`.
+For all supported observation families, any extra parameters that need to be estimated (i.e. the $\sigma$ in a Gaussian model or the $\phi$ in a Negative Binomial model) are by default estimated independently for each series. However, users can opt to force all series to share extra observation parameters using `share_obs_params = TRUE` in `mvgam()`. Note that default link functions cannot currently be changed.
 
 ## Supported temporal dynamic processes
 The dynamic processes can take a wide variety of forms, some of which can be multivariate to allow the different time series to interact or be correlated. When using the `mvgam()` function, the user chooses between different process models with the `trend_model` argument. Available process models are described in detail below.