diff --git a/R/add_stan_data.R b/R/add_stan_data.R new file mode 100644 index 00000000..9afe3e8f --- /dev/null +++ b/R/add_stan_data.R @@ -0,0 +1,308 @@ +#' Add remaining data, model and parameter blocks to a Stan model +#' +#' +#' @export +#' @param jags_file Prepared JAGS mvgam model file +#' @param stan_file Incomplete Stan model file to be edited +#' @param jags_data Prepared mvgam data for JAGS modelling +#' @return A `list` containing the updated Stan model and model data +add_stan_data = function(jags_file, stan_file, jags_data, family = 'poisson'){ + + #### Modify the Stan file #### + # Update lines associated with particular family + if(family == 'poisson'){ + + } + + if(family == 'nb'){ + stan_file[grep('// raw basis', stan_file) + 2] <- + '\n// negative binomial overdispersion\nvector[n_series] r_inv;\n' + + stan_file[grep('// priors for smoothing', stan_file) + 2] <- + '\n// priors for overdispersion parameters\nr_inv ~ normal(0, 10);\n' + + to_negbin <- gsub('poisson_log', 'neg_binomial_2', + stan_file[grep('y[i, s] ~ poisson', stan_file, fixed = T)]) + stan_file[grep('y[i, s] ~ poisson', stan_file, fixed = T)] <- + gsub(');', ', inv(r_inv[s]));', to_negbin) + + add_exp_open <- gsub('\\(eta', '(exp(eta', + stan_file[grep('y[i, s] ~ neg_binomial', stan_file, fixed = T)]) + add_exp_cl <- gsub('],', ']),', + add_exp_open) + stan_file[grep('y[i, s] ~ neg_binomial', stan_file, fixed = T)] <- + add_exp_cl + + stan_file[grep('// posterior predictions', stan_file, fixed = T) - 1] <- + paste0('matrix[n, n_series] r_vec;\n', + 'vector[n_series] r;\n', + 'r = inv(r_inv);\n', + 'for (s in 1:n_series) {\n', + 'r_vec[1:n,s] = rep_vector(inv(r_inv[s]), n);\n}\n') + + to_negbin <- gsub('poisson_log_rng', 'neg_binomial_2_rng', + stan_file[grep('ypred = poisson_log_rng', stan_file, fixed = T)]) + stan_file[grep('ypred = poisson_log_rng', stan_file, fixed = T)] <- + gsub(');', ', to_vector(r_vec));', to_negbin) + + add_exp_open <- gsub('\\(eta', '(exp(eta', + stan_file[grep('ypred = neg_binomial', stan_file, fixed = T)]) + + if(any(grepl('to_vector(trend)', stan_file, fixed = T))){ + add_exp_cl <- gsub('to_vector(trend)', 'to_vector(trend))', + add_exp_open, fixed = T) + } else { + add_exp_cl <- gsub('eta)', 'eta,),', + add_exp_open) + } + + stan_file[grep('ypred = neg_binomial', stan_file, fixed = T)] <- + add_exp_cl + + stan_file <- readLines(textConnection(stan_file), n = -1) + } + + # Get dimensions and numbers of smooth terms + snames <- names(jags_data)[grep('S.*', names(jags_data))] + smooth_dims <- matrix(NA, ncol = 2, nrow = length(snames)) + for(i in 1:length(snames)){ + smooth_dims[i,] <- dim(jags_data[[snames[i]]]) + } + + # Insert the data block for the model + smooth_penalty_data <- vector() + for(i in 1:length(snames)){ + smooth_penalty_data[i] <- paste0('matrix[', smooth_dims[i, 1], + ',', + smooth_dims[i, 2], '] ', + snames[i], + '; // mgcv smooth penalty matrix ', snames[i]) + } + + # Search for any non-contiguous indices that sometimes are used by mgcv + if(any(grep('in c\\(', jags_file))){ + add_idxs <- TRUE + seq_character = function(x){ + all_nums <- as.numeric(unlist(strsplit(x, ':'))) + if(length(all_nums) > 1){ + out <- seq(all_nums[1], all_nums[2]) + } else { + out <- all_nums + } + out + } + + idx_locations <- grep('in c\\(', jags_file) + idx_vals <- list() + idx_data <- vector() + for(i in 1:length(idx_locations)){ + list_vals <- unlist(strsplit(gsub('^.*c\\(*|\\s*).*$', '', jags_file[idx_locations[i]]), ',')) + idx_vals[[i]] <- unlist(lapply(list_vals, seq_character)) + idx_data[i] <- paste0('int idx', i, '[', length(idx_vals[[i]]), ']; // discontiguous index values') + jags_file[idx_locations][i] <- sub("in.*\\)\\)", paste0("in idx", i, ')'), jags_file[idx_locations][i]) + } + + # Update the Stan data block + stan_file[grep('##insert data', + stan_file)] <- paste0('//Stan code generated by package mvgam\n', + 'data {', + '\n', + paste0(idx_data, collapse = '\n'), '\n', + 'int total_obs; // total number of observations\n', + 'int n; // number of timepoints per series\n', + 'int n_sp; // number of smoothing parameters\n', + 'int n_series; // number of series\n', + 'int num_basis; // total number of basis coefficients\n', + 'vector[num_basis] zero; // priro locations for basis coefficients\n', + 'matrix[num_basis, total_obs] X; // transposed mgcv GAM design matrix\n', + 'int ytimes[n, n_series]; // time-ordered matrix (which col in X belongs to each [time, series] observation?)\n', + paste0(smooth_penalty_data, collapse = '\n'), '\n', + 'int y_observed[n, n_series]; // indices of missing vs observed\n', + 'int y[n, n_series]; // time-ordered observations, with -1 indicating missing\n', + '}\n') + } else { + add_idxs <- FALSE + stan_file[grep('##insert data', + stan_file)] <- paste0('//Stan code generated by package mvgam\n', + 'data {', + '\n', + 'int total_obs; // total number of observations\n', + 'int n; // number of timepoints per series\n', + 'int n_sp; // number of smoothing parameters\n', + 'int n_series; // number of series\n', + 'int num_basis; // total number of basis coefficients\n', + 'vector[num_basis] zero; // prior locations for basis coefficients\n', + 'matrix[num_basis, total_obs] X; // transposed mgcv GAM design matrix\n', + 'int ytimes[n, n_series]; // time-ordered matrix (which col in X belongs to each [time, series] observation?)\n', + paste0(smooth_penalty_data, collapse = '\n'), '\n', + 'int y_observed[n, n_series]; // indices of missing vs observed\n', + 'int y[n, n_series]; // time-ordered observations, with -1 indicating missing\n', + '}\n') + } + stan_file <- readLines(textConnection(stan_file), n = -1) + + # Modify the model block to include each smooth term + smooths_start <- grep('## GAM-specific priors', jags_file) + 1 + smooths_end <- grep('## smoothing parameter priors...', jags_file) - 1 + jags_smooth_text <- jags_file[smooths_start:smooths_end] + jags_smooth_text <- gsub('##', '//', jags_smooth_text) + jags_smooth_text <- gsub('dexp', 'exponential', jags_smooth_text) + + K_starts <- grep('K.* <- ', jags_smooth_text) + for(i in 1:length(K_starts)){ + jags_smooth_text[K_starts[i]+1] <- gsub('\\bb\\b', 'b_raw', + gsub('dmnorm', 'multi_normal_prec', + paste0(gsub('K.*', + trimws(gsub('K.* <- ', '', + jags_smooth_text[K_starts[i]])), + jags_smooth_text[K_starts[i]+1]), ')'))) + } + jags_smooth_text <- jags_smooth_text[-K_starts] + if(any(grep('b\\[i\\] <- b_raw', jags_smooth_text))){ + jags_smooth_text <- jags_smooth_text[-grep('b\\[i\\] <- b_raw', jags_smooth_text)] + } + jags_smooth_text <- gsub('dnorm', 'normal', jags_smooth_text) + jags_smooth_text <- gsub(' ', ' ', jags_smooth_text) + jags_smooth_text[-grep('//|\\}|\\{', jags_smooth_text)] <- + paste0(jags_smooth_text[-grep('//|\\}|\\{', jags_smooth_text)], ';') + jags_smooth_text <- gsub(') }', '); }', jags_smooth_text) + jags_smooth_text <- gsub('}', '}\n', jags_smooth_text) + jags_smooth_text[(grep('//', + jags_smooth_text) - 1)[-1]] <- + paste0(jags_smooth_text[(grep('//', + jags_smooth_text) - 1)[-1]], '\n') + stan_file[grep('##insert smooths', + stan_file)] <- paste0(jags_smooth_text, collapse = '\n') + stan_file <- readLines(textConnection(stan_file), n = -1) + + # Deal with any random effect priors + if(any(grep('b_raw\\[i\\] ~', stan_file))){ + b_raw_string <- paste0(stan_file[grep('b_raw\\[i\\] ~', stan_file)-1], collapse = ',') + n_b_raw <- max(as.numeric(unlist(regmatches(b_raw_string, + gregexpr("[[:digit:]]+", + b_raw_string))))) + + n_sigma_raw <- max(as.numeric(unlist(regmatches(grep('sigma_raw', stan_file, value = T), + gregexpr("[[:digit:]]+", + grep('sigma_raw', + stan_file, value = T)))))) + + + stan_file <- stan_file[-grep('mu_raw.* ~ ', stan_file)] + stan_file <- stan_file[-grep('<- mu_raw', stan_file)] + stan_file <- stan_file[-grep('sigma_raw.* ~ ', stan_file)] + stan_file[grep('model \\{', stan_file)] <- + paste0('model {\n// prior for random effect population variances\nsigma_raw ~ exponential(0.5);\n\n', + '// prior for random effect population means\nmu_raw ~ normal(0, 1);\n') + + stan_file[grep('parameters \\{', stan_file)[1] + 2] <- + paste0(stan_file[grep('parameters \\{', stan_file)[1] + 2], + '\n', + '\n// random effect variances\n', + paste0('vector[',n_sigma_raw,'] sigma_raw', ';\n', collapse = ''), + '\n', + paste0('vector[',n_sigma_raw,'] mu_raw', ';\n', collapse = '')) + + b_raw_text <- vector() + b_raw_indices <- grep('b_raw\\[i\\] ~', stan_file) + for(i in 1:length(b_raw_indices)){ + + b_raw_text[i] <- paste0('for (i in ', as.numeric(sub('.*(?=.$)', '', + sub("\\:.*", "", + stan_file[b_raw_indices[i] - 1]), perl=T)), + ':', as.numeric(substr(sub(".*\\:", "", + stan_file[b_raw_indices[i]-1]), + 1, 1)),') {\nb[i] <- mu_raw[', i, '] + b_raw[i] * sigma_raw[',i, + '];\n}') + } + + # If parametric coefficients are included, they'll come before random effects + min_re_betas <- as.numeric(sub('.*(?=.$)', '', + sub("\\:.*", "", + stan_file[b_raw_indices[1]-1]), perl=T)) + if(min_re_betas > 1){ + b_raw_text <- c(paste0('\nfor (i in 1:', + min_re_betas - 1, ') {\nb[i] <- b_raw[i];\n}'), + b_raw_text, + paste0('\nfor (i in ', n_b_raw+1,':num_basis) {\nb[i] <- b_raw[i];\n}\n')) + } else { + b_raw_text <- c(b_raw_text, + paste0('\nfor (i in ', n_b_raw+1,':num_basis) {\nb[i] <- b_raw[i];\n}\n')) + } + + stan_file[grep('// basis coefficients', stan_file) + 2] <- paste0(b_raw_text, + collapse = '\n') + stan_file <- readLines(textConnection(stan_file), n = -1) + + # If no random effects, betas are equal to beta_raws + } else { + stan_file[grep('// basis coefficients', stan_file) + 2] <- + paste0('\nfor (i in ','1:num_basis) {\nb[i] <- b_raw[i];\n}') + stan_file <- readLines(textConnection(stan_file), n = -1) + } + + # Update parametric effect priors + if(any(grep('// parametric effect', stan_file))){ + stan_file[grep('// parametric effect', stan_file) + 1] <- + paste0('for (i in ', + + as.numeric(sub('.*(?=.$)', '', + sub("\\:.*", "", + stan_file[grep('// parametric effect', stan_file) + 1]), perl=T)), + ':', as.numeric(substr(sub(".*\\:", "", + stan_file[grep('// parametric effect', stan_file) + 1]), + 1, 1)), + ') {\nb_raw[i] ~ normal(0, 1);\n}') + stan_file <- readLines(textConnection(stan_file), n = -1) + } + unlink('base_gam_stan.txt') + stan_file <- readLines(textConnection(stan_file), n = -1) + + # Final tidying of the Stan model for readability + clean_up <- vector() + for(x in 1:length(stan_file)){ + clean_up[x] <- stan_file[x-1] == "" & stan_file[x] == "" + } + clean_up[is.na(clean_up)] <- FALSE + stan_file <- stan_file[!clean_up] + + + #### Modify the Stan data list #### + # Create matrix representing whether an observation was missing or not + y_observed <- matrix(NA, ncol = NCOL(jags_data$y), + nrow = NROW(jags_data$y)) + for (i in 1:dim(jags_data$y)[1]) { + for (s in 1:dim(jags_data$y)[2]) { + if (is.na(jags_data$y[i, s])) { + y_observed[i, s] = 0 + } else { + y_observed[i, s] = 1 + } + } + } + + # Use -1 for any missing observations so Stan doesn't throw errors due to NAs + y <- jags_data$y + y[is.na(y)] <- -1 + + # The data list for Stan + stan_data <- jags_data + stan_data$y <- y + stan_data$y_observed <- y_observed + stan_data$X <- t(stan_data$X) + stan_data$total_obs <- NCOL(stan_data$X) + stan_data$num_basis <- NROW(stan_data$X) + stan_data$n_sp <- as.numeric(sub('\\) \\{', '', + sub('for \\(i in 1\\:', '', + jags_file[grep('lambda\\[i\\] ~ ', + trimws(jags_file)) - 1]))) + + # Add discontiguous index values if required + if(add_idxs){ + names(idx_vals) <- paste0('idx', seq_len(length(idx_vals))) + stan_data <- append(stan_data, idx_vals) + } + + return(list(stan_file = stan_file, + model_data = stan_data)) +} diff --git a/R/mvgam.R b/R/mvgam.R new file mode 100644 index 00000000..e60688db --- /dev/null +++ b/R/mvgam.R @@ -0,0 +1,1040 @@ +#'Fit a Bayesian dynamic GAM to a univariate or multivariate set of discrete time series +#' +#'This function estimates the posterior distribution for Generalised Additive Models (GAMs) that can include +#'smooth functions, specified in the GAM formula, as well as latent temporal processes, specified by trend_model. +#'There are currently two options for specifying the structures of the trends (either as latent +#'dynamic factors to capture trend dependencies among series in a reduced dimension format, or as independent trends) +#' +#' +#'@param formula A \code{character} string specifying the GAM formula. These are exactly like the formula +#'for a GLM except that smooth terms, s, te, ti and t2, can be added to the right hand side +#'to specify that the linear predictor depends on smooth functions of predictors (or linear functionals of these). +#'@param knots An optional \code{list} containing user specified knot values to be used for basis construction. +#'For most bases the user simply supplies the knots to be used, which must match up with the k value supplied +#'(note that the number of knots is not always just k). Different terms can use different numbers of knots, +#'unless they share a covariate. +#'@param data_train A \code{dataframe} or \code{list} containing the model response variable and covariates +#'required by the GAM \code{formula}. Should include columns: +#''y' (the discrete outcomes; \code{NA}s allowed) +#''series' (character or factor index of the series IDs) +#''time' (numeric index of the time point for each observation). +#'Any other variables to be included in the linear predictor of \code{formula} must also be present +#'@param data_test Optional \code{dataframe} or \code{list} of test data containing at least 'series' and 'time' +#'in addition to any other variables included in the linear predictor of \code{formula}. If included, the +#'observations in \code{data_test} will be set to \code{NA} when fitting the model so that posterior +#'simulations can be obtained +#'@param run_model \code{logical}. If \code{FALSE}, the model is not fitted but instead the function will +#'return the model file and the data / initial values that are needed to fit the \code{JAGS} model +#'@param prior_simulation \code{logical}. If \code{TRUE}, no observations are fed to the model, and instead +#'simulations from prior distributions are returned +#'@param return_model_data \code{logical}. If \code{TRUE}, the list of data that is needed to fit the +#'model is returned, along with the initial values for smooth and AR parameters, once the model is fitted. +#'This will be helpful if users wish to modify the model file to add +#'other stochastic elements that are not currently avaiable in \code{mvgam}. Default is \code{FALSE} to reduce +#'the size of the returned object, unless \code{run_model == FALSE} +#'@param family \code{character}. Must be either 'nb' (for Negative Binomial), 'tw' (for Tweedie) or 'poisson' +#'@param use_lv \code{logical}. If \code{TRUE}, use dynamic factors to estimate series' +#'latent trends in a reduced dimension format. If \code{FALSE}, estimate independent latent trends for each series +#'@param n_lv \code{integer} the number of latent dynamic factors to use if \code{use_lv == TRUE}. +#'Cannot be \code{>n_series}. Defaults arbitrarily to \code{min(5, floor(n_series / 2))} +#'@param trend_model \code{character} specifying the time series dynamics for the latent trend. Options are: +#''None' (no latent trend component; i.e. the GAM component is all that contributes to the linear predictor, +#'and the observation process is the only source of error; similarly to what is estimated by \code{\link[mcgv]{gam}}), +#''RW' (random walk with possible drift), +#''AR1' (AR1 model with intercept), +#''AR2' (AR2 model with intercept) or +#''AR3' (AR3 model with intercept) or +#''GP' (Gaussian process with squared exponential kernel; currently under development and +#'only available for estimation in stan) +#'@param drift \code{logical} estimate a drift parameter in the latent trend components. Useful if the latent +#'trend is expected to broadly follow a non-zero slope. Note that if the latent trend is more or less stationary, +#'the drift parameter can become unidentifiable, especially if an intercept term is included in the GAM linear +#'predictor (which it is by default when calling \code{\link[mcgv]{jagam}}). Therefore this defaults to \code{FALSE} +#'@param chains \code{integer} specifying the number of parallel chains for the model +#'@param burnin \code{integer} specifying the number of iterations of the Markov chain to run during +#'adaptive mode to tune sampling algorithms +#'@param n_samples \code{integer} specifying the number of iterations of the Markov chain to run for +#'sampling the posterior distribution +#'@param thin Thinning interval for monitors +#'@param parallel \code{logical} specifying whether multiple cores should be used for +#'for generating \code{JAGS} simulations in parallel. If \code{TRUE}, the number of cores to use will be +#'\code{min(c(chains, parallel::detectCores() - 1))} +#'@param phi_prior \code{character} specifying (in JAGS syntax) the prior distribution for the drift terms/intercepts +#'in the latent trends +#'@param ar_prior \code{character} specifying (in JAGS syntax) the prior distribution for the AR terms +#'in the latent trends +#'@param r_prior \code{character} specifying (in JAGS syntax) the prior distribution for the Negative Binomial +#'overdispersion parameters. Note that this prior acts on the SQUARE ROOT of \code{r}, which is convenient +#'for inducing a complexity-penalising prior model whereby the observation process reduces to a Poisson +#'as the sampled parameter approaches \code{0}. Ignored if family is Poisson or Tweedie +#'@param twdis_prior \code{character} specifying (in JAGS syntax) the prior distribution for the Tweedie +#'overdispersion parameters. Ignored if family is Poisson or Negative Binomial +#'@param sigma_prior \code{character} specifying (in JAGS syntax) the prior distributions for the independent gaussian +#'variances used for the latent trends (ignored if \code{use_lv == TRUE}) +#'@param upper_bounds Optional \code{vector} of \code{integer} values specifying upper limits for each series. If supplied, +#'this generates a modified likelihood where values above the bound are given a likelihood of zero. Note this modification +#'is computationally expensive in \code{JAGS} but can lead to better estimates when true bounds exist. Default is to remove +#'truncation entirely (i.e. there is no upper bound for each series) +#'@param use_stan Logical. If \code{TRUE} and if \code{rstan} is installed, the model will be compiled and sampled using +#'the Hamiltonian Monte Carlo with a call to \code{\link[rstan]{stan}}. Note that this functionality is still in development and +#'not all options that are available in \code{JAGS} can be used, including: no option for a Tweedie family and no option for +#'dynamic factor trends. However, as \code{rstan} can estimate Hilbert base approximate gaussian processes, which +#'are much more computationally tractable than full GPs for time series with `>100` observations, estimation +#'in \code{rstan} can support latent GP trends while estimation in \code{JAGS} cannot +#'@param jags_path Optional character vector specifying the path to the location of the `JAGS` executable (.exe) to use +#'for modelling if `use_stan == FALSE`. If missing, the path will be recovered from a call to \code{\link[runjags]{findjags}} +#' +#'@details Dynamic GAMs are useful when we wish to predict future values from time series that show temporal dependence +#'but we do not want to rely on extrapolating from a smooth term (which can sometimes lead to unpredictable and unrealistic behaviours). +#'In addition, smooths can often try to wiggle excessively to capture any autocorrelation that is present in a time series, +#'which exacerbates the problem of forecasting ahead. As GAMs are very naturally viewed through a Bayesian lens, and we often +#'must model time series that show complex distributional features and missing data, parameters for `mvgam` models are estimated +#'in a Bayesian framework using Markov Chain Monte Carlo. +#'\cr +#'\cr +#'*Priors*: A \code{\link[mcgv]{jagam}} model file is generated from \code{formula} and modified to include any latent +#'temporal processes. Prior distributions for most important model parameters can be altered by the user to inspect model +#'sensitivities to given priors. Note that latent trends are estimated on the log scale so choose tau, AR and phi priors +#'accordingly. However more control over the model specification can be accomplished by first using `mvgam` as a +#'baseline, then editing the returned model accordingly. The model file can be edited and run outside +#'of `mvgam` by setting \code{run_model = TRUE} and this is encouraged for complex modelling tasks +#'\cr +#'\cr +#'*Random effects*: For any smooth terms using the random effect basis (\code{\link[mcgv]{smooth.construct.re.smooth.spec}}), +#'a non-centred parameterisation is automatically employed to avoid degeneracies that are common in hierarchical models. +#'Note however that centred versions may perform better for series that are particularly informative, so as with any +#'foray into Bayesian modelling, it is worth building an understanding of the model's assumptions and limitations by following a +#'principled workflow. +#'\cr +#'\cr +#'*Overdispersion parameters*: When more than one series is included in \code{data_train} and an overdispersed +#'exponential family is used, by default the overdispersion parameters (`r` for Poisson, `twdis` for Tweedie) are +#'estimated independently for each series. Note that for Tweedie +#'models, estimating the power parameter `p` alongside the overdispersion parameter +#'`twdis` and the smooth coefficients is very challenging for noisy data, introducing some difficult posterior geometries. +#'The `p` parameter is therefore fixed at `1.5` (i.e. a so-called Geometric Poisson model). +#'\cr +#'\cr +#'*Factor regularisation*: When using a dynamic factor model for the trends, factor precisions are given +#'regularized penalty priors to theoretically allow some factors to be dropped from the model by squeezing increasing +#'factors' variances to zero. This is done to help protect against selecting too many latent factors than are needed to +#'capture dependencies in the data, so it can often be advantageous to set `n_lv`` to a slightly larger number. However +#'larger numbers of factors do come with additional computational costs so these should be balanced as well. +#'\cr +#'\cr +#'*Residuals*: For each series, randomized quantile (i.e. Dunn-Smyth) residuals are calculated for inspecting model diagnostics +#'If the fitted model is appropriate then Dunn-Smyth residuals will be standard normal in distribution and no +#'autocorrelation will be evident. When a particular observation is missing, the residual is calculated by comparing independent +#'draws from the model's posterior distribution +#' +#'@author Nicholas J Clark +#' +#'@seealso \code{\link[rjags]{jags.model}}, \code{\link[mcgv]{jagam}}, \code{\link[mcgv]{gam}} +#'@return A \code{list} object of class \code{mvgam} containing model output, the text representation of the model file, +#'the mgcv model output (for easily generating simulations at +#'unsampled covariate values), Dunn-Smyth residuals for each series and key information needed +#'for other functions in the package +#' +#'@export + +mvgam = function(formula, + knots, + data_train, + data_test, + run_model = TRUE, + prior_simulation = FALSE, + return_model_data = FALSE, + family = 'poisson', + use_lv = FALSE, + n_lv, + trend_model = 'RW', + drift = FALSE, + chains = 2, + burnin = 5000, + n_samples = 2000, + thin = 4, + parallel = TRUE, + phi_prior, + ar_prior, + r_prior, + twdis_prior, + sigma_prior, + upper_bounds, + use_stan = FALSE, + jags_path){ + + # Check arguments + trend_model <- match.arg(arg = trend_model, choices = c("None", "RW", "AR1", "AR2", "AR3", "GP")) + family <- match.arg(arg = family, choices = c("nb", "poisson", "tw")) + + if(chains == 1){ + stop('number of chains must be >1') + } + if(sign(chains) != 1){ + stop('argument "chains" must be a positive integer', + call. = FALSE) + } else { + if(chains%%1 != 0){ + stop('argument "chains" must be a positive integer', + call. = FALSE) + } + } + + if(sign(burnin) != 1){ + stop('argument "burnin" must be a positive integer', + call. = FALSE) + } else { + if(burnin%%1 != 0){ + stop('argument "burnin" must be a positive integer', + call. = FALSE) + } + } + + # JAGS cannot support latent GP trends as there is no easy way to use Hilbert base + # approximation to reduce the computational demands + if(!use_stan & trend_model == 'GP'){ + if(!require(rstan)){ + stop('gaussian process trends not yet supported for JAGS and cannot find rstan library') + } else { + warning('gaussian process trends not yet supported for JAGS; reverting to stan') + use_stan <- TRUE + } + } + + # If Stan is to be used, make sure it is installed + if(use_stan){ + if(!require(rstan)){ + warning('rstan library not installed; reverting to JAGS') + use_stan <- FALSE + } + } + + # Stan can only handle certain options in the development phase + if(use_stan & use_lv){ + warning('dynamic factor trends not yet supported for stan; reverting to JAGS') + use_stan <- FALSE + } + + if(use_stan & family == 'tw'){ + warning('Tweedie family not yet supported for stan; reverting to JAGS') + use_stan <- FALSE + } + + # If the model is to be run in JAGS, make sure the JAGS software can be located + if(!use_stan){ + if(run_model){ + if(missing(jags_path)){ + jags_path <- runjags::findjags() + } + + # Code borrowed from the runjags package + jags_status <- runjags::testjags(jags_path, silent = TRUE) + if(!jags_status$JAGS.available){ + if(jags_status$os=="windows"){ + # Try it again - sometimes this helps + Sys.sleep(0.2) + jags_status <- runjags::testjags(jags_path, silent = TRUE) + } + + if(!jags_status$JAGS.available){ + cat("Unable to call JAGS using '", jags_path, + "'\nTry specifying the path to the JAGS binary as the jags_path argument, or installing the rjags package.\nUse the runjags::testjags() function for more detailed diagnostics.\n", sep="") + stop("Unable to call JAGS", call. = FALSE) + } + } + } + } + + # Add series factor variable if missing + if(class(data_train)[1] != 'list'){ + if(!'series' %in% colnames(data_train)){ + data_train$series <- factor('series1') + if(!missing(data_test)){ + data_test$series <- factor('series1') + } + } + + # Must be able to index by time; it is too dangerous to 'guess' as this could make a huge + # impact on resulting estimates / inferences + if(!'time' %in% colnames(data_train)){ + stop('data_train does not contain a "time" column') + } + } + + if(class(data_train)[1] == 'list'){ + if(!'series' %in% names(data_train)){ + data_train$series <- factor('series1') + if(!missing(data_test)){ + data_test$series <- factor('series1') + } + } + + if(!'time' %in% names(data_train)){ + stop('data_train does not contain a "time" column') + } + } + + # Number of latent variables cannot be greater than number of series + if(use_lv){ + if(missing(n_lv)){ + n_lv <- min(2, floor(length(unique(data_train$series)) / 2)) + } + if(n_lv > length(unique(data_train$series))){ + stop('number of latent variables cannot be greater than number of series') + } + } + + # No point in latent variables if trend model is None + if(trend_model == 'None' & use_lv){ + use_lv <- FALSE + warning('No point in latent variables if trend model is None; changing use_lv to FALSE') + } + + # Ensure outcome is labelled 'y' + form_terms <- terms(formula(formula)) + if(dimnames(attr(form_terms, 'factors'))[[1]][1] != 'y'){ + stop('Outcome variable must be named "y"') + } + + # If there are missing values in y, use predictions from an initial mgcv model to fill + # these in so that initial values to maintain the true size of the training dataset + orig_y <- data_train$y + + if(!missing(knots)){ + + # Estimate the GAM model using mgcv so that the linear predictor matrix can be easily calculated + # when simulating from the JAGS model later on + if(family == 'nb'){ + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = nb(), + drop.unused.levels = FALSE, + knots = knots, + control = list(nthreads = parallel::detectCores()-1, + maxit = 100)) + } else if(family == 'poisson'){ + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = poisson(), + drop.unused.levels = FALSE, + knots = knots, + control = list(nthreads = parallel::detectCores()-1, + maxit = 100)) + } else { + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = tw(), + drop.unused.levels = FALSE, + knots = knots, + control = list(nthreads = parallel::detectCores()-1, + maxit = 100)) + } + + } else { + if(family == 'nb'){ + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = nb(), + drop.unused.levels = FALSE, + control = list(nthreads = parallel::detectCores()-1)) + } else if(family == 'poisson'){ + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = poisson(), + drop.unused.levels = FALSE, + control = list(nthreads = parallel::detectCores()-1)) + } else { + ss_gam <- mgcv::gam(formula(formula), + data = data_train, + method = "REML", + family = tw(), + drop.unused.levels = FALSE, + control = list(nthreads = parallel::detectCores()-1)) + } + + } + + # Fill in missing observations in data_train so the size of the dataset is correct when + # building the JAGS model + data_train$y[which(is.na(data_train$y))] <- round(predict(ss_gam, + newdata = data_train, + type = 'response'), 0)[which(is.na(data_train$y))] + + + # Make jags file and appropriate data structures; note this has to use Poisson but the + # resulting JAGS file will be modified to accomodate the specified response distribution accordingly + if(!missing(knots)){ + ss_jagam <- mgcv::jagam(formula, + data = data_train, + family = poisson(), + drop.unused.levels = FALSE, + file = 'base_gam.txt', + sp.prior = 'gamma', + diagonalize = F, + knots = knots) + } else { + ss_jagam <- mgcv::jagam(formula, + data = data_train, + family = poisson(), + drop.unused.levels = FALSE, + file = 'base_gam.txt', + sp.prior = 'gamma', + diagonalize = F) + } + + # Update initial values of lambdas using the full estimates from the + # fitted bam model to speed convergence; remove initial betas so that the + # chains can start in very different regions of the parameter space + ss_jagam$jags.ini$b <- NULL + + if(length(ss_gam$sp) == length(ss_jagam$jags.ini$lambda)){ + ss_jagam$jags.ini$lambda <- ss_gam$sp + ss_jagam$jags.ini$lambda[log(ss_jagam$jags.ini$lambda) > 10] <- exp(10) + } + + # Fill y with NAs if this is a simulation from the priors + if(prior_simulation){ + data_train$y <- rep(NA, length(data_train$y)) + } else { + data_train$y <- orig_y + } + + # Read in the base (unmodified) jags model file + base_model <- suppressWarnings(readLines('base_gam.txt')) + + # Remove lines from the linear predictor section + lines_remove <- c(1:grep('## response', base_model)) + base_model <- base_model[-lines_remove] + + # Any parametric effects in the gam (particularly the intercept) need to get more sensible priors to ensure they + # do not directly compete with the latent trends + if(any(grepl('Parametric effect priors', base_model))){ + + in_parenth <- regmatches(base_model[grep('Parametric effect priors', + base_model) + 1], + gregexpr( "(?<=\\().+?(?=\\))", base_model[grep('Parametric effect priors', + base_model) + 1], perl = T))[[1]][1] + n_terms <- as.numeric(sub(".*:", "", in_parenth)) + ss_jagam$jags.data$p_coefs <- coef(ss_gam)[1:n_terms] + + rmvn <- function(n,mu,sig) { + L <- mroot(sig);m <- ncol(L); + t(mu + L%*%matrix(rnorm(m*n),m,n)) + } + + beta_sims <- rmvn(1000, coef(ss_gam), ss_gam$Vp) + ss_jagam$jags.data$p_taus <- apply(as.matrix(beta_sims[,1:n_terms]), + 2, function(x) 1 / sd(x)) + + base_model[grep('Parametric effect priors', + base_model) + 1] <- paste0(' for (i in 1:', + n_terms, + ') { b[i] ~ dnorm(p_coefs[i], p_taus[i]) }') + base_model[grep('Parametric effect priors', + base_model)] <- c(' ## parametric effect priors (regularised for identifiability)') + } + + # For any random effect smooths, use the non-centred parameterisation to avoid degeneracies + smooth_labs <- do.call(rbind, lapply(seq_along(ss_gam$smooth), function(x){ + data.frame(label = ss_gam$smooth[[x]]$label, class = class(ss_gam$smooth[[x]])[1]) + })) + + if(any(smooth_labs$class == 'random.effect')){ + re_smooths <- smooth_labs %>% + dplyr::filter(class == 'random.effect') %>% + dplyr::pull(label) + + for(i in 1:length(re_smooths)){ + in_parenth <- regmatches(base_model[grep(re_smooths[i], + base_model, fixed = T) + 1], + gregexpr( "(?<=\\().+?(?=\\))", base_model[grep(re_smooths[i], + base_model, fixed = T) + 1], + perl = T))[[1]][1] + n_terms <- as.numeric(sub(".*:", "", in_parenth)) + n_start <- as.numeric(strsplit(sub(".*\\(", "", in_parenth), ':')[[1]][1]) + base_model[grep(re_smooths[i], + base_model, fixed = T) + 1] <- paste0(' for (i in ', n_start, ':', + n_terms, + ') {\n b_raw[i] ~ dnorm(0, 1)\n', + 'b[i] <- ', + paste0('mu_raw', i), ' + b_raw[i] * ', + paste0('sigma_raw', i), '\n }\n ', + paste0('sigma_raw', i), ' ~ dexp(1)\n', + paste0('mu_raw', i), ' ~ dnorm(0, 1)') + base_model[grep(re_smooths[i], + base_model, fixed = T)] <- paste0(' ## prior (non-centred) for ', re_smooths[i], '...') + } + + } + + base_model[grep('smoothing parameter priors', + base_model)] <- c(' ## smoothing parameter priors...') + + # Add replacement lines for priors, trends and the linear predictor + fil <- tempfile(fileext = ".xt") + modification <- add_base_dgam_lines(use_lv) + cat(c(readLines(textConnection(modification)), base_model), file = fil, + sep = "\n") + model_file <- trimws(readLines(fil, n = -1)) + + # Update prior distributions + if(!missing(phi_prior)){ + model_file[grep('phi\\[s\\] ~', model_file)] <- paste0(' phi[s] ~ ', phi_prior) + } + + if(!missing(ar_prior)){ + model_file[grep('ar1\\[s\\] ~', model_file)] <- paste0(' ar1[s] ~ ', ar_prior) + model_file[grep('ar2\\[s\\] ~', model_file)] <- paste0(' ar2[s] ~ ', ar_prior) + model_file[grep('ar3\\[s\\] ~', model_file)] <- paste0(' ar3[s] ~ ', ar_prior) + } + + if(!missing(sigma_prior)){ + model_file[grep('sigma\\[s\\] ~', model_file)] <- paste0(' sigma[s] ~ ', sigma_prior) + } + + # Modify observation distribution lines + if(family == 'tw'){ + model_file <- add_tweedie_lines(model_file, upper_bounds = upper_bounds, + twdis_prior = twdis_prior) + + } else if(family == 'poisson'){ + model_file <- add_poisson_lines(model_file, upper_bounds = upper_bounds) + + } else { + if(missing(upper_bounds)){ + model_file[grep('y\\[i, s\\] ~', model_file)] <- ' y[i, s] ~ dnegbin(rate[i, s], r[s])' + model_file[grep('ypred\\[i, s\\] ~', model_file)] <- ' ypred[i, s] ~ dnegbin(rate[i, s], r[s])' + } + + if(!missing(r_prior)){ + model_file[grep('r_raw\\[s\\] ~', model_file)] <- paste0('r_raw[s] ~ ', r_prior) + } + } + + # Modify lines needed for the specified trend model + model_file <- add_trend_lines(model_file, stan = F, + use_lv = use_lv, + trend_model = trend_model, + drift = drift) + + # Use informative priors based on the fitted mgcv model to speed convergence + # and eliminate searching over strange parameter spaces + if(length(ss_gam$sp) == length(ss_jagam$jags.ini$lambda)){ + model_file[grep('lambda\\[i\\] ~', model_file)] <- ' lambda[i] ~ dexp(1/sp[i])T(0.0001, )' + } else { + model_file[grep('lambda\\[i\\] ~', model_file)] <- ' lambda[i] ~ dexp(0.05)T(0.0001, )' + } + + # Final tidying of the JAGS model for readability + clean_up <- vector() + for(x in 1:length(model_file)){ + clean_up[x] <- model_file[x-1] == "" & model_file[x] == "" + } + clean_up[is.na(clean_up)] <- FALSE + model_file <- model_file[!clean_up] + + model_file_jags <- textConnection(model_file) + + # Covariate dataframe including training and testing observations + if(!missing(data_test)){ + X <- data.frame(rbind(ss_jagam$jags.data$X, + predict(ss_gam, newdata = data_test, type = 'lpmatrix'))) + + # Add a time variable + if(class(data_train)[1] == 'list'){ + temp_dat_train <- data.frame(time = data_train$time, + series = data_train$series) + temp_dat_test <- data.frame(time = data_test$time, + series = data_test$series) + + X$time <- rbind(temp_dat_train, temp_dat_test) %>% + dplyr::left_join(rbind(temp_dat_train, temp_dat_test) %>% + dplyr::select(time) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::mutate(time = dplyr::row_number()), + by = c('time')) %>% + dplyr::pull(time) + + # Add a series identifier variable + X$series <- as.numeric(rbind(temp_dat_train, temp_dat_test)$series) + + # Add an outcome variable + X$outcome <- c(orig_y, rep(NA, NROW(temp_dat_test))) + + } else { + + X$time <- rbind(data_train, data_test[,1:ncol(data_train)]) %>% + dplyr::left_join(rbind(data_train, data_test[,1:ncol(data_train)]) %>% + dplyr::select(time) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::mutate(time = dplyr::row_number()), + by = c('time')) %>% + dplyr::pull(time) + + # Add a series identifier variable + X$series <- as.numeric(rbind(data_train, data_test)$series) + + # Add an outcome variable + X$outcome <- c(data_train$y, rep(NA, NROW(data_test))) + } + + } else { + X <- data.frame(ss_jagam$jags.data$X) + + if(class(data_train)[1] == 'list'){ + temp_dat <- data.frame(time = data_train$time) + X$time <- temp_dat %>% + dplyr::left_join(temp_dat %>% + dplyr::select(time) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::mutate(time = dplyr::row_number()), + by = c('r')) %>% + dplyr::pull(time) + } else { + X$time <- data_train %>% + dplyr::left_join(data_train %>% + dplyr::select(time) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::mutate(time = dplyr::row_number()), + by = c('time')) %>% + dplyr::pull(time) + } + + X$outcome <- c(data_train$y) + X$series <- as.numeric(data_train$series) + } + + # Arrange by time then by series + X %>% + dplyr::arrange(time, series) -> X + + # Matrix of indices in X that correspond to timepoints for each series + ytimes <- matrix(NA, nrow = length(unique(X$time)), ncol = length(unique(X$series))) + for(i in 1:length(unique(X$series))){ + ytimes[,i] <- which(X$series == i) + } + ss_jagam$jags.data$ytimes <- ytimes + + # Matrix of outcomes in X that correspond to each series at each timepoint + ys_mat <- matrix(NA, nrow = NROW(ytimes), ncol = NCOL(ytimes)) + for(i in 1:length(unique(X$series))){ + ys_mat[,i] <- X$outcome[which(X$series == i)] + } + ss_jagam$jags.data$y <- ys_mat + + # Other necessary variables for JAGS + ss_jagam$jags.data$n <- NROW(ytimes) + ss_jagam$jags.data$n_series <- NCOL(ytimes) + ss_jagam$jags.data$X <- as.matrix(X %>% + dplyr::select(-time, -series, -outcome)) + if(!missing(upper_bounds)){ + ss_jagam$jags.data$upper_bound <- upper_bounds + } + + if(length(ss_gam$sp) == length(ss_jagam$jags.ini$lambda)){ + ss_jagam$jags.data$sp <- ss_gam$sp + } + + # Machine epsilon for minimum allowable non-zero rate + if(family == 'nb'){ + ss_jagam$jags.data$min_eps <- .Machine$double.eps + } + + # Number of latent variables to use + if(use_lv){ + if(missing(n_lv)){ + ss_jagam$jags.data$n_lv <- min(2, floor(ss_jagam$jags.data$n_series / 2)) + } else { + ss_jagam$jags.data$n_lv <- n_lv + ss_jagam$jags.ini$X1 <- rep(1, n_lv) + ss_jagam$jags.ini$X2 <- 1 + } + if(ss_jagam$jags.data$n_lv > ss_jagam$jags.data$n_series){ + stop('Number of latent variables cannot be greater than number of series') + } + } + + if(missing(upper_bounds)){ + upper_bounds <- NULL + } + + if(use_lv){ + n_lv <- ss_jagam$jags.data$n_lv + } else { + n_lv <- NULL + } + + # Add information about the call and necessary data structures to the model file + # Get dimensions and numbers of smooth terms + snames <- names(ss_jagam$jags.data)[grep('S.*', names(ss_jagam$jags.data))] + smooth_dims <- matrix(NA, ncol = 2, nrow = length(snames)) + for(i in 1:length(snames)){ + smooth_dims[i,] <- dim(ss_jagam$jags.data[[snames[i]]]) + } + smooth_penalty_data <- vector() + for(i in 1:length(snames)){ + smooth_penalty_data[i] <- paste0('matrix ', + snames[i], + '; mgcv smooth penalty matrix ', snames[i]) + } + + if('sp' %in% names(ss_jagam$jags.data)){ + if(length(ss_jagam$jags.data$sp) == 1){ + sp_data <- c(paste0('real sp; inverse exponential location prior for smoothing parameter ', + paste0(names(ss_jagam$jags.data$sp))), + '___________values ranging 5 - 50 are a good start') + } else { + sp_data <- c(paste0('vector sp; inverse exponential location priors for smoothing parameters: ', + paste0(names(ss_jagam$jags.data$sp), collapse = '; ')), + '___________values ranging 5 - 50 are a good start') + } + + } else { + sp_data <- NULL + } + + if('p_coefs' %in% names(ss_jagam$jags.data)){ + parametric_ldata <- paste0('vector p_coefs; vector (length = ', + length(ss_jagam$jags.data$p_coefs), + ') of prior Gaussian means for parametric effects') + } else { + parametric_ldata <- NULL + } + + if('p_taus' %in% names(ss_jagam$jags.data)){ + parametric_tdata <- paste0('vector p_taus; vector (length = ', + length(ss_jagam$jags.data$p_coefs), + ') of prior Gaussian precisions for parametric effects') + } else { + parametric_tdata <- NULL + } + + model_file <- c('JAGS code generated by package mvgam', + '\n', + 'GAM formula:', + gsub('\"', '', paste(formula[2],formula[3],sep=' ~ ')), + '\n', + 'Trend model:', + trend_model, + '\n', + 'Required data:', + 'integer n; number of timepoints per series\n', + 'integer n_series; number of series\n', + 'matrix y; time-ordered observations of dimension n x n_series (missing values allowed)\n', + 'matrix ytimes; time-ordered n x n_series matrix (which row in X belongs to each [time, series] observation?)\n', + 'matrix X; mgcv GAM design matrix of dimension (n x n_series) x basis dimension\n', + paste0(smooth_penalty_data), + 'vector zero; prior basis coefficient locations vector of length ncol(X)\n', + paste0(parametric_ldata), + paste0(parametric_tdata), + sp_data, + '\n', + model_file) + + if(!run_model){ + # Return only the model file and all data / inits needed to run the model + # outside of mvgam + unlink('base_gam.txt') + if(use_stan){ + # Import the base Stan model file + modification <- add_base_dgam_lines(stan = TRUE) + unlink('base_gam_stan.txt') + cat(modification, file = 'base_gam_stan.txt', sep = '\n', append = T) + base_stan_model <- trimws(suppressWarnings(readLines('base_gam_stan.txt'))) + unlink('base_gam_stan.txt') + + # Add necessary trend structure + base_stan_model <- add_trend_lines(model_file = base_stan_model, + stan = T, + trend_model = trend_model, + drift = drift) + + # Add remaining data, model and parameters blocks to the Stan model file; + # gather Stan data structure + stan_objects <- add_stan_data(jags_file = trimws(model_file), + stan_file = base_stan_model, + jags_data = ss_jagam$jags.data, + family = family) + + output <- structure(list(call = formula, + family = dplyr::case_when(family == 'tw' ~ 'Tweedie', + family == 'poisson' ~ 'Poisson', + TRUE ~ 'Negative Binomial'), + trend_model = trend_model, + drift = drift, + pregam = ss_jagam$pregam, + model_file = stan_objects$stan_file, + model_data = stan_objects$model_data, + mgcv_model = ss_gam, + sp_names = names(ss_jagam$pregam$lsp0), + ytimes = ytimes, + use_lv = use_lv, + n_lv = n_lv, + upper_bounds = upper_bounds, + obs_data = data_train, + fit_engine = 'stan'), + class = 'mvgam_prefit') + } else { + output <- structure(list(call = formula, + family = dplyr::case_when(family == 'tw' ~ 'Tweedie', + family == 'poisson' ~ 'Poisson', + TRUE ~ 'Negative Binomial'), + trend_model = trend_model, + drift = drift, + pregam = ss_jagam$pregam, + model_file = trimws(model_file), + model_data = ss_jagam$jags.data, + mgcv_model = ss_gam, + sp_names = names(ss_jagam$pregam$lsp0), + ytimes = ytimes, + use_lv = use_lv, + n_lv = n_lv, + upper_bounds = upper_bounds, + obs_data = data_train, + fit_engine = 'jags'), + class = 'mvgam_prefit') + } + + } else { + + if(use_stan){ + fit_engine <- 'stan' + # Import the base Stan model file + modification <- add_base_dgam_lines(stan = TRUE) + unlink('base_gam_stan.txt') + cat(modification, file = 'base_gam_stan.txt', sep = '\n', append = T) + base_stan_model <- trimws(suppressWarnings(readLines('base_gam_stan.txt'))) + unlink('base_gam_stan.txt') + + # Add necessary trend structure + base_stan_model <- add_trend_lines(model_file = base_stan_model, + stan = T, + trend_model = trend_model, + drift = drift) + + # Add remaining data, model and parameters blocks to the Stan model file; + # gather Stan data structure + stan_objects <- add_stan_data(jags_file = trimws(model_file), + stan_file = base_stan_model, + jags_data = ss_jagam$jags.data, + family = family) + model_data <- stan_objects$model_data + + # Should never call library in a function, but just doing this for now while + # developing stan functionality!! + require(rstan) + options(mc.cores = parallel::detectCores()) + + # Sensible inits needed for the betas + if(trend_model != 'None'){ + initf1 <- function() { + list(b_raw = runif(stan_objects$model_data$num_basis, -0.25, 0.25)) + } + } else { + initf1 <- function() { + list(b_raw = runif(stan_objects$model_data$num_basis, -0.25, 0.25), + sigma = runif(stan_objects$model_data$n_series, 0.075, 1)) + } + } + inits <- initf1 + + # Fit the model in stan + if(trend_model == 'GP'){ + stan_control <- list(max_treedepth = 12) + } else { + stan_control <- list(max_treedepth = 10) + } + + fit1 <- stan(model_code = stan_objects$stan_file, + iter = burnin + 1000, + chains = chains, + data = stan_objects$model_data, + cores = min(c(chains, parallel::detectCores() - 1)), + init = initf1, + verbose = F, + control = stan_control, + pars = get_monitor_pars(family = family, + use_lv = use_lv, + trend_model = trend_model, + drift = drift), + refresh = 500) + + # Use Michael Betancourt's utility functions for checking diagnostics + #source('https://raw.githubusercontent.com/betanalpha/knitr_case_studies/master/factor_modeling/stan_utility.R') + check_all_diagnostics(fit1) + + out_gam_mod <- fit1 + } + + if(!use_stan){ + fit_engine <- 'jags' + model_data <- ss_jagam$jags.data + + # Set monitor parameters and initial values + param <- get_monitor_pars(family, use_lv, trend_model, drift) + + initlist <- replicate(chains, ss_jagam$jags.ini, + simplify = FALSE) + inits <- initlist + runjags::runjags.options(silent.jags = TRUE, silent.runjags = TRUE) + n.burn <- burnin + + # Initiate adaptation of the model for the full burnin period. This is necessary as JAGS + # will take a while to optimise the samplers, so long adaptation with little 'burnin' + # is more crucial than little adaptation but long 'burnin' https://mmeredith.net/blog/2016/Adapt_or_burn.htm + unlink('base_gam.txt') + cat(model_file, file = 'base_gam.txt', sep = '\n', append = T) + if(parallel){ + cl <- parallel::makePSOCKcluster(min(c(chains, parallel::detectCores() - 1))) + setDefaultCluster(cl) + gam_mod <- runjags::run.jags(model = 'base_gam.txt', + data = ss_jagam$jags.data, + modules = 'glm', + inits = initlist, + n.chains = chains, + # Rely on long adaptation to tune samplers appropriately + adapt = max(1000, n.burn - 1000), + burnin = 1000, + sample = n_samples, + jags = jags_path, + thin = thin, + method = "rjparallel", + monitor = c(param, 'deviance'), + silent.jags = TRUE, + cl = cl) + stopCluster(cl) + } else { + gam_mod <- runjags::run.jags(model = 'base_gam.txt', + data = ss_jagam$jags.data, + modules = 'glm', + inits = initlist, + n.chains = chains, + # Rely on long adaptation to tune samplers appropriately + adapt = max(1000, n.burn - 1000), + burnin = 1000, + sample = n_samples, + jags = jags_path, + thin = thin, + method = "rjags", + monitor = c(param, 'deviance'), + silent.jags = TRUE) + } + out_gam_mod <- coda::as.mcmc.list(gam_mod) + } + + unlink('base_gam.txt') + unlink(fil) + + # Get Dunn-Smyth Residual distributions for each series + series_resids <- get_mvgam_resids(object = list( + model_output = out_gam_mod, + fit_engine = fit_engine, + family = dplyr::case_when(family == 'tw' ~ 'Tweedie', + family == 'poisson' ~ 'Poisson', + TRUE ~ 'Negative Binomial'), + obs_data = data_train, + ytimes = ytimes), + n_cores = min(c(parallel::detectCores() - 1, NCOL(ytimes)))) + + # Create a jam object and get smooth penalty names in more interpretable format + ## Modified sim2jam function; takes simulation output + ## and a pregam object from jagam, and attempts to create a fake gam object suitable + ## for calculating estimated degrees of freedom + create_jam <- function(out_gam_mod, pregam) { + + b <- t(MCMCvis::MCMCchains(out_gam_mod, 'b')) + pregam$Vp <- cov(t(b)) + pregam$coefficients <- rowMeans(b) + pregam$sig2 <- 1 + + # Compute estimated degrees of freedom + eta <- pregam$X %*% pregam$coefficients + mu <- pregam$family$linkinv(eta) + w <- as.numeric(pregam$w * pregam$family$mu.eta(eta)^2 / pregam$family$variance(mu)) + XWX <- t(pregam$X) %*% (w*pregam$X) + + pregam$edf <- rowSums(pregam$Vp*t(XWX)) / pregam$sig2 + class(pregam) <- "jam" + pregam + } + + jam <- create_jam(out_gam_mod, ss_jagam$pregam) + jam$sp <- exp(colMeans(MCMCvis::MCMCchains(out_gam_mod, 'rho'))) + + name_starts <- unlist(purrr:::map(jam$smooth, 'first.sp')) + name_ends <- unlist(purrr:::map(jam$smooth, 'last.sp')) + + rho_names <- unlist(lapply(seq(1:length(ss_gam$smooth)), function(i){ + + number_seq <- seq(1:(1 + name_ends[i] - name_starts[i])) + number_seq[1] <- '' + + paste0(rep(ss_gam$smooth[[i]]$label, + length(number_seq)), + number_seq) + })) + + if(use_stan){ + model_file <- stan_objects$stan_file + } else { + model_file <- trimws(model_file) + } + + if(return_model_data){ + output <- structure(list(call = formula, + family = dplyr::case_when(family == 'tw' ~ 'Tweedie', + family == 'poisson' ~ 'Poisson', + TRUE ~ 'Negative Binomial'), + trend_model = trend_model, + drift = drift, + pregam = ss_jagam$pregam, + model_output = out_gam_mod, + model_file = model_file, + sp_names = rho_names, + model_data = model_data, + inits = inits, + mgcv_model = ss_gam, + jam_model = jam, + ytimes = ytimes, + resids = series_resids, + use_lv = use_lv, + n_lv = n_lv, + upper_bounds = upper_bounds, + obs_data = data_train, + fit_engine = fit_engine), + class = 'mvgam') + } else { + output <- structure(list(call = formula, + family = dplyr::case_when(family == 'tw' ~ 'Tweedie', + family == 'poisson' ~ 'Poisson', + TRUE ~ 'Negative Binomial'), + trend_model = trend_model, + drift = drift, + pregam = ss_jagam$pregam, + model_output = out_gam_mod, + model_file = model_file, + sp_names = rho_names, + mgcv_model = ss_gam, + jam_model = jam, + ytimes = ytimes, + resids = series_resids, + use_lv = use_lv, + n_lv = n_lv, + upper_bounds = upper_bounds, + obs_data = data_train, + fit_engine = fit_engine), + class = 'mvgam') + } + } + + return(output) +} diff --git a/R/stan_checks.R b/R/stan_checks.R new file mode 100644 index 00000000..fef45551 --- /dev/null +++ b/R/stan_checks.R @@ -0,0 +1,198 @@ +#' Check transitions that ended with a divergence +#' @param fit A stanfit object +#' @param quiet Logical (verbose or not?) +#' @details Utility function written by Michael Betancourt (https://betanalpha.github.io/) +#' @noRd +check_div <- function(fit, quiet=FALSE) { + sampler_params <- rstan::get_sampler_params(fit, inc_warmup=FALSE) + divergent <- do.call(rbind, sampler_params)[,'divergent__'] + n = sum(divergent) + N = length(divergent) + + if (!quiet) print(sprintf('%s of %s iterations ended with a divergence (%s%%)', + n, N, 100 * n / N)) + if (n > 0) { + if (!quiet) print(' Try running with larger adapt_delta to remove the divergences') + if (quiet) return(FALSE) + } else { + if (quiet) return(TRUE) + } +} + +#' Check transitions that ended prematurely due to maximum tree depth limit +#' @param fit A stanfit object +#' @param quiet Logical (verbose or not?) +#' @details Utility function written by Michael Betancourt (https://betanalpha.github.io/) +#' @noRd +check_treedepth <- function(fit, max_depth = 10, quiet=FALSE) { + sampler_params <- rstan::get_sampler_params(fit, inc_warmup=FALSE) + treedepths <- do.call(rbind, sampler_params)[,'treedepth__'] + n = length(treedepths[sapply(treedepths, function(x) x == max_depth)]) + N = length(treedepths) + + if (!quiet) + print(sprintf('%s of %s iterations saturated the maximum tree depth of %s (%s%%)', + n, N, max_depth, 100 * n / N)) + + if (n > 0) { + if (!quiet) print(' Run again with max_treedepth set to a larger value to avoid saturation') + if (quiet) return(FALSE) + } else { + if (quiet) return(TRUE) + } +} + +#' Check the energy fraction of missing information (E-FMI) +#' @param fit A stanfit object +#' @param quiet Logical (verbose or not?) +#' @details Utility function written by Michael Betancourt (https://betanalpha.github.io/) +#' @noRd +check_energy <- function(fit, quiet=FALSE) { + sampler_params <- rstan::get_sampler_params(fit, inc_warmup=FALSE) + no_warning <- TRUE + for (n in 1:length(sampler_params)) { + energies = sampler_params[n][[1]][,'energy__'] + numer = sum(diff(energies)**2) / length(energies) + denom = var(energies) + if (numer / denom < 0.2) { + if (!quiet) print(sprintf('Chain %s: E-FMI = %s', n, numer / denom)) + no_warning <- FALSE + } + } + if (no_warning) { + if (!quiet) print('E-FMI indicated no pathological behavior') + if (quiet) return(TRUE) + } else { + if (!quiet) print(' E-FMI below 0.2 indicates you may need to reparameterize your model') + if (quiet) return(FALSE) + } +} + +#' Check the effective sample size per iteration +#' @param fit A stanfit object +#' @param quiet Logical (verbose or not?) +#' @details Utility function written by Michael Betancourt (https://betanalpha.github.io/) +#' @noRd +check_n_eff <- function(fit, quiet=FALSE) { + fit_summary <- rstan::summary(fit, probs = c(0.5))$summary + N <- dim(fit_summary)[[1]] + + iter <- dim(rstan:::extract(fit)[[1]])[[1]] + + no_warning <- TRUE + for (n in 1:N) { + if (is.nan(fit_summary[,'n_eff'][n])) { + if (!quiet) print(sprintf('n_eff for parameter %s is NaN!', + rownames(fit_summary)[n])) + no_warning <- FALSE + } else { + ratio <- fit_summary[,'n_eff'][n] / iter + if (ratio < 0.001) { + if (!quiet) print(sprintf('n_eff / iter for parameter %s is %s!', + rownames(fit_summary)[n], ratio)) + no_warning <- FALSE + } + } + } + if (no_warning) { + if (!quiet) print('n_eff / iter looks reasonable for all parameters') + if (quiet) return(TRUE) + } + else { + if (!quiet) print(' n_eff / iter below 0.001 indicates that the effective sample size has likely been overestimated') + if (quiet) return(FALSE) + } +} + +#' Check the potential scale reduction factors +#' @param fit A stanfit object +#' @param quiet Logical (verbose or not?) +#' @details Utility function written by Michael Betancourt (https://betanalpha.github.io/) +#' @noRd +check_rhat <- function(fit, quiet=FALSE) { + fit_summary <- rstan::summary(fit, probs = c(0.5))$summary + N <- dim(fit_summary)[[1]] + + no_warning <- TRUE + for (n in 1:N) { + rhat <- fit_summary[,'Rhat'][n] + if (rhat > 1.1 || is.infinite(rhat) || is.nan(rhat)) { + if (!quiet) print(sprintf('Rhat for parameter %s is %s!', + rownames(fit_summary)[n], rhat)) + no_warning <- FALSE + } + } + if (no_warning) { + if (!quiet) print('Rhat looks reasonable for all parameters') + if (quiet) return(TRUE) + } else { + if (!quiet) print(' Rhat above 1.1 indicates that the chains very likely have not mixed') + if (quiet) return(FALSE) + } +} + +#' Run all diagnostic checks +#' @param fit A stanfit object +#' @param quiet Logical (verbose or not?) +#' @details Utility function written by Michael Betancourt (https://betanalpha.github.io/) +#' @noRd +check_all_diagnostics <- function(fit, quiet=FALSE) { + if (!quiet) { + check_n_eff(fit) + check_rhat(fit) + check_div(fit) + check_treedepth(fit) + check_energy(fit) + } else { + warning_code <- 0 + + if (!check_n_eff(fit, quiet=TRUE)) + warning_code <- bitwOr(warning_code, bitwShiftL(1, 0)) + if (!check_rhat(fit, quiet=TRUE)) + warning_code <- bitwOr(warning_code, bitwShiftL(1, 1)) + if (!check_div(fit, quiet=TRUE)) + warning_code <- bitwOr(warning_code, bitwShiftL(1, 2)) + if (!check_treedepth(fit, quiet=TRUE)) + warning_code <- bitwOr(warning_code, bitwShiftL(1, 3)) + if (!check_energy(fit, quiet=TRUE)) + warning_code <- bitwOr(warning_code, bitwShiftL(1, 4)) + + return(warning_code) + } +} + +#' Parse warnings +#' @param warning_code Type of warning code to generate +#' @details Utility function written by Michael Betancourt (https://betanalpha.github.io/) +#' @noRd +parse_warning_code <- function(warning_code) { + if (bitwAnd(warning_code, bitwShiftL(1, 0))) + print("n_eff / iteration warning") + if (bitwAnd(warning_code, bitwShiftL(1, 1))) + print("rhat warning") + if (bitwAnd(warning_code, bitwShiftL(1, 2))) + print("divergence warning") + if (bitwAnd(warning_code, bitwShiftL(1, 3))) + print("treedepth warning") + if (bitwAnd(warning_code, bitwShiftL(1, 4))) + print("energy warning") +} + +#' Return parameter arrays separated into divergent and non-divergent transitions +#' @param fit A stanfit object +#' @details Utility function written by Michael Betancourt (https://betanalpha.github.io/) +#' @noRd +partition_div <- function(fit) { + nom_params <- rstan:::extract(fit, permuted=FALSE) + n_chains <- dim(nom_params)[2] + params <- as.data.frame(do.call(rbind, lapply(1:n_chains, function(n) nom_params[,n,]))) + + sampler_params <- get_sampler_params(fit, inc_warmup=FALSE) + divergent <- do.call(rbind, sampler_params)[,'divergent__'] + params$divergent <- divergent + + div_params <- params[params$divergent == 1,] + nondiv_params <- params[params$divergent == 0,] + + return(list(div_params, nondiv_params)) +} diff --git a/man/add_stan_data.Rd b/man/add_stan_data.Rd new file mode 100644 index 00000000..a881da6d --- /dev/null +++ b/man/add_stan_data.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add_stan_data.R +\name{add_stan_data} +\alias{add_stan_data} +\title{Add remaining data, model and parameter blocks to a Stan model} +\usage{ +add_stan_data(jags_file, stan_file, jags_data, family = "poisson") +} +\arguments{ +\item{jags_file}{Prepared JAGS mvgam model file} + +\item{stan_file}{Incomplete Stan model file to be edited} + +\item{jags_data}{Prepared mvgam data for JAGS modelling} +} +\value{ +A \code{list} containing the updated Stan model and model data +} +\description{ +Add remaining data, model and parameter blocks to a Stan model +} diff --git a/man/mvgam.Rd b/man/mvgam.Rd new file mode 100644 index 00000000..9315480f --- /dev/null +++ b/man/mvgam.Rd @@ -0,0 +1,199 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mvgam.R +\name{mvgam} +\alias{mvgam} +\title{Fit a Bayesian dynamic GAM to a univariate or multivariate set of discrete time series} +\usage{ +mvgam( + formula, + knots, + data_train, + data_test, + run_model = TRUE, + prior_simulation = FALSE, + return_model_data = FALSE, + family = "poisson", + use_lv = FALSE, + n_lv, + trend_model = "RW", + drift = FALSE, + chains = 2, + burnin = 5000, + n_samples = 2000, + thin = 4, + parallel = TRUE, + phi_prior, + ar_prior, + r_prior, + twdis_prior, + sigma_prior, + upper_bounds, + use_stan = FALSE, + jags_path +) +} +\arguments{ +\item{formula}{A \code{character} string specifying the GAM formula. These are exactly like the formula +for a GLM except that smooth terms, s, te, ti and t2, can be added to the right hand side +to specify that the linear predictor depends on smooth functions of predictors (or linear functionals of these).} + +\item{knots}{An optional \code{list} containing user specified knot values to be used for basis construction. +For most bases the user simply supplies the knots to be used, which must match up with the k value supplied +(note that the number of knots is not always just k). Different terms can use different numbers of knots, +unless they share a covariate.} + +\item{data_train}{A \code{dataframe} or \code{list} containing the model response variable and covariates +required by the GAM \code{formula}. Should include columns: +'y' (the discrete outcomes; \code{NA}s allowed) +'series' (character or factor index of the series IDs) +'time' (numeric index of the time point for each observation). +Any other variables to be included in the linear predictor of \code{formula} must also be present} + +\item{data_test}{Optional \code{dataframe} or \code{list} of test data containing at least 'series' and 'time' +in addition to any other variables included in the linear predictor of \code{formula}. If included, the +observations in \code{data_test} will be set to \code{NA} when fitting the model so that posterior +simulations can be obtained} + +\item{run_model}{\code{logical}. If \code{FALSE}, the model is not fitted but instead the function will +return the model file and the data / initial values that are needed to fit the \code{JAGS} model} + +\item{prior_simulation}{\code{logical}. If \code{TRUE}, no observations are fed to the model, and instead +simulations from prior distributions are returned} + +\item{return_model_data}{\code{logical}. If \code{TRUE}, the list of data that is needed to fit the +model is returned, along with the initial values for smooth and AR parameters, once the model is fitted. +This will be helpful if users wish to modify the model file to add +other stochastic elements that are not currently avaiable in \code{mvgam}. Default is \code{FALSE} to reduce +the size of the returned object, unless \code{run_model == FALSE}} + +\item{family}{\code{character}. Must be either 'nb' (for Negative Binomial), 'tw' (for Tweedie) or 'poisson'} + +\item{use_lv}{\code{logical}. If \code{TRUE}, use dynamic factors to estimate series' +latent trends in a reduced dimension format. If \code{FALSE}, estimate independent latent trends for each series} + +\item{n_lv}{\code{integer} the number of latent dynamic factors to use if \code{use_lv == TRUE}. +Cannot be \code{>n_series}. Defaults arbitrarily to \code{min(5, floor(n_series / 2))}} + +\item{trend_model}{\code{character} specifying the time series dynamics for the latent trend. Options are: +'None' (no latent trend component; i.e. the GAM component is all that contributes to the linear predictor, +and the observation process is the only source of error; similarly to what is estimated by \code{\link[mcgv]{gam}}), +'RW' (random walk with possible drift), +'AR1' (AR1 model with intercept), +'AR2' (AR2 model with intercept) or +'AR3' (AR3 model with intercept) or +'GP' (Gaussian process with squared exponential kernel; currently under development and +only available for estimation in stan)} + +\item{drift}{\code{logical} estimate a drift parameter in the latent trend components. Useful if the latent +trend is expected to broadly follow a non-zero slope. Note that if the latent trend is more or less stationary, +the drift parameter can become unidentifiable, especially if an intercept term is included in the GAM linear +predictor (which it is by default when calling \code{\link[mcgv]{jagam}}). Therefore this defaults to \code{FALSE}} + +\item{chains}{\code{integer} specifying the number of parallel chains for the model} + +\item{burnin}{\code{integer} specifying the number of iterations of the Markov chain to run during +adaptive mode to tune sampling algorithms} + +\item{n_samples}{\code{integer} specifying the number of iterations of the Markov chain to run for +sampling the posterior distribution} + +\item{thin}{Thinning interval for monitors} + +\item{parallel}{\code{logical} specifying whether multiple cores should be used for +for generating \code{JAGS} simulations in parallel. If \code{TRUE}, the number of cores to use will be +\code{min(c(chains, parallel::detectCores() - 1))}} + +\item{phi_prior}{\code{character} specifying (in JAGS syntax) the prior distribution for the drift terms/intercepts +in the latent trends} + +\item{ar_prior}{\code{character} specifying (in JAGS syntax) the prior distribution for the AR terms +in the latent trends} + +\item{r_prior}{\code{character} specifying (in JAGS syntax) the prior distribution for the Negative Binomial +overdispersion parameters. Note that this prior acts on the SQUARE ROOT of \code{r}, which is convenient +for inducing a complexity-penalising prior model whereby the observation process reduces to a Poisson +as the sampled parameter approaches \code{0}. Ignored if family is Poisson or Tweedie} + +\item{twdis_prior}{\code{character} specifying (in JAGS syntax) the prior distribution for the Tweedie +overdispersion parameters. Ignored if family is Poisson or Negative Binomial} + +\item{sigma_prior}{\code{character} specifying (in JAGS syntax) the prior distributions for the independent gaussian +variances used for the latent trends (ignored if \code{use_lv == TRUE})} + +\item{upper_bounds}{Optional \code{vector} of \code{integer} values specifying upper limits for each series. If supplied, +this generates a modified likelihood where values above the bound are given a likelihood of zero. Note this modification +is computationally expensive in \code{JAGS} but can lead to better estimates when true bounds exist. Default is to remove +truncation entirely (i.e. there is no upper bound for each series)} + +\item{use_stan}{Logical. If \code{TRUE} and if \code{rstan} is installed, the model will be compiled and sampled using +the Hamiltonian Monte Carlo with a call to \code{\link[rstan]{stan}}. Note that this functionality is still in development and +not all options that are available in \code{JAGS} can be used, including: no option for a Tweedie family and no option for +dynamic factor trends. However, as \code{rstan} can estimate Hilbert base approximate gaussian processes, which +are much more computationally tractable than full GPs for time series with \verb{>100} observations, estimation +in \code{rstan} can support latent GP trends while estimation in \code{JAGS} cannot} + +\item{jags_path}{Optional character vector specifying the path to the location of the \code{JAGS} executable (.exe) to use +for modelling if \code{use_stan == FALSE}. If missing, the path will be recovered from a call to \code{\link[runjags]{findjags}}} +} +\value{ +A \code{list} object of class \code{mvgam} containing model output, the text representation of the model file, +the mgcv model output (for easily generating simulations at +unsampled covariate values), Dunn-Smyth residuals for each series and key information needed +for other functions in the package +} +\description{ +This function estimates the posterior distribution for Generalised Additive Models (GAMs) that can include +smooth functions, specified in the GAM formula, as well as latent temporal processes, specified by trend_model. +There are currently two options for specifying the structures of the trends (either as latent +dynamic factors to capture trend dependencies among series in a reduced dimension format, or as independent trends) +} +\details{ +Dynamic GAMs are useful when we wish to predict future values from time series that show temporal dependence +but we do not want to rely on extrapolating from a smooth term (which can sometimes lead to unpredictable and unrealistic behaviours). +In addition, smooths can often try to wiggle excessively to capture any autocorrelation that is present in a time series, +which exacerbates the problem of forecasting ahead. As GAMs are very naturally viewed through a Bayesian lens, and we often +must model time series that show complex distributional features and missing data, parameters for \code{mvgam} models are estimated +in a Bayesian framework using Markov Chain Monte Carlo. +\cr +\cr +\emph{Priors}: A \code{\link[mcgv]{jagam}} model file is generated from \code{formula} and modified to include any latent +temporal processes. Prior distributions for most important model parameters can be altered by the user to inspect model +sensitivities to given priors. Note that latent trends are estimated on the log scale so choose tau, AR and phi priors +accordingly. However more control over the model specification can be accomplished by first using \code{mvgam} as a +baseline, then editing the returned model accordingly. The model file can be edited and run outside +of \code{mvgam} by setting \code{run_model = TRUE} and this is encouraged for complex modelling tasks +\cr +\cr +\emph{Random effects}: For any smooth terms using the random effect basis (\code{\link[mcgv]{smooth.construct.re.smooth.spec}}), +a non-centred parameterisation is automatically employed to avoid degeneracies that are common in hierarchical models. +Note however that centred versions may perform better for series that are particularly informative, so as with any +foray into Bayesian modelling, it is worth building an understanding of the model's assumptions and limitations by following a +principled workflow. +\cr +\cr +\emph{Overdispersion parameters}: When more than one series is included in \code{data_train} and an overdispersed +exponential family is used, by default the overdispersion parameters (\code{r} for Poisson, \code{twdis} for Tweedie) are +estimated independently for each series. Note that for Tweedie +models, estimating the power parameter \code{p} alongside the overdispersion parameter +\code{twdis} and the smooth coefficients is very challenging for noisy data, introducing some difficult posterior geometries. +The \code{p} parameter is therefore fixed at \code{1.5} (i.e. a so-called Geometric Poisson model). +\cr +\cr +\emph{Factor regularisation}: When using a dynamic factor model for the trends, factor precisions are given +regularized penalty priors to theoretically allow some factors to be dropped from the model by squeezing increasing +factors' variances to zero. This is done to help protect against selecting too many latent factors than are needed to +capture dependencies in the data, so it can often be advantageous to set `n_lv`` to a slightly larger number. However +larger numbers of factors do come with additional computational costs so these should be balanced as well. +\cr +\cr +\emph{Residuals}: For each series, randomized quantile (i.e. Dunn-Smyth) residuals are calculated for inspecting model diagnostics +If the fitted model is appropriate then Dunn-Smyth residuals will be standard normal in distribution and no +autocorrelation will be evident. When a particular observation is missing, the residual is calculated by comparing independent +draws from the model's posterior distribution +} +\seealso{ +\code{\link[rjags]{jags.model}}, \code{\link[mcgv]{jagam}}, \code{\link[mcgv]{gam}} +} +\author{ +Nicholas J Clark +}