diff --git a/NEWS.md b/NEWS.md index 7f22eb37..97ae71f3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ ## Improvements - Tidymodels speed up +- Added external regressor support for ARIMA by introducing a new model option of `arimax`, which uses engineered features in addition to any external regressors supplied. - Automated feature selection, refer to feature selection vignette for more details # finnts 0.3.0 diff --git a/R/models.R b/R/models.R index e7ccd835..04137f92 100644 --- a/R/models.R +++ b/R/models.R @@ -4,7 +4,7 @@ #' #' @return simple recipe #' @noRd -get_recipie_simple <- function(train_data) { +get_recipe_simple <- function(train_data) { recipes::recipe(Target ~ Date, data = train_data %>% dplyr::select(-Combo)) } @@ -14,7 +14,7 @@ get_recipie_simple <- function(train_data) { #' #' @return combo recipe #' @noRd -get_recipie_combo <- function(train_data) { +get_recipe_combo <- function(train_data) { recipes::recipe(Target ~ Date + Combo, data = train_data) } @@ -32,7 +32,8 @@ get_recipie_combo <- function(train_data) { #' @param pca pca #' @return configurable recipe #' @noRd -get_recipie_configurable <- function(train_data, + +get_recipe_configurable <- function(train_data, mutate_adj_half = FALSE, rm_date = "plain", step_nzv = "zv", @@ -41,7 +42,9 @@ get_recipie_configurable <- function(train_data, character_factor = FALSE, center_scale = FALSE, one_hot = FALSE, - pca = TRUE) { + pca = TRUE, + corr = FALSE, + lincomb = FALSE) { mutate_adj_half_fn <- function(df) { if (mutate_adj_half) { df %>% @@ -65,6 +68,15 @@ get_recipie_configurable <- function(train_data, "none" = df ) } + + corr_fn <- function(df) { + if (corr) { + df %>% + recipes::step_corr(recipes::all_numeric_predictors(), threshold = .5, id = "remove_correlated_vars") + } else { + df + } + } step_nz_fn <- function(df) { switch(step_nzv, @@ -88,7 +100,7 @@ get_recipie_configurable <- function(train_data, dummy_one_hot_fn <- function(df) { if (dummy_one_hot) { df %>% - 
recipes::step_dummy(recipes::all_nominal(), one_hot = one_hot, id = "step_dummy") + recipes::step_dummy(recipes::all_nominal_predictors(), one_hot = one_hot, id = "step_dummy") } else { df } @@ -122,6 +134,15 @@ get_recipie_configurable <- function(train_data, } } + rm_lincomb_fn <- function(df) { + if (lincomb) { + df %>% + recipes::step_lincomb(recipes::all_numeric_predictors(), id = "remove_linear_combs") + } else { + df + } + } + recipes::recipe(Target ~ ., data = train_data %>% dplyr::select(-Combo)) %>% mutate_adj_half_fn() %>% step_nz_fn() %>% @@ -130,7 +151,9 @@ get_recipie_configurable <- function(train_data, dummy_one_hot_fn() %>% character_factor_fn() %>% center_scale_fn() %>% - pca_fn() + pca_fn() %>% + rm_lincomb_fn() %>% + corr_fn() } @@ -168,7 +191,7 @@ get_fit_simple <- function(train_data, #' @param tune_results Tune results #' @param wflw_spec_tune Worflow Spec after tuning #' -#' @return simple recipie +#' @return simple recipe #' @noRd get_fit_wkflw_best <- function(train_data, tune_results, @@ -187,14 +210,14 @@ get_fit_wkflw_best <- function(train_data, #' #' @param train_data Training Data #' @param model_spec Model Spec -#' @param recipie_spec Recipe Spec +#' @param recipe_spec Recipe Spec #' #' @return simple recipe #' @noRd get_fit_wkflw_nocombo <- function(train_data, model_spec, - recipie_spec) { - get_workflow_simple(model_spec, recipie_spec) %>% + recipe_spec) { + get_workflow_simple(model_spec, recipe_spec) %>% generics::fit(train_data) } @@ -365,8 +388,8 @@ get_latin_hypercube_grid <- function(model_spec) { #' @noRd arima <- function(train_data, frequency) { - recipie_simple <- train_data %>% - get_recipie_simple() + recipe_simple <- train_data %>% + get_recipe_simple() model_spec_arima <- modeltime::arima_reg( seasonal_period = frequency @@ -375,13 +398,45 @@ arima <- function(train_data, wflw_spec <- get_workflow_simple( model_spec_arima, - recipie_simple + recipe_simple ) return(wflw_spec) } +#' ARIMAX Model +#' +#' @param 
train_data Training Data +#' @param frequency Frequency of Data +#' +#' @return Get the ARIMAX based model +#' @noRd +arimax <- function(train_data, + frequency, + pca) { + + recipe_spec_arimax <- train_data %>% + get_recipe_configurable( + step_nzv = "zv", + dummy_one_hot = TRUE, + corr = TRUE, + pca = pca, + lincomb = TRUE + ) + model_spec_arima <- modeltime::arima_reg( + seasonal_period = frequency + ) %>% + parsnip::set_engine("auto_arima") + + wflw_spec <- get_workflow_simple( + model_spec_arima, + recipe_spec_arimax + ) + + return(wflw_spec) +} + #' ARIMA Boost Model #' #' @param train_data Training Data @@ -394,7 +449,7 @@ arima_boost <- function(train_data, frequency, pca) { recipe_spec_arima_boost <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( step_nzv = "zv", norm_date_adj_year = TRUE, one_hot = TRUE, @@ -436,7 +491,7 @@ cubist <- function(train_data, pca) { if (model_type == "ensemble") { recipe_spec_cubist <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "nzv", one_hot = FALSE, @@ -444,7 +499,7 @@ cubist <- function(train_data, ) } else { recipe_spec_cubist <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "nzv", one_hot = FALSE, @@ -477,8 +532,8 @@ cubist <- function(train_data, #' @noRd croston <- function(train_data, frequency) { - recipie_simple <- train_data %>% - get_recipie_simple() + recipe_simple <- train_data %>% + get_recipe_simple() model_spec_croston <- modeltime::exp_smoothing( seasonal_period = frequency @@ -487,7 +542,7 @@ croston <- function(train_data, wflw_spec <- get_workflow_simple( model_spec_croston, - recipie_simple + recipe_simple ) return(wflw_spec) @@ -502,8 +557,8 @@ croston <- function(train_data, #' @noRd ets <- function(train_data, frequency) { - recipie_simple <- train_data %>% - get_recipie_simple() + recipe_simple <- train_data %>% + get_recipe_simple() model_spec_ets <- 
modeltime::exp_smoothing( error = "auto", @@ -515,7 +570,7 @@ ets <- function(train_data, wflw_spec <- get_workflow_simple( model_spec_ets, - recipie_simple + recipe_simple ) return(wflw_spec) @@ -534,7 +589,7 @@ glmnet <- function(train_data, pca) { if (model_type == "ensemble") { recipe_spec_glmnet <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "zv", one_hot = FALSE, @@ -543,7 +598,7 @@ glmnet <- function(train_data, ) } else { recipe_spec_glmnet <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "zv", one_hot = FALSE, @@ -579,7 +634,7 @@ mars <- function(train_data, model_type = "single", pca) { recipe_spec_mars <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", pca = pca ) @@ -610,7 +665,7 @@ mars <- function(train_data, meanf <- function(train_data, frequency) { recipe_spec_meanf <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_meanf <- modeltime::window_reg( window_size = round(frequency) @@ -641,7 +696,7 @@ nnetar <- function(train_data, horizon, frequency) { recipe_spec_nnetar <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_nnetar <- modeltime::nnetar_reg( seasonal_period = frequency, @@ -674,7 +729,7 @@ nnetar_xregs <- function(train_data, frequency, pca) { recipe_spec_nnetar <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( norm_date_adj_year = TRUE, one_hot = TRUE, pca = pca @@ -707,7 +762,7 @@ nnetar_xregs <- function(train_data, #' @noRd prophet <- function(train_data) { recipe_spec_prophet <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_prophet <- modeltime::prophet_reg( growth = tune::tune(), @@ -739,7 +794,7 @@ prophet <- function(train_data) { prophet_boost <- function(train_data, pca) { recipe_spec_prophet_boost <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( step_nzv = 
"zv", norm_date_adj_year = TRUE, one_hot = TRUE, @@ -777,7 +832,7 @@ prophet_boost <- function(train_data, prophet_xregs <- function(train_data, pca) { recipe_spec_prophet_xregs <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( step_nzv = "zv", dummy_one_hot = FALSE, character_factor = TRUE, @@ -814,7 +869,7 @@ prophet_xregs <- function(train_data, snaive <- function(train_data, frequency) { recipe_spec_snaive <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_snaive <- modeltime::naive_reg( seasonal_period = round(frequency) @@ -841,7 +896,7 @@ stlm_arima <- function(train_data, seasonal_period_stlm_arima <- seasonal_period recipe_spec_stlm_arima <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_stlm_arima <- modeltime::seasonal_reg( seasonal_period_1 = seasonal_period_stlm_arima[1], @@ -870,7 +925,7 @@ stlm_ets <- function(train_data, seasonal_period_stlm_ets <- seasonal_period recipe_spec_stlm_ets <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_stlm_ets <- modeltime::seasonal_reg( seasonal_period_1 = seasonal_period_stlm_ets[1], @@ -900,14 +955,14 @@ svm_poly <- function(train_data, pca) { if (model_type == "ensemble") { recipe_spec_svm <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", one_hot = FALSE, pca = pca ) } else { recipe_spec_svm <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", norm_date_adj_year = TRUE, one_hot = FALSE, @@ -945,14 +1000,14 @@ svm_rbf <- function(train_data, pca) { if (model_type == "ensemble") { recipe_spec_svm <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", one_hot = FALSE, pca = pca ) } else { recipe_spec_svm <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( norm_date_adj_year = TRUE, rm_date = "with_adj", one_hot = FALSE, @@ -988,7 +1043,7 @@ tbats <- function(train_data, 
seasonal_period_tbats <- seasonal_period recipe_spec_tbats <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_tbats <- modeltime::seasonal_reg( seasonal_period_1 = seasonal_period_tbats[1], @@ -1015,7 +1070,7 @@ tbats <- function(train_data, theta <- function(train_data, frequency) { recipe_spec_theta <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_theta <- modeltime::exp_smoothing( seasonal_period = frequency @@ -1045,7 +1100,7 @@ xgboost <- function(train_data, # create model recipe if (model_type == "ensemble") { recipe_spec_xgboost <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "zv", one_hot = TRUE, @@ -1053,7 +1108,7 @@ xgboost <- function(train_data, ) } else { recipe_spec_xgboost <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "zv", one_hot = TRUE, diff --git a/R/prep_models.R b/R/prep_models.R index c7c161fe..08369539 100644 --- a/R/prep_models.R +++ b/R/prep_models.R @@ -516,7 +516,7 @@ model_workflows <- function(run_info, # models to run ml_models <- c( - "arima", "arima-boost", "cubist", "croston", "ets", "glmnet", "mars", "meanf", + "arima", "arima-boost", "arimax", "cubist", "croston", "ets", "glmnet", "mars", "meanf", "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "snaive", "stlm-arima", "stlm-ets", "svm-poly", "svm-rbf", "tbats", "theta", "xgboost" ) diff --git a/R/run_info.R b/R/run_info.R index bf9acc5b..774d1f3f 100644 --- a/R/run_info.R +++ b/R/run_info.R @@ -80,7 +80,7 @@ set_run_info <- function(experiment_name = "finn_fcst", fs::dir_create(tempdir(), models_folder) fs::dir_create(tempdir(), forecasts_folder) fs::dir_create(tempdir(), logs_folder) - } else if (is.null(storage_object) & substr(path, 1, 6) == "/synfs") { + } else if (is.null(storage_object) & substr(path, 1, 6) == "/synfs") { temp_path <- stringr::str_replace(path, "/synfs/", "synfs:/") if 
(!dir.exists(fs::path(path, prep_data_folder) %>% as.character())) { diff --git a/vignettes/models-used-in-finnts.Rmd b/vignettes/models-used-in-finnts.Rmd index bfafb8db..a22132ef 100644 --- a/vignettes/models-used-in-finnts.Rmd +++ b/vignettes/models-used-in-finnts.Rmd @@ -7,7 +7,6 @@ vignette: > %\VignetteEncoding{UTF-8} --- - ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, @@ -23,6 +22,7 @@ reactable::reactable( data.frame() %>% rbind(data.frame(Model = "arima", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Regression model that is based on finding relationships between lagged values of the target variable you are trying to forecast.")) %>% rbind(data.frame(Model = "arima-boost", Type = "multivariate, local", Underlying.Package = "modeltime, forecast, xgboost", Description = "Arima model (refer to arima) that models the trend compoent of target variable, then uses xgboost model (refer to xgboost) to train on the remaining residuals.")) %>% + rbind(data.frame(Model = "arimax", Type = "multivariate, local", Underlying.Package = "modeltime, forecast", Description = "ARIMA model that incorporates external regressors and other engineered features.")) %>% rbind(data.frame(Model = "cubist", Type = "multivariate, local, global, ensemble", Underlying.Package = "rules", Description = "Hybrid of tree based and linear regression approach. Many decision trees are built, but regression coefficients are used at each terminal node instead of averging values in other tree based approaches.")) %>% rbind(data.frame(Model = "croston", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Useful for intermittent demand forecasting, aka when there are a lot of periods of zero values. Involves simple exponential smoothing on non-zero values of target variable and another application of seasonal exponential smoothing on periods between non-zero elements of the target variable. 
Refer to ets for more details on exponential smoothing.")) %>% rbind(data.frame(Model = "ets", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Forecasts produced using exponential smoothing methods are weighted averages of past observations, with the weights decaying exponentially as the observations get older. Exponential smoothing models try to forecast the components of a time series which can be broken down in to error, trend, and seasonality. These components can be forecasted separately then either added or multiplied together to get the final forecast output.")) %>% @@ -61,15 +61,15 @@ reactable::reactable( ### Univariate vs Multivariate Models -* **Univariate models** only use the date and target variable values when producing a forecast. They are mostly common on various statistical forecasting models like arima and ets. +- **Univariate models** only use the date and target variable values when producing a forecast. They are mostly common on various statistical forecasting models like arima and ets. -* **Multivariate models** leverage many features when producing a forecast, provided as input data before model training. These features can be automatically created using internal feature engineering techniques within the package, or provided as external regressors. Most common machine learning models today, like xgboost and cubist, are multivariate models. An important thing to note is that multivariate models provided in the package can leverage different recipes of feature engineering, that contain different techniques of creating features. These can be identified by seeing the letter "R" followed by a number like "1" or "2". More info can be found in the feature engineering vignette. +- **Multivariate models** leverage many features when producing a forecast, provided as input data before model training. 
These features can be automatically created using internal feature engineering techniques within the package, or provided as external regressors. Most common machine learning models today, like xgboost and cubist, are multivariate models. An important thing to note is that multivariate models provided in the package can leverage different recipes of feature engineering, that contain different techniques of creating features. These can be identified by seeing the letter "R" followed by a number like "1" or "2". More info can be found in the feature engineering vignette. ### Global vs Local Models -* **Global models** take the entire data set across all individual time series and model them all at once within a single model. Global models are only ran if the input data contains more than one individual time series. +- **Global models** take the entire data set across all individual time series and model them all at once within a single model. Global models are only run if the input data contains more than one individual time series. -* **Local models** take each individual time series from the input data and model them separately. +- **Local models** take each individual time series from the input data and model them separately. ### Ensemble Models