From 329b523edefa67ab9c370d37f30967b3cc598871 Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Mon, 26 Jun 2023 16:05:06 -0400 Subject: [PATCH 01/11] removed path transformation/use of ms helper --- R/run_info.R | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/R/run_info.R b/R/run_info.R index bf9acc5b..85687118 100644 --- a/R/run_info.R +++ b/R/run_info.R @@ -80,29 +80,7 @@ set_run_info <- function(experiment_name = "finn_fcst", fs::dir_create(tempdir(), models_folder) fs::dir_create(tempdir(), forecasts_folder) fs::dir_create(tempdir(), logs_folder) - } else if (is.null(storage_object) & substr(path, 1, 6) == "/synfs") { - temp_path <- stringr::str_replace(path, "/synfs/", "synfs:/") - - if (!dir.exists(fs::path(path, prep_data_folder) %>% as.character())) { - notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, prep_data_folder) %>% as.character()) - } - - if (!dir.exists(fs::path(path, prep_models_folder) %>% as.character())) { - notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, prep_models_folder) %>% as.character()) - } - - if (!dir.exists(fs::path(path, models_folder) %>% as.character())) { - notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, models_folder) %>% as.character()) - } - - if (!dir.exists(fs::path(path, forecasts_folder) %>% as.character())) { - notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, forecasts_folder) %>% as.character()) - } - - if (!dir.exists(fs::path(path, logs_folder) %>% as.character())) { - notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, logs_folder) %>% as.character()) - } - } else if (is.null(storage_object)) { + }else if (is.null(storage_object)) { fs::dir_create(path, prep_data_folder) fs::dir_create(path, prep_models_folder) fs::dir_create(path, models_folder) From 125bfc09e572b54048cf51119f30d046c5914ac5 Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Fri, 18 Aug 2023 12:22:29 -0700 Subject: [PATCH 02/11] Added arima-xreg --- R/models.R | 61 ++++++++++++++++++++++++++++++++++++++++++++++--- R/prep_models.R | 2 +- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/R/models.R b/R/models.R index 70cb80bc..757c3c97 100644 --- a/R/models.R +++ b/R/models.R @@ -40,7 +40,9 @@ get_recipie_configurable <- function(train_data, character_factor = FALSE, center_scale = FALSE, one_hot = FALSE, - pca = TRUE) { + pca = TRUE, + corr = FALSE, + lincomb = FALSE) { mutate_adj_half_fn <- function(df) { if (mutate_adj_half) { df %>% @@ -63,6 +65,15 @@ get_recipie_configurable <- function(train_data, "none" = df ) } + + corr_fn <- function(df) { + if (corr) { + df %>% + recipes::step_corr(recipes::all_numeric_predictors(), threshold = .5) + } else { + df + } + } step_nz_fn <- function(df) { switch(step_nzv, @@ -119,16 +130,27 @@ get_recipie_configurable <- function(train_data, df } } + + rm_lincomb_fn <- function(df) { + if (lincomb) { + df %>% + recipes::step_lincomb(recipes::all_numeric_predictors()) + } else { + df + } + } recipes::recipe(Target ~ ., data = train_data %>% dplyr::select(-Combo)) %>% mutate_adj_half_fn() %>% - step_nz_fn() %>% rm_date_fn() %>% norm_date_adj_year_fn() %>% dummy_one_hot_fn() %>% character_factor_fn() %>% center_scale_fn() %>% - pca_fn() + pca_fn() %>% + step_nz_fn() %>% + rm_lincomb_fn() %>% + corr_fn() } @@ -380,6 +402,39 @@ arima <- function(train_data, } +#' ARIMA Xregs Model +#' +#' @param train_data Training Data +#' @param frequency Frequency of Data +#' +#' @return Get the ARIMA based model +#' @noRd +arima_xregs <- function(train_data, + 
frequency, + pca) { + + recipie_simple <- train_data %>% # rename recipe + get_recipie_configurable( + step_nzv = "zv", + dummy_one_hot = FALSE, + corr = TRUE, + pca = FALSE, + lincomb = TRUE + ) %>% + step_select(recipes::all_numeric_predictors(),recipes::all_date_predictors(),recipes::all_datetime_predictors()) + model_spec_arima <- modeltime::arima_reg( + seasonal_period = frequency + ) %>% + parsnip::set_engine("auto_arima") + + wflw_spec <- get_workflow_simple( + model_spec_arima, + recipie_simple + ) + + return(wflw_spec) +} + #' ARIMA Boost Model #' #' @param train_data Training Data diff --git a/R/prep_models.R b/R/prep_models.R index dbaeb12d..097effce 100644 --- a/R/prep_models.R +++ b/R/prep_models.R @@ -505,7 +505,7 @@ model_workflows <- function(run_info, # models to run ml_models <- c( - "arima", "arima-boost", "cubist", "croston", "ets", "glmnet", "mars", "meanf", + "arima", "arima-boost", "arima-xregs", "cubist", "croston", "ets", "glmnet", "mars", "meanf", "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "snaive", "stlm-arima", "stlm-ets", "svm-poly", "svm-rbf", "tbats", "theta", "xgboost" ) From feb634ece8952bdc24824f3754e1b2a7d508fb45 Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Fri, 18 Aug 2023 12:22:52 -0700 Subject: [PATCH 03/11] debug --- R/prep_data.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/prep_data.R b/R/prep_data.R index d003865a..a14d540e 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -730,7 +730,8 @@ clean_outliers_missing_values <- function(df, clean_outliers, clean_missing_values, frequency_number, - external_regressors) { + external_regressors, + outlier_method = '') { correct_clean_func <- function(col) { if (clean_missing_values & sum(!is.na(col)) < 2) { col From 6f37646d4c85d3c165c70e0f6ebfac2456b2a3b5 Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Fri, 18 Aug 2023 12:23:03 -0700 Subject: [PATCH 04/11] debug --- R/train_models.R | 29 +++++++++++++++++------------ tests/testthat/test-models.R | 0 2 files changed, 17 insertions(+), 12 deletions(-) create mode 100644 tests/testthat/test-models.R diff --git a/R/train_models.R b/R/train_models.R index 130f587b..6d284e9a 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -321,7 +321,6 @@ train_models <- function(run_info, dplyr::group_split(dplyr::row_number(), .keep = FALSE), .combine = "rbind", .packages = inner_packages, - .errorhandling = "remove", .verbose = FALSE, .inorder = FALSE, .multicombine = TRUE, @@ -407,16 +406,22 @@ train_models <- function(run_info, # fit model set.seed(seed) - - if (nrow(hyperparameters) > 0) { - model_fit <- workflow_final %>% - tune::finalize_workflow(parameters = hyperparameters) %>% - generics::fit(data = training) - } else { - model_fit <- workflow_final %>% - generics::fit(data = training) + tryCatch( + { + if (nrow(hyperparameters) > 0) { + model_fit <- workflow_final %>% + tune::finalize_workflow(parameters = hyperparameters) %>% + generics::fit(data = training) + } else { + model_fit <- workflow_final %>% + generics::fit(data = training) + + } + + }, error = function(err) { + stop("ERROR:", err) } - + ) # create prediction model_prediction <- testing %>% dplyr::bind_cols( @@ -436,7 +441,7 @@ train_models <- function(run_info, Hyperparameter_ID = param_combo, Prediction = list(model_prediction) ) - + browser() return(final_tbl) } %>% base::suppressPackageStartupMessages() @@ -451,7 +456,7 @@ train_models <- function(run_info, dplyr::arrange(RMSE) %>% dplyr::slice(1) %>% dplyr::ungroup() - + model_tune_tbl 
<- initial_tune_tbl %>% dplyr::select(Model_Name, Model_Type, Recipe_ID, Hyperparameter_ID, Train_Test_ID, Prediction) %>% dplyr::right_join(best_param, by = c("Model_Name", "Model_Type", "Recipe_ID", "Hyperparameter_ID")) %>% diff --git a/tests/testthat/test-models.R b/tests/testthat/test-models.R new file mode 100644 index 00000000..e69de29b From cff4c01fa00d4d592a54e886646576d1ee556cbc Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Fri, 18 Aug 2023 12:25:07 -0700 Subject: [PATCH 05/11] Renaming recipie to recipe for consistency --- R/models.R | 78 +++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/R/models.R b/R/models.R index 70cb80bc..751f8449 100644 --- a/R/models.R +++ b/R/models.R @@ -4,7 +4,7 @@ #' #' @return simple recipe #' @noRd -get_recipie_simple <- function(train_data) { +get_recipe_simple <- function(train_data) { recipes::recipe(Target ~ Date, data = train_data %>% dplyr::select(-Combo)) } @@ -14,7 +14,7 @@ get_recipie_simple <- function(train_data) { #' #' @return combo recipe #' @noRd -get_recipie_combo <- function(train_data) { +get_recipe_combo <- function(train_data) { recipes::recipe(Target ~ Date + Combo, data = train_data) } @@ -31,7 +31,7 @@ get_recipie_combo <- function(train_data) { #' @param one_hot True or False #' @return configurable recipe #' @noRd -get_recipie_configurable <- function(train_data, +get_recipe_configurable <- function(train_data, mutate_adj_half = FALSE, # todo Fix this. Should be true rm_date = "plain", step_nzv = "zv", @@ -166,7 +166,7 @@ get_fit_simple <- function(train_data, #' @param tune_results Tune results #' @param wflw_spec_tune Worflow Spec after tuning #' -#' @return simple recipie +#' @return simple recipe #' @noRd get_fit_wkflw_best <- function(train_data, tune_results, @@ -185,14 +185,14 @@ get_fit_wkflw_best <- function(train_data, #' #' @param train_data Training Data #' @param model_spec Model Spec -#' @param recipie_spec Recipe Spec +#' @param recipe_spec Recipe Spec #' #' @return simple recipe #' @noRd get_fit_wkflw_nocombo <- function(train_data, model_spec, - recipie_spec) { - get_workflow_simple(model_spec, recipie_spec) %>% + recipe_spec) { + get_workflow_simple(model_spec, recipe_spec) %>% generics::fit(train_data) } @@ -363,8 +363,8 @@ get_latin_hypercube_grid <- function(model_spec) { #' @noRd arima <- function(train_data, frequency) { - recipie_simple <- train_data %>% - get_recipie_simple() + recipe_simple <- train_data %>% + get_recipe_simple() model_spec_arima <- modeltime::arima_reg( seasonal_period = frequency @@ -373,7 +373,7 @@ arima <- function(train_data, wflw_spec <- get_workflow_simple( model_spec_arima, - recipie_simple + recipe_simple ) return(wflw_spec) @@ -392,7 +392,7 @@ arima_boost <- function(train_data, frequency, pca) { recipe_spec_arima_boost <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( step_nzv = "zv", norm_date_adj_year = TRUE, one_hot = TRUE, @@ -434,7 +434,7 @@ cubist <- function(train_data, pca) { if (model_type == "ensemble") { recipe_spec_cubist <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "nzv", one_hot = FALSE, @@ -442,7 +442,7 @@ cubist <- function(train_data, ) } else { recipe_spec_cubist <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "nzv", one_hot = FALSE, @@ -475,8 +475,8 @@ cubist <- function(train_data, #' @noRd croston <- function(train_data, frequency) { - 
recipie_simple <- train_data %>% - get_recipie_simple() + recipe_simple <- train_data %>% + get_recipe_simple() model_spec_croston <- modeltime::exp_smoothing( seasonal_period = frequency @@ -485,7 +485,7 @@ croston <- function(train_data, wflw_spec <- get_workflow_simple( model_spec_croston, - recipie_simple + recipe_simple ) return(wflw_spec) @@ -500,8 +500,8 @@ croston <- function(train_data, #' @noRd ets <- function(train_data, frequency) { - recipie_simple <- train_data %>% - get_recipie_simple() + recipe_simple <- train_data %>% + get_recipe_simple() model_spec_ets <- modeltime::exp_smoothing( error = "auto", @@ -513,7 +513,7 @@ ets <- function(train_data, wflw_spec <- get_workflow_simple( model_spec_ets, - recipie_simple + recipe_simple ) return(wflw_spec) @@ -532,7 +532,7 @@ glmnet <- function(train_data, pca) { if (model_type == "ensemble") { recipe_spec_glmnet <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "zv", one_hot = FALSE, @@ -541,7 +541,7 @@ glmnet <- function(train_data, ) } else { recipe_spec_glmnet <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "zv", one_hot = FALSE, @@ -577,7 +577,7 @@ mars <- function(train_data, model_type = "single", pca) { recipe_spec_mars <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", pca = pca ) @@ -608,7 +608,7 @@ mars <- function(train_data, meanf <- function(train_data, frequency) { recipe_spec_meanf <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_meanf <- modeltime::window_reg( window_size = frequency @@ -639,7 +639,7 @@ nnetar <- function(train_data, horizon, frequency) { recipe_spec_nnetar <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_nnetar <- modeltime::nnetar_reg( seasonal_period = frequency, @@ -672,7 +672,7 @@ nnetar_xregs <- function(train_data, frequency, pca) { recipe_spec_nnetar <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( norm_date_adj_year = TRUE, one_hot = TRUE, pca = pca @@ -705,7 +705,7 @@ nnetar_xregs <- function(train_data, #' @noRd prophet <- function(train_data) { recipe_spec_prophet <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_prophet <- modeltime::prophet_reg( growth = tune::tune(), @@ -737,7 +737,7 @@ prophet <- function(train_data) { prophet_boost <- function(train_data, pca) { recipe_spec_prophet_boost <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( step_nzv = "zv", norm_date_adj_year = TRUE, one_hot = TRUE, @@ -775,7 +775,7 @@ prophet_boost <- function(train_data, prophet_xregs <- function(train_data, pca) { recipe_spec_prophet_xregs <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( step_nzv = "zv", dummy_one_hot = FALSE, character_factor = TRUE, @@ -812,7 +812,7 @@ prophet_xregs <- function(train_data, snaive <- function(train_data, frequency) { recipe_spec_snaive <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_snaive <- modeltime::naive_reg( seasonal_period = frequency @@ -839,7 +839,7 @@ stlm_arima <- function(train_data, seasonal_period_stlm_arima <- seasonal_period recipe_spec_stlm_arima <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_stlm_arima <- modeltime::seasonal_reg( seasonal_period_1 = seasonal_period_stlm_arima[1], @@ -868,7 +868,7 @@ stlm_ets <- function(train_data, seasonal_period_stlm_ets <- seasonal_period recipe_spec_stlm_ets <- 
train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_stlm_ets <- modeltime::seasonal_reg( seasonal_period_1 = seasonal_period_stlm_ets[1], @@ -898,14 +898,14 @@ svm_poly <- function(train_data, pca) { if (model_type == "ensemble") { recipe_spec_svm <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", one_hot = FALSE, pca = pca ) } else { recipe_spec_svm <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", norm_date_adj_year = TRUE, one_hot = FALSE, @@ -943,14 +943,14 @@ svm_rbf <- function(train_data, pca) { if (model_type == "ensemble") { recipe_spec_svm <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", one_hot = FALSE, pca = pca ) } else { recipe_spec_svm <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( norm_date_adj_year = TRUE, rm_date = "with_adj", one_hot = FALSE, @@ -986,7 +986,7 @@ tbats <- function(train_data, seasonal_period_tbats <- seasonal_period recipe_spec_tbats <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_tbats <- modeltime::seasonal_reg( seasonal_period_1 = seasonal_period_tbats[1], @@ -1013,7 +1013,7 @@ tbats <- function(train_data, theta <- function(train_data, frequency) { recipe_spec_theta <- train_data %>% - get_recipie_simple() + get_recipe_simple() model_spec_theta <- modeltime::exp_smoothing( seasonal_period = frequency @@ -1043,7 +1043,7 @@ xgboost <- function(train_data, # create model recipe if (model_type == "ensemble") { recipe_spec_xgboost <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "zv", one_hot = TRUE, @@ -1051,7 +1051,7 @@ xgboost <- function(train_data, ) } else { recipe_spec_xgboost <- train_data %>% - get_recipie_configurable( + get_recipe_configurable( rm_date = "with_adj", step_nzv = "zv", one_hot = TRUE, From cf3f55233e427dbe6e374a168ef504df75b04f9f Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Wed, 23 Aug 2023 09:12:04 -0700 Subject: [PATCH 06/11] updated model recipe --- R/models.R | 7 +++---- R/train_models.R | 10 +--------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/R/models.R b/R/models.R index 757c3c97..302d1e5a 100644 --- a/R/models.R +++ b/R/models.R @@ -97,7 +97,7 @@ get_recipie_configurable <- function(train_data, dummy_one_hot_fn <- function(df) { if (dummy_one_hot) { df %>% - recipes::step_dummy(recipes::all_nominal(), one_hot = one_hot) + recipes::step_dummy(recipes::all_nominal_predictors(), one_hot = one_hot) } else { df } @@ -416,12 +416,11 @@ arima_xregs <- function(train_data, recipie_simple <- train_data %>% # rename recipe get_recipie_configurable( step_nzv = "zv", - dummy_one_hot = FALSE, + dummy_one_hot = TRUE, corr = TRUE, pca = FALSE, lincomb = TRUE - ) %>% - step_select(recipes::all_numeric_predictors(),recipes::all_date_predictors(),recipes::all_datetime_predictors()) + ) model_spec_arima <- modeltime::arima_reg( seasonal_period = frequency ) %>% diff --git a/R/train_models.R b/R/train_models.R index 6d284e9a..fe7cde71 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -247,7 +247,7 @@ train_models <- function(run_info, x = current_combo_list_final, .combine = "rbind", .packages = packages, - .errorhandling = "stop", + .errorhandling = "remove", .verbose = FALSE, .inorder = FALSE, .multicombine = TRUE, @@ -406,8 +406,6 @@ train_models <- function(run_info, # fit model set.seed(seed) - tryCatch( - { if (nrow(hyperparameters) > 0) { model_fit 
<- workflow_final %>% tune::finalize_workflow(parameters = hyperparameters) %>% @@ -417,11 +415,6 @@ train_models <- function(run_info, generics::fit(data = training) } - - }, error = function(err) { - stop("ERROR:", err) - } - ) # create prediction model_prediction <- testing %>% dplyr::bind_cols( @@ -441,7 +434,6 @@ train_models <- function(run_info, Hyperparameter_ID = param_combo, Prediction = list(model_prediction) ) - browser() return(final_tbl) } %>% base::suppressPackageStartupMessages() From e6e1aba9f6bc5b4ecb5c0c18aebeac0de0eeeefc Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Wed, 23 Aug 2023 13:20:42 -0700 Subject: [PATCH 07/11] rename arima-xregs to arimax --- R/models.R | 8 ++++---- R/prep_models.R | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/models.R b/R/models.R index 4ba3aeb6..298603e8 100644 --- a/R/models.R +++ b/R/models.R @@ -405,19 +405,19 @@ arima <- function(train_data, } -#' ARIMA Xregs Model +#' ARIMAX Model #' #' @param train_data Training Data #' @param frequency Frequency of Data #' -#' @return Get the ARIMA based model +#' @return Get the ARIMAX based model #' @noRd -arima_xregs <- function(train_data, +arimax <- function(train_data, frequency, pca) { recipie_simple <- train_data %>% # rename recipe - get_recipie_configurable( + get_recipe_configurable( step_nzv = "zv", dummy_one_hot = TRUE, corr = TRUE, diff --git a/R/prep_models.R b/R/prep_models.R index f9940b84..08369539 100644 --- a/R/prep_models.R +++ b/R/prep_models.R @@ -516,7 +516,7 @@ model_workflows <- function(run_info, # models to run ml_models <- c( - "arima", "arima-boost", "arima-xregs", "cubist", "croston", "ets", "glmnet", "mars", "meanf", + "arima", "arima-boost", "arimax", "cubist", "croston", "ets", "glmnet", "mars", "meanf", "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "snaive", "stlm-arima", "stlm-ets", "svm-poly", "svm-rbf", "tbats", "theta", "xgboost" ) From d972a31c063cd6ecc511a202adcaef97ede473db Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Wed, 23 Aug 2023 16:11:49 -0700 Subject: [PATCH 08/11] undo: remove path transformation/use of ms helper --- R/run_info.R | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/R/run_info.R b/R/run_info.R index 85687118..774d1f3f 100644 --- a/R/run_info.R +++ b/R/run_info.R @@ -80,7 +80,29 @@ set_run_info <- function(experiment_name = "finn_fcst", fs::dir_create(tempdir(), models_folder) fs::dir_create(tempdir(), forecasts_folder) fs::dir_create(tempdir(), logs_folder) - }else if (is.null(storage_object)) { + } else if (is.null(storage_object) & substr(path, 1, 6) == "/synfs") { + temp_path <- stringr::str_replace(path, "/synfs/", "synfs:/") + + if (!dir.exists(fs::path(path, prep_data_folder) %>% as.character())) { + notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, prep_data_folder) %>% as.character()) + } + + if (!dir.exists(fs::path(path, prep_models_folder) %>% as.character())) { + notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, prep_models_folder) %>% as.character()) + } + + if (!dir.exists(fs::path(path, models_folder) %>% as.character())) { + notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, models_folder) %>% as.character()) + } + + if (!dir.exists(fs::path(path, forecasts_folder) %>% as.character())) { + notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, forecasts_folder) %>% as.character()) + } + + if (!dir.exists(fs::path(path, logs_folder) %>% as.character())) { + 
notebookutils::mssparkutils.fs.mkdirs(fs::path(temp_path, logs_folder) %>% as.character()) + } + } else if (is.null(storage_object)) { fs::dir_create(path, prep_data_folder) fs::dir_create(path, prep_models_folder) fs::dir_create(path, models_folder) From 20bceadfb3f2f617a11342c3b84cc2f9dd062946 Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Tue, 29 Aug 2023 17:38:46 -0700 Subject: [PATCH 09/11] Fixed recipe ordering, and formatting --- R/models.R | 12 ++++++------ R/prep_data.R | 3 +-- R/train_models.R | 3 ++- vignettes/models-used-in-finnts.Rmd | 10 +++++----- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R/models.R b/R/models.R index 298603e8..04137f92 100644 --- a/R/models.R +++ b/R/models.R @@ -72,7 +72,7 @@ get_recipe_configurable <- function(train_data, corr_fn <- function(df) { if (corr) { df %>% - recipes::step_corr(recipes::all_numeric_predictors(), threshold = .5) + recipes::step_corr(recipes::all_numeric_predictors(), threshold = .5, id = "remove_correlated_vars") } else { df } @@ -137,7 +137,7 @@ get_recipe_configurable <- function(train_data, rm_lincomb_fn <- function(df) { if (lincomb) { df %>% - recipes::step_lincomb(recipes::all_numeric_predictors()) + recipes::step_lincomb(recipes::all_numeric_predictors(), id = "remove_linear_combs") } else { df } @@ -145,13 +145,13 @@ get_recipe_configurable <- function(train_data, recipes::recipe(Target ~ ., data = train_data %>% dplyr::select(-Combo)) %>% mutate_adj_half_fn() %>% + step_nz_fn() %>% rm_date_fn() %>% norm_date_adj_year_fn() %>% dummy_one_hot_fn() %>% character_factor_fn() %>% center_scale_fn() %>% pca_fn() %>% - step_nz_fn() %>% rm_lincomb_fn() %>% corr_fn() } @@ -416,12 +416,12 @@ arimax <- function(train_data, frequency, pca) { - recipie_simple <- train_data %>% # rename recipe + recipe_spec_arimax <- train_data %>% get_recipe_configurable( step_nzv = "zv", dummy_one_hot = TRUE, corr = TRUE, - pca = FALSE, + pca = pca, lincomb = TRUE ) model_spec_arima <- modeltime::arima_reg( @@ -431,7 +431,7 @@ arimax <- function(train_data, wflw_spec <- get_workflow_simple( model_spec_arima, - recipie_simple + recipe_spec_arimax ) return(wflw_spec) diff --git a/R/prep_data.R b/R/prep_data.R index 74f6578a..2f331e41 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -775,8 +775,7 @@ clean_outliers_missing_values <- function(df, clean_outliers, clean_missing_values, frequency_number, - external_regressors, - outlier_method = '') { + external_regressors) { correct_clean_func <- function(col) { if (clean_missing_values & sum(!is.na(col)) < 2) { col diff --git a/R/train_models.R b/R/train_models.R index cb6d4542..a0215fb3 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -244,7 +244,7 @@ train_models <- function(run_info, x = current_combo_list_final, .combine = "rbind", .packages = packages, - .errorhandling = "remove", + .errorhandling = "stop", .verbose = FALSE, .inorder = FALSE, .multicombine = TRUE, @@ -288,6 +288,7 @@ train_models <- function(run_info, dplyr::select(Model_Name, Model_Recipe) %>% dplyr::group_split(dplyr::row_number(), .keep = FALSE), .combine = "rbind", + .errorhandling = "remove", .verbose = FALSE, .inorder = FALSE, .multicombine = TRUE, diff --git a/vignettes/models-used-in-finnts.Rmd b/vignettes/models-used-in-finnts.Rmd index bfafb8db..fe38e52b 100644 --- a/vignettes/models-used-in-finnts.Rmd +++ b/vignettes/models-used-in-finnts.Rmd @@ -7,7 +7,6 @@ vignette: > %\VignetteEncoding{UTF-8} --- - ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, @@ -23,6 +22,7 @@ 
reactable::reactable( data.frame() %>% rbind(data.frame(Model = "arima", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Regression model that is based on finding relationships between lagged values of the target variable you are trying to forecast.")) %>% rbind(data.frame(Model = "arima-boost", Type = "multivariate, local", Underlying.Package = "modeltime, forecast, xgboost", Description = "Arima model (refer to arima) that models the trend compoent of target variable, then uses xgboost model (refer to xgboost) to train on the remaining residuals.")) %>% + rbind(data.frame(Model = "arimax", Type = "multivariate, local", Underlying.Package = "modeltime, forecast", Description = "Prophet model that incorporates external regressors and other engineered features.")) %>% rbind(data.frame(Model = "cubist", Type = "multivariate, local, global, ensemble", Underlying.Package = "rules", Description = "Hybrid of tree based and linear regression approach. Many decision trees are built, but regression coefficients are used at each terminal node instead of averging values in other tree based approaches.")) %>% rbind(data.frame(Model = "croston", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Useful for intermittent demand forecasting, aka when there are a lot of periods of zero values. Involves simple exponential smoothing on non-zero values of target variable and another application of seasonal exponential smoothing on periods between non-zero elements of the target variable. Refer to ets for more details on exponential smoothing.")) %>% rbind(data.frame(Model = "ets", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Forecasts produced using exponential smoothing methods are weighted averages of past observations, with the weights decaying exponentially as the observations get older. Exponential smoothing models try to forecast the components of a time series which can be broken down in to error, trend, and seasonality. These components can be forecasted separately then either added or multiplied together to get the final forecast output.")) %>% @@ -61,15 +61,15 @@ reactable::reactable( ### Univariate vs Multivariate Models -* **Univariate models** only use the date and target variable values when producing a forecast. They are mostly common on various statistical forecasting models like arima and ets. +- **Univariate models** only use the date and target variable values when producing a forecast. They are mostly common on various statistical forecasting models like arima and ets. -* **Multivariate models** leverage many features when producing a forecast, provided as input data before model training. These features can be automatically created using internal feature engineering techniques within the package, or provided as external regressors. Most common machine learning models today, like xgboost and cubist, are multivariate models. An important thing to note is that multivariate models provided in the package can leverage different recipes of feature engineering, that contain different techniques of creating features. These can be identified by seeing the letter "R" followed by a number like "1" or "2". More info can be found in the feature engineering vignette. +- **Multivariate models** leverage many features when producing a forecast, provided as input data before model training. 
These features can be automatically created using internal feature engineering techniques within the package, or provided as external regressors. Most common machine learning models today, like xgboost and cubist, are multivariate models. An important thing to note is that multivariate models provided in the package can leverage different recipes of feature engineering, that contain different techniques of creating features. These can be identified by seeing the letter "R" followed by a number like "1" or "2". More info can be found in the feature engineering vignette. ### Global vs Local Models -* **Global models** take the entire data set across all individual time series and model them all at once within a single model. Global models are only ran if the input data contains more than one individual time series. +- **Global models** take the entire data set across all individual time series and model them all at once within a single model. Global models are only ran if the input data contains more than one individual time series. -* **Local models** take each individual time series from the input data and model them separately. +- **Local models** take each individual time series from the input data and model them separately. ### Ensemble Models From 57f742f35d73ce753c19318be8d4e6d739403c8e Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Wed, 30 Aug 2023 12:56:14 -0700 Subject: [PATCH 10/11] Corrected typo --- vignettes/models-used-in-finnts.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/models-used-in-finnts.Rmd b/vignettes/models-used-in-finnts.Rmd index fe38e52b..a22132ef 100644 --- a/vignettes/models-used-in-finnts.Rmd +++ b/vignettes/models-used-in-finnts.Rmd @@ -22,7 +22,7 @@ reactable::reactable( data.frame() %>% rbind(data.frame(Model = "arima", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Regression model that is based on finding relationships between lagged values of the target variable you are trying to forecast.")) %>% rbind(data.frame(Model = "arima-boost", Type = "multivariate, local", Underlying.Package = "modeltime, forecast, xgboost", Description = "Arima model (refer to arima) that models the trend compoent of target variable, then uses xgboost model (refer to xgboost) to train on the remaining residuals.")) %>% - rbind(data.frame(Model = "arimax", Type = "multivariate, local", Underlying.Package = "modeltime, forecast", Description = "Prophet model that incorporates external regressors and other engineered features.")) %>% + rbind(data.frame(Model = "arimax", Type = "multivariate, local", Underlying.Package = "modeltime, forecast", Description = "ARIMA model that incorporates external regressors and other engineered features.")) %>% rbind(data.frame(Model = "cubist", Type = "multivariate, local, global, ensemble", Underlying.Package = "rules", Description = "Hybrid of tree based and linear regression approach. Many decision trees are built, but regression coefficients are used at each terminal node instead of averging values in other tree based approaches.")) %>% rbind(data.frame(Model = "croston", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Useful for intermittent demand forecasting, aka when there are a lot of periods of zero values. Involves simple exponential smoothing on non-zero values of target variable and another application of seasonal exponential smoothing on periods between non-zero elements of the target variable. 
Refer to ets for more details on exponential smoothing.")) %>% rbind(data.frame(Model = "ets", Type = "univariate, local", Underlying.Package = "modeltime, forecast", Description = "Forecasts produced using exponential smoothing methods are weighted averages of past observations, with the weights decaying exponentially as the observations get older. Exponential smoothing models try to forecast the components of a time series which can be broken down in to error, trend, and seasonality. These components can be forecasted separately then either added or multiplied together to get the final forecast output.")) %>% From 41e5228d1ba71b324a60c99dfb7d50d18b23e875 Mon Sep 17 00:00:00 2001 From: Taichi Kato Date: Wed, 30 Aug 2023 12:58:26 -0700 Subject: [PATCH 11/11] Included news message about ARIMAX --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 2973340a..73bb0fd3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ ## Improvements - Tidymodels speed up +- Added external regressor support for ARIMA by introducing a new model option of `arimax`, which uses engineered features in addition to any external regressors supplied. # finnts 0.3.0
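
A rough end-to-end sketch of what the new `arimax` option wires together (illustrative only, not part of any patch above): it approximates the `arimax()` helper introduced in patches 02, 07, and 09 using only public `recipes`, `modeltime`, `parsnip`, and `workflows` calls, and omits the date-adjustment, centering/scaling, and PCA branches of `get_recipe_configurable()`. The function name `build_arimax_workflow` is hypothetical; the `Target`/`Date`/`Combo` column conventions are taken from `models.R`.

```r
library(dplyr)
library(recipes)
library(modeltime)
library(parsnip)
library(workflows)

# Hypothetical helper mirroring arimax(): pair an auto_arima spec with a recipe
# that drops zero-variance, linearly dependent, and highly correlated predictors
# before fitting, so external regressors and engineered features can be passed
# safely to the ARIMA engine.
build_arimax_workflow <- function(train_data, frequency) {
  recipe_spec <- recipe(Target ~ ., data = train_data %>% select(-Combo)) %>%
    step_zv(all_predictors()) %>%                            # step_nzv = "zv"
    step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% # dummy_one_hot = TRUE
    step_lincomb(all_numeric_predictors()) %>%               # lincomb = TRUE
    step_corr(all_numeric_predictors(), threshold = .5)      # corr = TRUE

  model_spec <- arima_reg(seasonal_period = frequency) %>%
    set_engine("auto_arima")

  workflow() %>%
    add_model(model_spec) %>%
    add_recipe(recipe_spec)
}

# Usage sketch, assuming `training` holds Date, Target, Combo, and any
# engineered regressor columns for a single combo:
# wflw <- build_arimax_workflow(training, frequency = 12)
# fit  <- generics::fit(wflw, data = training)
```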