From 0e616839b4941dc197d0a8744e64f3d552c3bc16 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 29 Aug 2023 08:48:24 -0700 Subject: [PATCH 01/11] new box-cox and differencing updates, new list_models() function --- DESCRIPTION | 3 +- NAMESPACE | 1 + NEWS.md | 2 + R/ensemble_models.R | 4 +- R/feature_selection.R | 34 +++++-- R/models.R | 78 +++++++++++++++ R/prep_data.R | 204 ++++++++++++++++++++++++++++++++++++++- R/prep_models.R | 18 +--- R/train_models.R | 215 ++++++++++++++++++++++++++++++++++++++++-- R/utility.R | 2 +- man/list_models.Rd | 14 +++ man/prep_data.Rd | 6 ++ 12 files changed, 540 insertions(+), 41 deletions(-) create mode 100644 man/list_models.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 577c9e82..51ab618f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.3.0.9002 +Version: 0.3.0.9003 Authors@R: c(person(given = "Mike", family = "Tokic", @@ -33,6 +33,7 @@ Imports: doParallel, dplyr, earth, + feasts, foreach, fs, generics, diff --git a/NAMESPACE b/NAMESPACE index 1d964f84..616c03f5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(get_prepped_data) export(get_prepped_models) export(get_run_info) export(get_trained_models) +export(list_models) export(prep_data) export(prep_models) export(set_run_info) diff --git a/NEWS.md b/NEWS.md index c3b806b8..efcd12f1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ - Tidymodels speed up - Automated feature selection, refer to feature selection vignette for more details - Error handling in hierarchical forecast reconciliation +- Box-cox and differencing transformations +- Added new function, `list_models()`, that lists available models in the package ## Bug Fixes diff --git a/R/ensemble_models.R b/R/ensemble_models.R index 94d88c06..1af8408d 100644 --- a/R/ensemble_models.R +++ b/R/ensemble_models.R @@ -140,14 +140,14 @@ ensemble_models <- function(run_info, } # get ensemble models to run - ensemble_model_list <- c("cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost") + ensemble_model_list <- list_ensemble_models() if (is.na(models_to_run) & is.na(models_not_to_run)) { # do nothing, using existing ml_models list } else if (is.na(models_to_run) & !is.na(models_not_to_run)) { ensemble_model_list <- setdiff(ensemble_model_list, stringr::str_split(models_not_to_run, "---")[[1]]) } else { - ensemble_model_list <- ensemble_model_list[c("cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost") %in% stringr::str_split(models_to_run, "---")[[1]]] + ensemble_model_list <- ensemble_model_list[list_ensemble_models() %in% stringr::str_split(models_to_run, "---")[[1]]] } # parallel run info diff --git a/R/feature_selection.R b/R/feature_selection.R index 987359fb..92194416 100644 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -131,16 +131,30 @@ select_features <- function(input_data, dplyr::select(Feature, Vote, Auto_Accept) # cubist feature importance - vip_cubist_results <- vip_cubist_fn( - input_data, - seed - ) %>% - dplyr::rename(Feature = Variable) %>% - dplyr::mutate( - Vote = 1, - Auto_Accept = 0 - ) %>% - dplyr::select(Feature, Vote, Auto_Accept) + vip_cubist_results <- tryCatch( + { + vip_cubist_fn( + input_data, + seed + ) %>% + dplyr::rename(Feature = Variable) %>% + dplyr::mutate( + Vote = 1, + Auto_Accept = 0 + ) %>% + dplyr::select(Feature, Vote, Auto_Accept) + }, + warning = function(w) { + # do nothing + }, + error = function(e) { + tibble::tibble() + } + ) + + if (nrow(vip_cubist_results) == 0) { + votes_needed 
<- votes_needed-1 + } # lasso regression feature importance vip_lm_initial <- vip_lm_fn( diff --git a/R/models.R b/R/models.R index e7ccd835..15ca66ac 100644 --- a/R/models.R +++ b/R/models.R @@ -1,3 +1,81 @@ +#' List all available models +#' +#' @return list of models +#' @export +list_models <- function() { + list <- c( + "arima", "arima-boost", "cubist", "croston", "ets", "glmnet", "mars", "meanf", + "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "snaive", + "stlm-arima", "stlm-ets", "svm-poly", "svm-rbf", "tbats", "theta", "xgboost" + ) + + return(list) +} + +#' List models with hyperparameters +#' +#' +#' @return list of models +#' @noRd +list_hyperparmater_models <- function() { + list <- c( + "arima-boost", "cubist", "glmnet", "mars", + "nnetar", "nnetar-xregs", "prophet", "prophet-boost", + "prophet-xregs", "svm-poly", "svm-rbf", "xgboost" + ) + + return(list) +} + +#' List ensemble models +#' +#' +#' @return list of models +#' @noRd +list_ensemble_models <- function() { + list <- c( + "cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost" + ) + + return(list) +} + +#' List models capable with R2 recipe +#' +#' +#' @return list of models +#' @noRd +list_r2_models <- function() { + list <- c("cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost") + + return(list) +} + +#' List global models +#' +#' +#' @return list of models +#' @noRd +list_global_models <- function() { + list <- c("cubist", "glmnet", "mars", "svm-poly", "svm-rbf", "xgboost") + + return(list) +} + +#' List multivariate models +#' +#' +#' @return list of models +#' @noRd +list_multivariate_models <- function() { + list <- c( + list_global_models(), "arima-boost", "prophet-boost", "prophet-xregs", + "nnetar-xregs" + ) + + return(list) +} + #' Gets a simple recipe #' #' @param train_data Training Data diff --git a/R/prep_data.R b/R/prep_data.R index 2f331e41..ab865587 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -23,6 +23,8 @@ #' existing series, and does not add new values onto the beginning or end, but does provide a value of 0 for said #' values. #' @param clean_outliers If TRUE, outliers are cleaned and inputted with values more in line with historical data. +#' @param box_cox Apply box-cox transformation to normalize variance in data +#' @param stationary Apply differencing to make data stationary #' @param forecast_approach How the forecast is created. The default of 'bottoms_up' trains models for each individual #' time series. Value of 'grouped_hierarchy' creates a grouped time series to forecast at while 'standard_hierarchy' creates #' a more traditional hierarchical time series to forecast, both based on the hts package. 
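To make the new options concrete, here is a minimal usage sketch of how `box_cox` and `stationary` surface in `prep_data()`. Only those two arguments come from this patch; the sample data shaping and the remaining arguments (`input_data`, `combo_variables`, `target_variable`, `date_type`, `forecast_horizon`) follow the package's existing get-started examples and are assumptions, not part of this change.

```r
library(dplyr)
library(finnts)

# Monthly sample series from timetk, shaped the way finnts expects
# (assumed column layout: combo column(s) + Date + target).
hist_data <- timetk::m4_monthly %>%
  dplyr::filter(id == "M2") %>%
  dplyr::rename(Date = date) %>%
  dplyr::mutate(id = as.character(id))

run_info <- set_run_info(experiment_name = "finn_fcst")

prep_data(
  run_info,
  input_data = hist_data,
  combo_variables = c("id"),
  target_variable = "value",
  date_type = "month",
  forecast_horizon = 6,
  box_cox = TRUE,    # new: stabilize variance before other transformations
  stationary = TRUE  # new: difference each series until stationary
)
```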
@@ -80,6 +82,8 @@ prep_data <- function(run_info, fiscal_year_start = 1, clean_missing_values = TRUE, clean_outliers = FALSE, + box_cox = TRUE, + stationary = TRUE, forecast_approach = "bottoms_up", parallel_processing = NULL, num_cores = NULL, @@ -104,6 +108,8 @@ prep_data <- function(run_info, check_input_type("fiscal_year_start", fiscal_year_start, "numeric") check_input_type("clean_missing_values", clean_missing_values, "logical") check_input_type("clean_outliers", clean_outliers, "logical") + check_input_type("box_cox", box_cox, "logical") + check_input_type("stationary", stationary, "logical") check_input_type("forecast_approach", forecast_approach, "character", c("bottoms_up", "grouped_hierarchy", "standard_hierarchy")) check_input_type("parallel_processing", parallel_processing, c("character", "NULL"), c("NULL", "local_machine", "spark")) check_input_type("num_cores", num_cores, c("numeric", "NULL")) @@ -287,13 +293,18 @@ prep_data <- function(run_info, .noexport = NULL ) %op% { + # get specific time series combo <- x %>% dplyr::pull(Combo) + + return_tbl <- tibble::tibble(Combo = combo, + Combo_Hash = hash_data(combo)) initial_prep_combo_tbl <- filtered_initial_prep_tbl %>% dplyr::filter(Combo == combo) %>% dplyr::collect() + # external regressor handling xregs_future_tbl <- get_xregs_future_values_tbl( initial_prep_combo_tbl, external_regressors, @@ -309,6 +320,7 @@ prep_data <- function(run_info, xregs_future_list <- NULL } + # initial data prep initial_tbl <- initial_prep_combo_tbl %>% dplyr::filter(Combo == combo) %>% dplyr::select( @@ -350,7 +362,31 @@ prep_data <- function(run_info, NA, Target )) + + # box-cox transformation + if(box_cox) { + + box_cox_tbl <- initial_tbl %>% + apply_box_cox() + initial_tbl <- box_cox_tbl$data + + return_tbl <- return_tbl %>% + dplyr::left_join(box_cox_tbl$diff_info, by = "Combo") + } + + # make stationary + if(stationary) { + stationary_tbl <- initial_tbl %>% + make_stationary() + + initial_tbl <- stationary_tbl$data + + return_tbl <- return_tbl %>% + dplyr::left_join(stationary_tbl$diff_info, by = "Combo") + } + + # date features date_features <- initial_tbl %>% dplyr::select(Date) %>% dplyr::mutate( @@ -418,25 +454,30 @@ prep_data <- function(run_info, suffix = "-R2" ) } - return() + return(return_tbl) } %>% base::suppressPackageStartupMessages() - # clean up any parallel run process par_end(cl) } else if (parallel_processing == "spark") { - # print(filtered_initial_prep_tbl) # prevents spark tbl errors + final_data <- filtered_initial_prep_tbl %>% adjust_df(return_type = "sdf") %>% sparklyr::spark_apply(function(df, context) { + # update objects fn_env <- .GlobalEnv for (name in names(context)) { assign(name, context[[name]], envir = fn_env) } + # get specific time series combo <- unique(df$Combo) + return_tbl <- tibble::tibble(Combo = combo, + Combo_Hash = hash_data(combo)) + + # handle external regressors xregs_future_tbl <- get_xregs_future_values_tbl( df, external_regressors, @@ -452,6 +493,7 @@ prep_data <- function(run_info, xregs_future_list <- NULL } + # initial data prep initial_tbl <- df %>% dplyr::filter(Combo == combo) %>% dplyr::select( @@ -494,6 +536,30 @@ prep_data <- function(run_info, Target )) + # box-cox transformation + if(box_cox) { + + box_cox_tbl <- initial_tbl %>% + apply_box_cox() + + initial_tbl <- box_cox_tbl$data + + return_tbl <- return_tbl %>% + dplyr::left_join(box_cox_tbl$diff_info, by = "Combo") + } + + # make stationary + if(stationary) { + stationary_tbl <- initial_tbl %>% + make_stationary() + + 
initial_tbl <- stationary_tbl$data + + return_tbl <- return_tbl %>% + dplyr::left_join(stationary_tbl$diff_info, by = "Combo") + } + + # create date features date_features <- initial_tbl %>% dplyr::select(Date) %>% dplyr::mutate( @@ -559,7 +625,7 @@ prep_data <- function(run_info, ) } - return(data.frame(Combo = combo)) + return(data.frame(return_tbl)) }, group_by = "Combo", context = list( @@ -589,7 +655,11 @@ prep_data <- function(run_info, rolling_window_periods = rolling_window_periods, write_data = write_data, write_data_folder = write_data_folder, - write_data_type = write_data_type + write_data_type = write_data_type, + box_cox = box_cox, + stationary = stationary, + make_stationary = make_stationary, + apply_box_cox = apply_box_cox ) ) } @@ -646,6 +716,8 @@ prep_data <- function(run_info, fiscal_year_start = fiscal_year_start, clean_missing_values = clean_missing_values, clean_outliers = clean_outliers, + stationary = stationary, + box_cox = box_cox, forecast_approach = forecast_approach, parallel_processing = ifelse(is.null(parallel_processing), NA, parallel_processing), num_cores = ifelse(is.null(num_cores), NA, num_cores), @@ -664,6 +736,18 @@ prep_data <- function(run_info, folder = "logs", suffix = NULL ) + + # write any transformation data + if(box_cox || stationary) { + write_data( + x = final_data, + combo = NULL, + run_info = run_info, + output_type = "data", + folder = "prep_data", + suffix = "-orig_combo_info" + ) + } } #' Function to perform log transformation @@ -939,6 +1023,116 @@ get_date_regex <- function(date_type) { return(date_regex) } +#' Apply box cox transformation +#' +#' @param data input data +#' +#' @return Returns df of box cox transformed data +#' @noRd +apply_box_cox <- function(df) { + + final_tbl <- df %>% dplyr::select(Date) + + diff_info <- tibble::tibble(Combo = unique(df$Combo), + Box_Cox_Lambda = NULL) + + for (column_name in names(df)) { + + # Only check numeric columns with more than 2 unique values + if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) { + + temp_tbl <- df %>% + dplyr::select(Date, column_name) %>% + dplyr::rename(Column = column_name) + + # get lambda value + lambda_value <- timetk::auto_lambda(temp_tbl$Column) + + if(column_name == "Target") { + diff_info <- diff_info %>% + dplyr::mutate(Box_Cox_Lambda = lambda_value) + } + + # box cox transformation + temp_tbl <- temp_tbl %>% + dplyr::mutate(Column = timetk::box_cox_vec(Column, + lambda = lambda_value, + silent = TRUE)) + + # clean up names and add to final df + colnames(temp_tbl)[colnames(temp_tbl) == "Column"] <- column_name + + final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(column_name)) + } + else { + if(column_name != "Date") { + final_tbl <- cbind(final_tbl, df %>% dplyr::select(column_name)) + } + } + } + + return(list(data = tibble::tibble(final_tbl), diff_info = diff_info)) +} + +#' Make data stationary +#' +#' @param data input data +#' +#' @return Returns df of differenced data +#' @noRd +make_stationary <- function(df) { + + final_tbl <- df %>% dplyr::select(Date) + + diff_info <- tibble::tibble(Combo = unique(df$Combo), + Diff_Value1 = NA, + Diff_Value2 = NA) + + for (column_name in names(df)) { + + # Only check numeric columns with more than 2 unique values + if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) { + + temp_tbl <- df %>% + dplyr::select(Date, column_name) %>% + dplyr::rename(Column = column_name) + + # check for standard difference + ndiffs <- temp_tbl %>% + dplyr::pull(Column) %>% + 
feasts::unitroot_ndiffs() %>% + as.numeric() + + if(ndiffs > 0) { + if(column_name == "Target") { + diff_info <- diff_info %>% + dplyr::mutate(Diff_Value1 = temp_tbl %>% dplyr::slice(1) %>% dplyr::pull(Column)) + + if(ndiffs > 1) { + diff_info <- diff_info %>% + dplyr::mutate(Diff_Value2 = temp_tbl %>% dplyr::slice(2) %>% dplyr::pull(Column)) + } + } + temp_tbl <- temp_tbl %>% + dplyr::mutate(Column = timetk::diff_vec(Column, + difference = ndiffs, + silent = TRUE)) + } + + colnames(temp_tbl)[colnames(temp_tbl) == "Column"] <- column_name + + final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(column_name)) + } + else { + if(column_name != "Date") { + final_tbl <- cbind(final_tbl, df %>% dplyr::select(column_name)) + } + } + } + + return(list(data = tibble::tibble(final_tbl), diff_info = diff_info)) +} + #' Function to perform feature engineering according to R1 recipe #' #' @param data data frame diff --git a/R/prep_models.R b/R/prep_models.R index c7c161fe..78e1e349 100644 --- a/R/prep_models.R +++ b/R/prep_models.R @@ -217,16 +217,10 @@ train_test_split <- function(run_info, unique() # models with hyperparameters to tune - hyperparam_model_list <- c( - "arima-boost", "cubist", "glmnet", "mars", - "nnetar", "nnetar-xregs", "prophet", "prophet-boost", - "prophet-xregs", "svm-poly", "svm-rbf", "xgboost" - ) + hyperparam_model_list <- list_hyperparmater_models() # ensemble models - ensemble_model_list <- c( - "cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost" - ) + ensemble_model_list <- list_ensemble_models() if (sum(model_workflow_list %in% ensemble_model_list) == 0 & run_ensemble_models) { run_ensemble_models <- FALSE @@ -515,11 +509,7 @@ model_workflows <- function(run_info, model_workflow_tbl <- tibble::tibble() # models to run - ml_models <- c( - "arima", "arima-boost", "cubist", "croston", "ets", "glmnet", "mars", "meanf", - "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "snaive", - "stlm-arima", "stlm-ets", "svm-poly", "svm-rbf", "tbats", "theta", "xgboost" - ) + ml_models <- list_models() if (is.null(models_to_run) & is.null(models_not_to_run)) { @@ -539,7 +529,7 @@ model_workflows <- function(run_info, } } - r2_models <- c("cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost") + r2_models <- list_r2_models() iter_tbl <- tibble::tibble() diff --git a/R/train_models.R b/R/train_models.R index 9351bc2f..db3c9189 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -90,6 +90,8 @@ train_models <- function(run_info, combo_variables <- strsplit(log_df$combo_variables, split = "---")[[1]] date_type <- log_df$date_type forecast_approach <- log_df$forecast_approach + stationary <- log_df$stationary + box_cox <- log_df$box_cox if (is.null(run_global_models) & date_type %in% c("day", "week")) { run_global_models <- FALSE @@ -131,17 +133,25 @@ train_models <- function(run_info, dplyr::pull(Model_Name) %>% unique() - global_model_list <- c("cubist", "glmnet", "mars", "svm-poly", "svm-rbf", "xgboost") - fs_model_list <- c( - global_model_list, "arima-boost", "prophet-boost", "prophet-xregs", - "nnetar-xregs" - ) + global_model_list <- list_global_models() + fs_model_list <- list_multivariate_models() if (sum(model_workflow_list %in% global_model_list) == 0 & run_global_models) { run_global_models <- FALSE cli::cli_alert_info("Turning global models off since no multivariate models were chosen to run.") cli::cli_progress_update() } + + # get other time series info + if(box_cox || stationary) { + orig_combo_info_tbl <- read_file(run_info, + path = paste0( + 
"/prep_data/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), + "-orig_combo_info.", run_info$data_output + ), + return_type = "df" + ) + } # get list of tasks to run current_combo_list <- c() @@ -259,11 +269,19 @@ train_models <- function(run_info, .noexport = NULL ) %op% { + + # get time series combo_hash <- x model_recipe_tbl <- get_recipe_data(run_info, combo = x ) + + # get other time series info + if(box_cox || stationary) { + filtered_combo_info_tbl <- orig_combo_info_tbl %>% + dplyr::filter(Combo_Hash == combo_hash) + } if (inner_parallel) { # ensure variables get exported @@ -408,6 +426,13 @@ train_models <- function(run_info, ) %>% dplyr::select(Hyperparameter_Combo, Hyperparameters) %>% tidyr::unnest(Hyperparameters) + + if(stationary & !(model %in% list_multivariate_models())) { + # undifference the data for a univariate model + prep_data <- prep_data %>% + undifference_recipe(filtered_combo_info_tbl, + model_train_test_tbl %>% dplyr::slice(1) %>% dplyr::pull(Train_End)) + } # tune hyperparameters set.seed(seed) @@ -455,9 +480,11 @@ train_models <- function(run_info, pkgs = inner_packages, parallel_over = "everything" ) - ) + ) %>% + tune::collect_predictions() - final_fcst <- tune::collect_predictions(refit_tbl) %>% + # finalize forecast + final_fcst <- refit_tbl %>% dplyr::rename( Forecast = .pred, Train_Test_ID = id @@ -474,9 +501,28 @@ train_models <- function(run_info, by = ".row" ) %>% dplyr::mutate(Hyperparameter_ID = hyperparameter_id) %>% - dplyr::select(-.row, -.config) %>% + dplyr::select(-.row, -.config) + + # undo differencing transformation + if(stationary & model %in% list_multivariate_models()) { + final_fcst <- final_fcst %>% + undifference_forecast(prep_data, + filtered_combo_info_tbl) + } + + # undo box-cox transformation + if(box_cox) { + lambda <- filtered_combo_info_tbl$Box_Cox_Lambda + final_fcst <- final_fcst %>% + dplyr::mutate(Forecast = timetk::box_cox_inv_vec(Forecast, lambda = lambda), + Target = timetk::box_cox_inv_vec(Target, lambda = lambda)) + } + + # negative forecast adjustment + final_fcst <- final_fcst %>% negative_fcst_adj(negative_forecast) + # return the forecast combo_id <- ifelse(x == "All-Data", "All-Data", unique(final_fcst$Combo)) final_return_tbl <- tibble::tibble( @@ -700,3 +746,156 @@ create_splits <- function(data, train_test_splits) { return(resamples) } + +#' Function to undifference forecast data +#' +#' @param forecast_data forecast data +#' @param recipe_data recipe data +#' @param diff_tbl diff table +#' +#' @return tbl with undifferenced forecast +#' @noRd +undifference_forecast <- function(forecast_data, + recipe_data, + diff_tbl) { + + # check if data needs to be undifferenced + diff1 <- diff_tbl$Diff_Value1 + diff2 <- diff_tbl$Diff_Value2 + + if(is.na(diff1) & is.na(diff2)) { + return(forecast_data) + } + + # return df + return_tbl <- tibble::tibble() + + # train test id number + train_test_id <- unique(forecast_data$Train_Test_ID) + + # non seasonal differencing + if(!is.na(diff1)) { + + # loop through each back test split + for(id in train_test_id) { + + # get specific train test split + fcst_temp_tbl <- forecast_data %>% + dplyr::filter(Train_Test_ID == id) + + fcst_start_date <- min(unique(fcst_temp_tbl$Date)) + + # prep recipe data + if("Horizon" %in% colnames(recipe_data)) { + filtered_recipe_data <- recipe_data %>% + dplyr::filter(Date < fcst_start_date, + Horizon == min(unique(recipe_data$Horizon))) + } else { + filtered_recipe_data <- recipe_data %>% + dplyr::filter(Date < 
fcst_start_date) + } + + # adjust recipe data + filtered_recipe_data$Target[1] <- NA + + if(!is.na(diff2)) { + filtered_recipe_data$Target[2] <- NA + } + + # get number of differences and initial values + if(!is.na(diff1) & !is.na(diff2)) { + num_diffs <- 2 + initial_value <- c(diff1, diff2) + } else { + num_diffs <- 1 + initial_value <- diff1 + } + + # combine historical data with forecast, then undifference and return forecast + combined_data <- filtered_recipe_data %>% + dplyr::mutate(Forecast = Target) %>% + dplyr::select(Date, Target, Forecast) %>% + rbind( + fcst_temp_tbl %>% + dplyr::select(Date, Target, Forecast) + ) %>% + dplyr::arrange(Date) + + if(id == 1) { + target_tbl <- combined_data %>% + dplyr::select(-Forecast) %>% + dplyr::filter(Date < fcst_start_date) %>% + dplyr::mutate(Target = timetk::diff_inv_vec(Target, difference = num_diffs, initial_values = initial_value)) + } else { + target_tbl <- combined_data %>% + dplyr::select(-Forecast) %>% + dplyr::mutate(Target = timetk::diff_inv_vec(Target, difference = num_diffs, initial_values = initial_value)) + } + + forecast_tbl <- combined_data %>% + dplyr::select(-Target) %>% + dplyr::mutate(Forecast = timetk::diff_inv_vec(Forecast, difference = num_diffs, initial_values = initial_value)) + + final_forecast <- fcst_temp_tbl %>% + dplyr::select(-Target, -Forecast) %>% + dplyr::left_join(forecast_tbl, by = "Date") %>% + dplyr::left_join(target_tbl, by = "Date") + + return_tbl <- return_tbl %>% + rbind(final_forecast) + } + } + + return(return_tbl) +} + +#' Function to undifference recipe data +#' +#' @param recipe_data recipe data +#' @param diff_tbl diff table +#' @param hist_end_date historical data end date +#' +#' @return tbl with undifferenced recipe +#' @noRd +undifference_recipe <- function(recipe_data, + diff_tbl, + hist_end_date) { + + # check if data needs to be undifferenced + diff1 <- diff_tbl$Diff_Value1 + diff2 <- diff_tbl$Diff_Value2 + + if(is.na(diff1) & is.na(diff2)) { + return(recipe_data) + } + + # adjust recipe data + recipe_data$Target[1] <- NA + + if(!is.na(diff2)) { + recipe_data$Target[2] <- NA + } + + # get number of differences and initial values + if(!is.na(diff1) & !is.na(diff2)) { + num_diffs <- 2 + initial_value <- c(diff1, diff2) + } else { + num_diffs <- 1 + initial_value <- diff1 + } + + # undifference the data + undiff_recipe_data <- recipe_data %>% + dplyr::filter(Date <= hist_end_date) %>% + dplyr::mutate(Target = timetk::diff_inv_vec(Target, difference = num_diffs, initial_values = initial_value)) + + future_data <- recipe_data %>% + dplyr::filter(Date > hist_end_date) + + final_recipe_data <- undiff_recipe_data %>% + rbind(future_data) + + return(final_recipe_data) +} + diff --git a/R/utility.R b/R/utility.R index 31c28ace..4c0e2fc4 100644 --- a/R/utility.R +++ b/R/utility.R @@ -13,7 +13,7 @@ utils::globalVariables(c( "x", "num_cores", "run_info", "negative_forecast", "Forecast_Adj", "Final_Col", "lag_val", "libs", ".config", "Forecast_Tbl", "Model_Workflow", "id", "model_run", "Auto_Accept", "Feature", "Imp", "Importance", "LOFO_Var", "Var_RMSE", "Vote", "Votes", "desc", - "term" + "term", "Column" )) #' @importFrom magrittr %>% diff --git a/man/list_models.Rd b/man/list_models.Rd new file mode 100644 index 00000000..368db32f --- /dev/null +++ b/man/list_models.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/models.R +\name{list_models} +\alias{list_models} +\title{List all available models} +\usage{ +list_models() +} +\value{ +list of 
models +} +\description{ +List all available models +} diff --git a/man/prep_data.Rd b/man/prep_data.Rd index eb9fdcc4..c0e0dd71 100644 --- a/man/prep_data.Rd +++ b/man/prep_data.Rd @@ -18,6 +18,8 @@ prep_data( fiscal_year_start = 1, clean_missing_values = TRUE, clean_outliers = FALSE, + box_cox = TRUE, + stationary = TRUE, forecast_approach = "bottoms_up", parallel_processing = NULL, num_cores = NULL, @@ -63,6 +65,10 @@ values.} \item{clean_outliers}{If TRUE, outliers are cleaned and inputted with values more in line with historical data.} +\item{box_cox}{Apply box-cox transformation to normalize variance in data} + +\item{stationary}{Apply differencing to make data stationary} + \item{forecast_approach}{How the forecast is created. The default of 'bottoms_up' trains models for each individual time series. Value of 'grouped_hierarchy' creates a grouped time series to forecast at while 'standard_hierarchy' creates a more traditional hierarchical time series to forecast, both based on the hts package.} From f777bd7cc39d400c0b4f9333d7b5e797ce075ccb Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 29 Aug 2023 09:45:44 -0700 Subject: [PATCH 02/11] update feature engineering vignette --- vignettes/feature-engineering.Rmd | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/vignettes/feature-engineering.Rmd b/vignettes/feature-engineering.Rmd index 674f1587..6c3133ba 100644 --- a/vignettes/feature-engineering.Rmd +++ b/vignettes/feature-engineering.Rmd @@ -16,6 +16,24 @@ knitr::opts_chunk$set( Automated feature engineering is a cornerstone of the package. Below are some of the techniques we use in multivariate machine learning models, and the outside packages that make it possible. +## Missing Data and Outliers + +Missing data is filled in using the [pad_by_time](https://business-science.github.io/timetk/reference/pad_by_time.html) function from the timetk package. First, each time series is grouped and padded using their existing start and end dates. Missing values are padded using NA. Then the same process is ran again, this time padding data from the `hist_start_date` from `forecast_time_series()`, with missing values being filled in with zero. This ensures that missing data before a time series starts are all zeroes, but missing periods within the existing time series data are identified to be inputted with new values in the next step. + +After missing data is padded, the [ts_impute_vec](https://business-science.github.io/timetk/reference/ts_impute_vec.html) function from the timetk package is called to impute any NA values. This only happens if the `clean_missing_values` input from `forecast_time_series()` is set to TRUE, otherwise NA values are replaced with zero. + +Outliers are handled using the [ts_clean_vec](https://business-science.github.io/timetk/reference/ts_clean_vec.html) function from the timetk package. Outliers are replaced after the missing data process, and only runs if the `clean_outliers` input from `forecast_time_series()` is set to TRUE. + +**Important Note:** Missing values and outliers are replaced for the target variable and any numeric external regressors. + +## Box-Cox + +Stabilizes the variance in each time series using the [box_cox_vec](https://business-science.github.io/timetk/reference/box_cox_vec.html) function from the timetk package. Applied to both the target variable and any external regressor before other transformations like differencing. You can control this within `prep_models()`. 
+ +## Differencing + +Uses the [feasts](https://feasts.tidyverts.org/reference/unitroot_ndiffs.html) package to check if each time series is stationary and applies the differencing required (up to two standard differences with lag one) in order to make the time series stationary. Uses the [diff_vec](https://business-science.github.io/timetk/reference/diff_vec.html) function from the timetk package to do the differencing. This is applied to the target variable and any external regressor before other features are created. Data is undifferenced before training for univariate models like arima, but differenced data is used for all multivariate models. You can control the differencing done within `prep_models()`. + ## Date Features The [tk_augment_timeseries_signature](https://business-science.github.io/timetk/reference/tk_augment_timeseries.html) function from the timetk package easily extracts out various date features from the time stamp. The function doesn't differentiate between date type, so features need to be removed depending on the date type. For example, features related to week and day for a monthly forecast are automatically removed. @@ -33,16 +51,6 @@ m4_monthly %>% dplyr::ungroup() ``` -## Missing Data and Outliers - -Missing data is filled in using the [pad_by_time](https://business-science.github.io/timetk/reference/pad_by_time.html) function from the timetk package. First, each time series is grouped and padded using their existing start and end dates. Missing values are padded using NA. Then the same process is ran again, this time padding data from the `hist_start_date` from `forecast_time_series()`, with missing values being filled in with zero. This ensures that missing data before a time series starts are all zeroes, but missing periods within the existing time series data are identified to be inputted with new values in the next step. - -After missing data is padded, the [ts_impute_vec](https://business-science.github.io/timetk/reference/ts_impute_vec.html) function from the timetk package is called to impute any NA values. This only happens if the `clean_missing_values` input from `forecast_time_series()` is set to TRUE, otherwise NA values are replaced with zero. - -Outliers are handled using the [ts_clean_vec](https://business-science.github.io/timetk/reference/ts_clean_vec.html) function from the timetk package. Outliers are replaced after the missing data process, and only runs if the `clean_outliers` input from `forecast_time_series()` is set to TRUE. - -**Important Note:** Missing values and outliers are replaced for the target variable and any numeric external regressors. - ## Lags, Rolling Windows, and Polynomial Transformations Lags of the target variable and external regressors are created using the [tk_augment_lags](https://business-science.github.io/timetk/reference/tk_augment_lags.html) function from timetk. 
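The Box-Cox and differencing steps described above reduce to a handful of timetk and feasts calls, the same ones used by `apply_box_cox()`, `make_stationary()`, and the `undifference_*()` helpers in this patch series. A minimal round-trip sketch on a standalone numeric vector (`AirPassengers` is only a stand-in series):

```r
x <- as.numeric(AirPassengers) # any positive numeric series works

# Box-Cox: pick lambda automatically, transform, invert later
lambda <- timetk::auto_lambda(x)
x_bc <- timetk::box_cox_vec(x, lambda = lambda, silent = TRUE)

# Differencing: test how many standard differences are needed
# (the package stores at most two initial values), then difference
ndiffs <- as.numeric(feasts::unitroot_ndiffs(x_bc))

if (ndiffs > 0) {
  x_diff <- timetk::diff_vec(x_bc, difference = ndiffs, silent = TRUE)

  # invert using the leading original values, mirroring Diff_Value1/Diff_Value2
  x_undiff <- timetk::diff_inv_vec(x_diff,
    difference = ndiffs,
    initial_values = x_bc[seq_len(ndiffs)]
  )
} else {
  x_undiff <- x_bc
}

# invert the Box-Cox transformation to return to the original scale
x_orig <- timetk::box_cox_inv_vec(x_undiff, lambda = lambda)
all.equal(x_orig, x) # should be TRUE up to floating point error
```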
From 211affec91bd257d5bda21bbf6a354f1bf0ba33e Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 29 Aug 2023 15:10:35 -0700 Subject: [PATCH 03/11] code formatting --- R/feature_selection.R | 6 +- R/models.R | 12 ++-- R/prep_data.R | 149 +++++++++++++++++++++--------------------- R/train_models.R | 127 ++++++++++++++++++----------------- 4 files changed, 151 insertions(+), 143 deletions(-) diff --git a/R/feature_selection.R b/R/feature_selection.R index 92194416..210449c4 100644 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -146,14 +146,14 @@ select_features <- function(input_data, }, warning = function(w) { # do nothing - }, + }, error = function(e) { tibble::tibble() } ) - + if (nrow(vip_cubist_results) == 0) { - votes_needed <- votes_needed-1 + votes_needed <- votes_needed - 1 } # lasso regression feature importance diff --git a/R/models.R b/R/models.R index 15ca66ac..b5551837 100644 --- a/R/models.R +++ b/R/models.R @@ -8,7 +8,7 @@ list_models <- function() { "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "snaive", "stlm-arima", "stlm-ets", "svm-poly", "svm-rbf", "tbats", "theta", "xgboost" ) - + return(list) } @@ -23,7 +23,7 @@ list_hyperparmater_models <- function() { "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "svm-poly", "svm-rbf", "xgboost" ) - + return(list) } @@ -36,7 +36,7 @@ list_ensemble_models <- function() { list <- c( "cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost" ) - + return(list) } @@ -47,7 +47,7 @@ list_ensemble_models <- function() { #' @noRd list_r2_models <- function() { list <- c("cubist", "glmnet", "svm-poly", "svm-rbf", "xgboost") - + return(list) } @@ -58,7 +58,7 @@ list_r2_models <- function() { #' @noRd list_global_models <- function() { list <- c("cubist", "glmnet", "mars", "svm-poly", "svm-rbf", "xgboost") - + return(list) } @@ -72,7 +72,7 @@ list_multivariate_models <- function() { list_global_models(), "arima-boost", "prophet-boost", "prophet-xregs", "nnetar-xregs" ) - + return(list) } diff --git a/R/prep_data.R b/R/prep_data.R index ab865587..9f464f90 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -82,7 +82,7 @@ prep_data <- function(run_info, fiscal_year_start = 1, clean_missing_values = TRUE, clean_outliers = FALSE, - box_cox = TRUE, + box_cox = TRUE, stationary = TRUE, forecast_approach = "bottoms_up", parallel_processing = NULL, @@ -296,9 +296,11 @@ prep_data <- function(run_info, # get specific time series combo <- x %>% dplyr::pull(Combo) - - return_tbl <- tibble::tibble(Combo = combo, - Combo_Hash = hash_data(combo)) + + return_tbl <- tibble::tibble( + Combo = combo, + Combo_Hash = hash_data(combo) + ) initial_prep_combo_tbl <- filtered_initial_prep_tbl %>% dplyr::filter(Combo == combo) %>% @@ -362,10 +364,9 @@ prep_data <- function(run_info, NA, Target )) - + # box-cox transformation - if(box_cox) { - + if (box_cox) { box_cox_tbl <- initial_tbl %>% apply_box_cox() @@ -374,14 +375,14 @@ prep_data <- function(run_info, return_tbl <- return_tbl %>% dplyr::left_join(box_cox_tbl$diff_info, by = "Combo") } - + # make stationary - if(stationary) { + if (stationary) { stationary_tbl <- initial_tbl %>% make_stationary() - + initial_tbl <- stationary_tbl$data - + return_tbl <- return_tbl %>% dplyr::left_join(stationary_tbl$diff_info, by = "Combo") } @@ -460,7 +461,6 @@ prep_data <- function(run_info, # clean up any parallel run process par_end(cl) } else if (parallel_processing == "spark") { - final_data <- filtered_initial_prep_tbl %>% adjust_df(return_type = "sdf") %>% 
sparklyr::spark_apply(function(df, context) { @@ -474,9 +474,11 @@ prep_data <- function(run_info, # get specific time series combo <- unique(df$Combo) - return_tbl <- tibble::tibble(Combo = combo, - Combo_Hash = hash_data(combo)) - + return_tbl <- tibble::tibble( + Combo = combo, + Combo_Hash = hash_data(combo) + ) + # handle external regressors xregs_future_tbl <- get_xregs_future_values_tbl( df, @@ -537,28 +539,27 @@ prep_data <- function(run_info, )) # box-cox transformation - if(box_cox) { - + if (box_cox) { box_cox_tbl <- initial_tbl %>% apply_box_cox() - + initial_tbl <- box_cox_tbl$data - + return_tbl <- return_tbl %>% dplyr::left_join(box_cox_tbl$diff_info, by = "Combo") } - + # make stationary - if(stationary) { + if (stationary) { stationary_tbl <- initial_tbl %>% make_stationary() - + initial_tbl <- stationary_tbl$data - + return_tbl <- return_tbl %>% dplyr::left_join(stationary_tbl$diff_info, by = "Combo") } - + # create date features date_features <- initial_tbl %>% dplyr::select(Date) %>% @@ -655,10 +656,10 @@ prep_data <- function(run_info, rolling_window_periods = rolling_window_periods, write_data = write_data, write_data_folder = write_data_folder, - write_data_type = write_data_type, + write_data_type = write_data_type, box_cox = box_cox, stationary = stationary, - make_stationary = make_stationary, + make_stationary = make_stationary, apply_box_cox = apply_box_cox ) ) @@ -716,7 +717,7 @@ prep_data <- function(run_info, fiscal_year_start = fiscal_year_start, clean_missing_values = clean_missing_values, clean_outliers = clean_outliers, - stationary = stationary, + stationary = stationary, box_cox = box_cox, forecast_approach = forecast_approach, parallel_processing = ifelse(is.null(parallel_processing), NA, parallel_processing), @@ -736,9 +737,9 @@ prep_data <- function(run_info, folder = "logs", suffix = NULL ) - + # write any transformation data - if(box_cox || stationary) { + if (box_cox || stationary) { write_data( x = final_data, combo = NULL, @@ -1030,47 +1031,47 @@ get_date_regex <- function(date_type) { #' @return Returns df of box cox transformed data #' @noRd apply_box_cox <- function(df) { - final_tbl <- df %>% dplyr::select(Date) - - diff_info <- tibble::tibble(Combo = unique(df$Combo), - Box_Cox_Lambda = NULL) - + + diff_info <- tibble::tibble( + Combo = unique(df$Combo), + Box_Cox_Lambda = NULL + ) + for (column_name in names(df)) { - + # Only check numeric columns with more than 2 unique values if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) { - - temp_tbl <- df %>% + temp_tbl <- df %>% dplyr::select(Date, column_name) %>% dplyr::rename(Column = column_name) - + # get lambda value lambda_value <- timetk::auto_lambda(temp_tbl$Column) - if(column_name == "Target") { + if (column_name == "Target") { diff_info <- diff_info %>% - dplyr::mutate(Box_Cox_Lambda = lambda_value) + dplyr::mutate(Box_Cox_Lambda = lambda_value) } - + # box cox transformation temp_tbl <- temp_tbl %>% - dplyr::mutate(Column = timetk::box_cox_vec(Column, - lambda = lambda_value, - silent = TRUE)) - + dplyr::mutate(Column = timetk::box_cox_vec(Column, + lambda = lambda_value, + silent = TRUE + )) + # clean up names and add to final df colnames(temp_tbl)[colnames(temp_tbl) == "Column"] <- column_name - + final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(column_name)) - } - else { - if(column_name != "Date") { + } else { + if (column_name != "Date") { final_tbl <- cbind(final_tbl, df %>% dplyr::select(column_name)) } } } - + return(list(data = 
tibble::tibble(final_tbl), diff_info = diff_info)) } @@ -1081,55 +1082,55 @@ apply_box_cox <- function(df) { #' @return Returns df of differenced data #' @noRd make_stationary <- function(df) { - final_tbl <- df %>% dplyr::select(Date) - - diff_info <- tibble::tibble(Combo = unique(df$Combo), - Diff_Value1 = NA, - Diff_Value2 = NA) - + + diff_info <- tibble::tibble( + Combo = unique(df$Combo), + Diff_Value1 = NA, + Diff_Value2 = NA + ) + for (column_name in names(df)) { - + # Only check numeric columns with more than 2 unique values if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) { - - temp_tbl <- df %>% + temp_tbl <- df %>% dplyr::select(Date, column_name) %>% dplyr::rename(Column = column_name) - + # check for standard difference ndiffs <- temp_tbl %>% dplyr::pull(Column) %>% feasts::unitroot_ndiffs() %>% as.numeric() - - if(ndiffs > 0) { - if(column_name == "Target") { + + if (ndiffs > 0) { + if (column_name == "Target") { diff_info <- diff_info %>% dplyr::mutate(Diff_Value1 = temp_tbl %>% dplyr::slice(1) %>% dplyr::pull(Column)) - - if(ndiffs > 1) { + + if (ndiffs > 1) { diff_info <- diff_info %>% dplyr::mutate(Diff_Value2 = temp_tbl %>% dplyr::slice(2) %>% dplyr::pull(Column)) } } temp_tbl <- temp_tbl %>% - dplyr::mutate(Column = timetk::diff_vec(Column, - difference = ndiffs, - silent = TRUE)) + dplyr::mutate(Column = timetk::diff_vec(Column, + difference = ndiffs, + silent = TRUE + )) } - + colnames(temp_tbl)[colnames(temp_tbl) == "Column"] <- column_name - + final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(column_name)) - } - else { - if(column_name != "Date") { + } else { + if (column_name != "Date") { final_tbl <- cbind(final_tbl, df %>% dplyr::select(column_name)) } } } - + return(list(data = tibble::tibble(final_tbl), diff_info = diff_info)) } diff --git a/R/train_models.R b/R/train_models.R index db3c9189..3702b418 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -141,15 +141,15 @@ train_models <- function(run_info, cli::cli_alert_info("Turning global models off since no multivariate models were chosen to run.") cli::cli_progress_update() } - + # get other time series info - if(box_cox || stationary) { + if (box_cox || stationary) { orig_combo_info_tbl <- read_file(run_info, - path = paste0( - "/prep_data/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), - "-orig_combo_info.", run_info$data_output - ), - return_type = "df" + path = paste0( + "/prep_data/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), + "-orig_combo_info.", run_info$data_output + ), + return_type = "df" ) } @@ -269,16 +269,16 @@ train_models <- function(run_info, .noexport = NULL ) %op% { - + # get time series combo_hash <- x model_recipe_tbl <- get_recipe_data(run_info, combo = x ) - + # get other time series info - if(box_cox || stationary) { + if (box_cox || stationary) { filtered_combo_info_tbl <- orig_combo_info_tbl %>% dplyr::filter(Combo_Hash == combo_hash) } @@ -426,12 +426,14 @@ train_models <- function(run_info, ) %>% dplyr::select(Hyperparameter_Combo, Hyperparameters) %>% tidyr::unnest(Hyperparameters) - - if(stationary & !(model %in% list_multivariate_models())) { + + if (stationary & !(model %in% list_multivariate_models())) { # undifference the data for a univariate model - prep_data <- prep_data %>% - undifference_recipe(filtered_combo_info_tbl, - model_train_test_tbl %>% dplyr::slice(1) %>% dplyr::pull(Train_End)) + prep_data <- prep_data %>% + undifference_recipe( + filtered_combo_info_tbl, + 
model_train_test_tbl %>% dplyr::slice(1) %>% dplyr::pull(Train_End) + ) } # tune hyperparameters @@ -504,18 +506,22 @@ train_models <- function(run_info, dplyr::select(-.row, -.config) # undo differencing transformation - if(stationary & model %in% list_multivariate_models()) { + if (stationary & model %in% list_multivariate_models()) { final_fcst <- final_fcst %>% - undifference_forecast(prep_data, - filtered_combo_info_tbl) + undifference_forecast( + prep_data, + filtered_combo_info_tbl + ) } # undo box-cox transformation - if(box_cox) { + if (box_cox) { lambda <- filtered_combo_info_tbl$Box_Cox_Lambda final_fcst <- final_fcst %>% - dplyr::mutate(Forecast = timetk::box_cox_inv_vec(Forecast, lambda = lambda), - Target = timetk::box_cox_inv_vec(Target, lambda = lambda)) + dplyr::mutate( + Forecast = timetk::box_cox_inv_vec(Forecast, lambda = lambda), + Target = timetk::box_cox_inv_vec(Target, lambda = lambda) + ) } # negative forecast adjustment @@ -756,54 +762,56 @@ create_splits <- function(data, train_test_splits) { #' @return tbl with undifferenced forecast #' @noRd undifference_forecast <- function(forecast_data, - recipe_data, - diff_tbl) { + recipe_data, + diff_tbl) { # check if data needs to be undifferenced diff1 <- diff_tbl$Diff_Value1 diff2 <- diff_tbl$Diff_Value2 - - if(is.na(diff1) & is.na(diff2)) { + + if (is.na(diff1) & is.na(diff2)) { return(forecast_data) } - + # return df return_tbl <- tibble::tibble() - + # train test id number train_test_id <- unique(forecast_data$Train_Test_ID) - + # non seasonal differencing - if(!is.na(diff1)) { - + if (!is.na(diff1)) { + # loop through each back test split - for(id in train_test_id) { + for (id in train_test_id) { # get specific train test split fcst_temp_tbl <- forecast_data %>% dplyr::filter(Train_Test_ID == id) fcst_start_date <- min(unique(fcst_temp_tbl$Date)) - + # prep recipe data - if("Horizon" %in% colnames(recipe_data)) { + if ("Horizon" %in% colnames(recipe_data)) { filtered_recipe_data <- recipe_data %>% - dplyr::filter(Date < fcst_start_date, - Horizon == min(unique(recipe_data$Horizon))) + dplyr::filter( + Date < fcst_start_date, + Horizon == min(unique(recipe_data$Horizon)) + ) } else { filtered_recipe_data <- recipe_data %>% dplyr::filter(Date < fcst_start_date) } - - # adjust recipe data + + # adjust recipe data filtered_recipe_data$Target[1] <- NA - - if(!is.na(diff2)) { + + if (!is.na(diff2)) { filtered_recipe_data$Target[2] <- NA } - + # get number of differences and initial values - if(!is.na(diff1) & !is.na(diff2)) { + if (!is.na(diff1) & !is.na(diff2)) { num_diffs <- 2 initial_value <- c(diff1, diff2) } else { @@ -820,8 +828,8 @@ undifference_forecast <- function(forecast_data, dplyr::select(Date, Target, Forecast) ) %>% dplyr::arrange(Date) - - if(id == 1) { + + if (id == 1) { target_tbl <- combined_data %>% dplyr::select(-Forecast) %>% dplyr::filter(Date < fcst_start_date) %>% @@ -845,7 +853,7 @@ undifference_forecast <- function(forecast_data, rbind(final_forecast) } } - + return(return_tbl) } @@ -858,44 +866,43 @@ undifference_forecast <- function(forecast_data, #' @return tbl with undifferenced recipe #' @noRd undifference_recipe <- function(recipe_data, - diff_tbl, + diff_tbl, hist_end_date) { - + # check if data needs to be undifferenced diff1 <- diff_tbl$Diff_Value1 diff2 <- diff_tbl$Diff_Value2 - - if(is.na(diff1) & is.na(diff2)) { + + if (is.na(diff1) & is.na(diff2)) { return(recipe_data) } - + # adjust recipe data recipe_data$Target[1] <- NA - - if(!is.na(diff2)) { + + if (!is.na(diff2)) { 
recipe_data$Target[2] <- NA } - + # get number of differences and initial values - if(!is.na(diff1) & !is.na(diff2)) { + if (!is.na(diff1) & !is.na(diff2)) { num_diffs <- 2 initial_value <- c(diff1, diff2) } else { num_diffs <- 1 initial_value <- diff1 } - + # undifference the data undiff_recipe_data <- recipe_data %>% - dplyr::filter(Date <= hist_end_date) %>% - dplyr::mutate(Target = timetk::diff_inv_vec(Target, difference = num_diffs, initial_values = initial_value)) - + dplyr::filter(Date <= hist_end_date) %>% + dplyr::mutate(Target = timetk::diff_inv_vec(Target, difference = num_diffs, initial_values = initial_value)) + future_data <- recipe_data %>% dplyr::filter(Date > hist_end_date) - + final_recipe_data <- undiff_recipe_data %>% rbind(future_data) - + return(final_recipe_data) } - From 404ab51ce0153b7850e784fc014de781270bdcfb Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 29 Aug 2023 15:12:36 -0700 Subject: [PATCH 04/11] update news --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index efcd12f1..f636e127 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# finnts 0.3.0.9002 (DEVELOPMENT VERSION) +# finnts 0.3.0.9003 (DEVELOPMENT VERSION) ## Improvements From 4b5ef7d18d3c1a81a2f7983a947b3bbbe1b0f8ea Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 29 Aug 2023 18:26:26 -0700 Subject: [PATCH 05/11] ensemble model bug fix --- R/train_models.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/train_models.R b/R/train_models.R index 3702b418..41c2d423 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -721,13 +721,14 @@ create_splits <- function(data, train_test_splits) { # Get the train and test end dates train_end <- train_test_splits$Train_End[i] test_end <- train_test_splits$Test_End[i] - - + train_test_id <- train_test_splits$Train_Test_ID[i] # Create the train and test indices train_indices <- which(data$Date <= train_end) - if ("Horizon" %in% colnames(data)) { + if ("Train_Test_ID" %in% colnames(data)) { + test_indices <- which(data$Train_Test_ID == train_test_id) + } else if ("Horizon" %in% colnames(data)) { # adjust for the horizon in R2 recipe data train_data <- data %>% dplyr::filter( From 792d917e2d090cbe0c0b7e0f533440f1edea3a22 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Wed, 30 Aug 2023 07:04:20 -0700 Subject: [PATCH 06/11] debugging --- R/train_models.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/train_models.R b/R/train_models.R index 41c2d423..e68f1a2c 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -292,6 +292,8 @@ train_models <- function(run_info, combo_variables <- combo_variables negative_fcst_adj <- negative_fcst_adj negative_forecast <- negative_forecast + stationary <- stationary + box_cox <- box_cox } if (feature_selection) { From 313c276cd7a1435d141a4f801e454570297070b1 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Wed, 30 Aug 2023 07:27:24 -0700 Subject: [PATCH 07/11] debugging --- R/train_models.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/train_models.R b/R/train_models.R index e68f1a2c..ec32d69e 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -373,7 +373,7 @@ train_models <- function(run_info, dplyr::select(Model_Name, Model_Recipe) %>% dplyr::group_split(dplyr::row_number(), .keep = FALSE), .combine = "rbind", - .errorhandling = "remove", + .errorhandling = "stop", .verbose = FALSE, .inorder = FALSE, .multicombine = TRUE, @@ -548,7 +548,7 @@ train_models <- function(run_info, par_end(inner_cl) # ensure at 
least one model ran successfully - if (nrow(model_tbl) < 1) { + if (is.null(model_tbl)) { stop("All models failed to train") } From 99ba1792e9407d3da81ef9c32c2262a4a7d59201 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Wed, 30 Aug 2023 07:49:11 -0700 Subject: [PATCH 08/11] debugging --- R/train_models.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/train_models.R b/R/train_models.R index ec32d69e..5ba22c46 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -294,6 +294,10 @@ train_models <- function(run_info, negative_forecast <- negative_forecast stationary <- stationary box_cox <- box_cox + undifference_forecast <- undifference_forecast + undifference_recipe <- undifference_recipe + list_global_models <- list_global_models + list_multivariate_models <- list_multivariate_models } if (feature_selection) { From cb96cf8689a5003045cec58c185dcfd23faf6e06 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Wed, 30 Aug 2023 08:10:54 -0700 Subject: [PATCH 09/11] debugging --- R/train_models.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/train_models.R b/R/train_models.R index 5ba22c46..ebea1db2 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -377,7 +377,7 @@ train_models <- function(run_info, dplyr::select(Model_Name, Model_Recipe) %>% dplyr::group_split(dplyr::row_number(), .keep = FALSE), .combine = "rbind", - .errorhandling = "stop", + .errorhandling = "remove", .verbose = FALSE, .inorder = FALSE, .multicombine = TRUE, From a73f6571e4ed05f7458d8480f76ea5768a2453f5 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Thu, 31 Aug 2023 11:38:07 -0700 Subject: [PATCH 10/11] fs fix --- R/feature_selection.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/feature_selection.R b/R/feature_selection.R index 210449c4..524dbabd 100644 --- a/R/feature_selection.R +++ b/R/feature_selection.R @@ -152,7 +152,7 @@ select_features <- function(input_data, } ) - if (nrow(vip_cubist_results) == 0) { + if (is.null(vip_cubist_results)) { votes_needed <- votes_needed - 1 } From bd8f177eb0da781e5390ea25ce0a025b0264fda7 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 12 Sep 2023 10:23:29 -0700 Subject: [PATCH 11/11] add arimax model --- R/models.R | 37 ++++++++++++++++++------------------- R/run_info.R | 2 +- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/R/models.R b/R/models.R index 1a12da6a..d4a54abb 100644 --- a/R/models.R +++ b/R/models.R @@ -4,7 +4,7 @@ #' @export list_models <- function() { list <- c( - "arima", "arima-boost", "cubist", "croston", "ets", "glmnet", "mars", "meanf", + "arima", "arima-boost", "arimax", "cubist", "croston", "ets", "glmnet", "mars", "meanf", "nnetar", "nnetar-xregs", "prophet", "prophet-boost", "prophet-xregs", "snaive", "stlm-arima", "stlm-ets", "svm-poly", "svm-rbf", "tbats", "theta", "xgboost" ) @@ -112,17 +112,17 @@ get_recipe_combo <- function(train_data) { #' @noRd get_recipe_configurable <- function(train_data, - mutate_adj_half = FALSE, - rm_date = "plain", - step_nzv = "zv", - norm_date_adj_year = FALSE, - dummy_one_hot = TRUE, - character_factor = FALSE, - center_scale = FALSE, - one_hot = FALSE, - pca = TRUE, - corr = FALSE, - lincomb = FALSE) { + mutate_adj_half = FALSE, + rm_date = "plain", + step_nzv = "zv", + norm_date_adj_year = FALSE, + dummy_one_hot = TRUE, + character_factor = FALSE, + center_scale = FALSE, + one_hot = FALSE, + pca = TRUE, + corr = FALSE, + lincomb = FALSE) { mutate_adj_half_fn <- function(df) { if (mutate_adj_half) { df %>% @@ -146,7 +146,7 @@ get_recipe_configurable 
<- function(train_data, "none" = df ) } - + corr_fn <- function(df) { if (corr) { df %>% @@ -215,7 +215,7 @@ get_recipe_configurable <- function(train_data, rm_lincomb_fn <- function(df) { if (lincomb) { df %>% - recipes::step_lincomb(recipes::all_numeric_predictors(), id = "remove_linear_combs") + recipes::step_lincomb(recipes::all_numeric_predictors(), id = "remove_linear_combs") } else { df } @@ -491,9 +491,8 @@ arima <- function(train_data, #' @return Get the ARIMAX based model #' @noRd arimax <- function(train_data, - frequency, - pca) { - + frequency, + pca) { recipe_spec_arimax <- train_data %>% get_recipe_configurable( step_nzv = "zv", @@ -506,12 +505,12 @@ arimax <- function(train_data, seasonal_period = frequency ) %>% parsnip::set_engine("auto_arima") - + wflw_spec <- get_workflow_simple( model_spec_arima, recipe_spec_arimax ) - + return(wflw_spec) } diff --git a/R/run_info.R b/R/run_info.R index 774d1f3f..bf9acc5b 100644 --- a/R/run_info.R +++ b/R/run_info.R @@ -80,7 +80,7 @@ set_run_info <- function(experiment_name = "finn_fcst", fs::dir_create(tempdir(), models_folder) fs::dir_create(tempdir(), forecasts_folder) fs::dir_create(tempdir(), logs_folder) - } else if (is.null(storage_object) & substr(path, 1, 6) == "/synfs") { + } else if (is.null(storage_object) & substr(path, 1, 6) == "/synfs") { temp_path <- stringr::str_replace(path, "/synfs/", "synfs:/") if (!dir.exists(fs::path(path, prep_data_folder) %>% as.character())) {