diff --git a/DESCRIPTION b/DESCRIPTION index 7fcd9e36..577c9e82 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.3.0.9001 +Version: 0.3.0.9002 Authors@R: c(person(given = "Mike", family = "Tokic", diff --git a/NEWS.md b/NEWS.md index 97ae71f3..f6805dfd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,10 +1,15 @@ -# finnts 0.3.0.9001 (DEVELOPMENT VERSION) +# finnts 0.3.0.9002 (DEVELOPMENT VERSION) ## Improvements - Tidymodels speed up - Added external regressor support for ARIMA by introducing a new model option of `arimax`, which uses engineered features in addition to any external regressors supplied. - Automated feature selection, refer to feature selection vignette for more details +- Error handling in hierarchical forecast reconciliation + +## Bug Fixes + +- Best model selection # finnts 0.3.0 diff --git a/R/final_models.R b/R/final_models.R index 2ed69880..68510c0c 100644 --- a/R/final_models.R +++ b/R/final_models.R @@ -196,58 +196,105 @@ final_models <- function(run_info, single_model_tbl <- NULL if (run_local_models) { - suppressWarnings(try(single_model_tbl <- read_file(run_info, - path = paste0( - "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), - "-", combo, "-single_models.", run_info$data_output - ), - return_type = "df" - ), - silent = TRUE - )) + single_model_tbl <- tryCatch( + { + read_file(run_info, + path = paste0( + "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), + "-", combo, "-single_models.", run_info$data_output + ), + return_type = "df" + ) + }, + warning = function(w) { + # do nothing + }, + error = function(e) { + NULL + } + ) } ensemble_model_tbl <- NULL if (run_ensemble_models) { - suppressWarnings(try(ensemble_model_tbl <- read_file(run_info, - path = paste0( - "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), - "-", combo, "-ensemble_models.", run_info$data_output - ), - return_type = "df" - ), - silent = TRUE - )) + ensemble_model_tbl <- tryCatch( + { + read_file(run_info, + path = paste0( + "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), + "-", combo, "-ensemble_models.", run_info$data_output + ), + return_type = "df" + ) + }, + warning = function(w) { + # do nothing + }, + error = function(e) { + NULL + } + ) } global_model_tbl <- NULL if (run_global_models) { - suppressWarnings(try(global_model_tbl <- read_file(run_info, - path = paste0( - "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), - "-", combo, "-global_models.", run_info$data_output - ), - return_type = "df" - ), - silent = TRUE - )) + global_model_tbl <- tryCatch( + { + read_file(run_info, + path = paste0( + "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), + "-", combo, "-global_models.", run_info$data_output + ), + return_type = "df" + ) + }, + warning = function(w) { + # do nothing + }, + error = function(e) { + NULL + } + ) } local_model_tbl <- single_model_tbl %>% rbind(ensemble_model_tbl) - predictions_tbl <- local_model_tbl %>% - rbind(global_model_tbl) %>% - dplyr::select(Combo, Model_ID, Model_Name, Model_Type, Recipe_ID, Train_Test_ID, Date, Forecast, Target) %>% - dplyr::filter(Train_Test_ID %in% train_test_id_list) - # check if model averaging already happened if ("Best_Model" %in% colnames(local_model_tbl %>% rbind(global_model_tbl))) { + # see if average models file exists and add to model tbl + average_model_tbl <- tryCatch( + { + read_file(run_info, + path = paste0( + "/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), + "-", combo, "-average_models.", run_info$data_output + ), + return_type = "df" + ) + }, + warning = function(w) { + # do nothing + }, + error = function(e) { + NULL + } + ) + + local_model_tbl <- local_model_tbl %>% + rbind(average_model_tbl) + best_model_check <- TRUE } else { best_model_check <- FALSE } + # combine all forecasts + predictions_tbl <- local_model_tbl %>% + rbind(global_model_tbl) %>% + dplyr::select(Combo, Model_ID, Model_Name, Model_Type, Recipe_ID, Train_Test_ID, Date, Forecast, Target) %>% + dplyr::filter(Train_Test_ID %in% train_test_id_list) + # get model list if (!is.null(local_model_tbl)) { local_model_list <- local_model_tbl %>% diff --git a/R/hierarchy.R b/R/hierarchy.R index 65314380..06f468a1 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -360,71 +360,71 @@ reconcile_hierarchical_data <- function(run_info, { model <- x - if (model == "Best-Model") { - model_tbl <- unreconciled_tbl %>% - dplyr::filter(Best_Model == "Yes") %>% - dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), - by = "Train_Test_ID" - ) %>% - dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) - } else { - model_tbl <- unreconciled_tbl %>% - dplyr::filter(Model_ID == model) %>% - dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), - by = "Train_Test_ID" - ) %>% - dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) - } - - if (length(unique(model_tbl$Combo)) != length(hts_combo_list)) { - # add snaive fcst to missing combos to get a full hierarchy of forecasts to reconcile - snaive_combo_list <- setdiff(hts_combo_list, unique(model_tbl$Combo)) - - snaive_tbl <- unreconciled_tbl %>% - dplyr::filter(Model_Name == "snaive") %>% - dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), - by = "Train_Test_ID" - ) %>% - dplyr::filter( - Run_Type %in% c("Future_Forecast", "Back_Test"), - Combo %in% snaive_combo_list - ) + ts_combined <- NULL - model_tbl <- model_tbl %>% - rbind(snaive_tbl) - } + tryCatch( + { + if (model == "Best-Model") { + model_tbl <- unreconciled_tbl %>% + dplyr::filter(Best_Model == "Yes") %>% + dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), + by = "Train_Test_ID" + ) %>% + dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) + } else { + model_tbl <- unreconciled_tbl %>% + dplyr::filter(Model_ID == model) %>% + dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), + by = "Train_Test_ID" + ) %>% + dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) + } - forecast_tbl <- model_tbl %>% - dplyr::select(Date, Train_Test_ID, Combo, Forecast) %>% - tidyr::pivot_wider(names_from = Combo, values_from = Forecast) + if (length(unique(model_tbl$Combo)) != length(hts_combo_list)) { + # add snaive fcst to missing combos to get a full hierarchy of forecasts to reconcile + snaive_combo_list <- setdiff(hts_combo_list, unique(model_tbl$Combo)) + + snaive_tbl <- unreconciled_tbl %>% + dplyr::filter(Model_Name == "snaive") %>% + dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), + by = "Train_Test_ID" + ) %>% + dplyr::filter( + Run_Type %in% c("Future_Forecast", "Back_Test"), + Combo %in% snaive_combo_list + ) + + model_tbl <- model_tbl %>% + rbind(snaive_tbl) + } - forecast_tbl[is.na(forecast_tbl)] <- 0 + forecast_tbl <- model_tbl %>% + dplyr::select(Date, Train_Test_ID, Combo, Forecast) %>% + tidyr::pivot_wider(names_from = Combo, values_from = Forecast) - date_tbl <- forecast_tbl %>% - dplyr::select(Date, Train_Test_ID) + forecast_tbl[is.na(forecast_tbl)] <- 0 - ts <- forecast_tbl %>% - dplyr::select(-Date, -Train_Test_ID) %>% - dplyr::select(hts_combo_list) %>% - stats::ts() + date_tbl <- forecast_tbl %>% + dplyr::select(Date, Train_Test_ID) - residual_multiplier <- 10 # shrink extra large residuals to prevent recon issues + ts <- forecast_tbl %>% + dplyr::select(-Date, -Train_Test_ID) %>% + dplyr::select(hts_combo_list) %>% + stats::ts() - residuals_tbl <- model_tbl %>% - dplyr::filter(Run_Type == "Back_Test") %>% - dplyr::mutate( - Forecast_Adj = ifelse((abs(Target) + 1) * residual_multiplier < abs(Forecast), (Target + 1) * residual_multiplier, Forecast), # prevent hts recon issues - Residual = Target - Forecast_Adj - ) %>% - dplyr::select(Combo, Date, Train_Test_ID, Residual) %>% - tidyr::pivot_wider(names_from = Combo, values_from = Residual) %>% - dplyr::select(-Date, -Train_Test_ID) %>% - dplyr::select(hts_combo_list) %>% - as.matrix() + residual_multiplier <- 10 # shrink extra large residuals to prevent recon issues - tryCatch( - { - ts_combined <- NULL + residuals_tbl <- model_tbl %>% + dplyr::filter(Run_Type == "Back_Test") %>% + dplyr::mutate( + Forecast_Adj = ifelse((abs(Target) + 1) * residual_multiplier < abs(Forecast), (Target + 1) * residual_multiplier, Forecast), # prevent hts recon issues + Residual = Target - Forecast_Adj + ) %>% + dplyr::select(Combo, Date, Train_Test_ID, Residual) %>% + tidyr::pivot_wider(names_from = Combo, values_from = Residual) %>% + dplyr::select(-Date, -Train_Test_ID) %>% + dplyr::select(hts_combo_list) %>% + as.matrix() if (forecast_approach == "standard_hierarchy") { ts_combined <- data.frame(hts::combinef(ts, @@ -453,10 +453,12 @@ reconcile_hierarchical_data <- function(run_info, } ) + # return if there was an error in the recon process for non best-model if (is.null(ts_combined)) { return(tibble::tibble()) } + # final transformations before writing to disk reconciled_tbl <- ts_combined %>% tibble::add_column( Train_Test_ID = date_tbl$Train_Test_ID, @@ -536,110 +538,110 @@ reconcile_hierarchical_data <- function(run_info, { model <- x - hist_tbl <- read_file(run_info, - path = paste0("/prep_data/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), "-hts_data.", run_info$data_output) - ) %>% - dplyr::select(Combo, Date, Target) + ts_combined <- NULL - fcst_path <- paste0( - "/forecasts/*", hash_data(run_info$experiment_name), "-", - hash_data(run_info$run_name), "*models", ".", run_info$data_output - ) - - schema <- arrow::schema( - arrow::field("Combo_ID", arrow::string()), - arrow::field("Model_ID", arrow::string()), - arrow::field("Model_Name", arrow::string()), - arrow::field("Model_Type", arrow::string()), - arrow::field("Recipe_ID", arrow::string()), - arrow::field("Train_Test_ID", arrow::float64()), - arrow::field("Hyperparameter_ID", arrow::float64()), - arrow::field("Best_Model", arrow::string()), - arrow::field("Combo", arrow::string()), - arrow::field("Horizon", arrow::float64()), - arrow::field("Date", arrow::date32()), - arrow::field("Target", arrow::float64()), - arrow::field("Forecast", arrow::float64()), - arrow::field("lo_95", arrow::float64()), - arrow::field("lo_80", arrow::float64()), - arrow::field("hi_80", arrow::float64()), - arrow::field("hi_95", arrow::float64()) - ) - - unreconciled_tbl <- read_file(run_info, - path = fcst_path, - return_type = "arrow", - schema = schema - ) - - if (model == "Best-Model") { - model_tbl <- unreconciled_tbl %>% - dplyr::filter(Best_Model == "Yes") %>% - dplyr::collect() %>% - dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), - by = "Train_Test_ID" - ) %>% - dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) - } else { - model_tbl <- unreconciled_tbl %>% - dplyr::filter(Model_ID == model) %>% - dplyr::collect() %>% - dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), - by = "Train_Test_ID" + tryCatch( + { + hist_tbl <- read_file(run_info, + path = paste0("/prep_data/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), "-hts_data.", run_info$data_output) ) %>% - dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) - } + dplyr::select(Combo, Date, Target) - if (length(unique(model_tbl$Combo)) != length(hts_combo_list)) { - # add snaive fcst to missing combos to get a full hierarchy of forecasts to reconcile - snaive_combo_list <- setdiff(hts_combo_list, unique(model_tbl$Combo)) + fcst_path <- paste0( + "/forecasts/*", hash_data(run_info$experiment_name), "-", + hash_data(run_info$run_name), "*models", ".", run_info$data_output + ) - snaive_tbl <- unreconciled_tbl %>% - dplyr::filter(Model_Name == "snaive") %>% - dplyr::collect() %>% - dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), - by = "Train_Test_ID" - ) %>% - dplyr::filter( - Run_Type %in% c("Future_Forecast", "Back_Test"), - Combo %in% snaive_combo_list + schema <- arrow::schema( + arrow::field("Combo_ID", arrow::string()), + arrow::field("Model_ID", arrow::string()), + arrow::field("Model_Name", arrow::string()), + arrow::field("Model_Type", arrow::string()), + arrow::field("Recipe_ID", arrow::string()), + arrow::field("Train_Test_ID", arrow::float64()), + arrow::field("Hyperparameter_ID", arrow::float64()), + arrow::field("Best_Model", arrow::string()), + arrow::field("Combo", arrow::string()), + arrow::field("Horizon", arrow::float64()), + arrow::field("Date", arrow::date32()), + arrow::field("Target", arrow::float64()), + arrow::field("Forecast", arrow::float64()), + arrow::field("lo_95", arrow::float64()), + arrow::field("lo_80", arrow::float64()), + arrow::field("hi_80", arrow::float64()), + arrow::field("hi_95", arrow::float64()) ) - model_tbl <- model_tbl %>% - rbind(snaive_tbl) - } + unreconciled_tbl <- read_file(run_info, + path = fcst_path, + return_type = "arrow", + schema = schema + ) - forecast_tbl <- model_tbl %>% - dplyr::select(Date, Train_Test_ID, Combo, Forecast) %>% - tidyr::pivot_wider(names_from = Combo, values_from = Forecast) + if (model == "Best-Model") { + model_tbl <- unreconciled_tbl %>% + dplyr::filter(Best_Model == "Yes") %>% + dplyr::collect() %>% + dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), + by = "Train_Test_ID" + ) %>% + dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) + } else { + model_tbl <- unreconciled_tbl %>% + dplyr::filter(Model_ID == model) %>% + dplyr::collect() %>% + dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), + by = "Train_Test_ID" + ) %>% + dplyr::filter(Run_Type %in% c("Future_Forecast", "Back_Test")) + } - forecast_tbl[is.na(forecast_tbl)] <- 0 + if (length(unique(model_tbl$Combo)) != length(hts_combo_list)) { + # add snaive fcst to missing combos to get a full hierarchy of forecasts to reconcile + snaive_combo_list <- setdiff(hts_combo_list, unique(model_tbl$Combo)) + + snaive_tbl <- unreconciled_tbl %>% + dplyr::filter(Model_Name == "snaive") %>% + dplyr::collect() %>% + dplyr::left_join(model_train_test_tbl %>% dplyr::select(Run_Type, Train_Test_ID), + by = "Train_Test_ID" + ) %>% + dplyr::filter( + Run_Type %in% c("Future_Forecast", "Back_Test"), + Combo %in% snaive_combo_list + ) + + model_tbl <- model_tbl %>% + rbind(snaive_tbl) + } - date_tbl <- forecast_tbl %>% - dplyr::select(Date, Train_Test_ID) + forecast_tbl <- model_tbl %>% + dplyr::select(Date, Train_Test_ID, Combo, Forecast) %>% + tidyr::pivot_wider(names_from = Combo, values_from = Forecast) - ts <- forecast_tbl %>% - tibble::as_tibble() %>% - dplyr::select(tidyselect::all_of(hts_combo_list)) %>% - stats::ts() + forecast_tbl[is.na(forecast_tbl)] <- 0 - residual_multiplier <- 10 # shrink extra large residuals to prevent recon issues + date_tbl <- forecast_tbl %>% + dplyr::select(Date, Train_Test_ID) - residuals_tbl <- model_tbl %>% - dplyr::filter(Run_Type == "Back_Test") %>% - dplyr::mutate( - Forecast_Adj = ifelse((abs(Target) + 1) * residual_multiplier < abs(Forecast), (Target + 1) * residual_multiplier, Forecast), # prevent hts recon issues - Residual = Target - Forecast_Adj - ) %>% - dplyr::select(Combo, Date, Train_Test_ID, Residual) %>% - tidyr::pivot_wider(names_from = Combo, values_from = Residual) %>% - tibble::as_tibble() %>% - dplyr::select(tidyselect::all_of(hts_combo_list)) %>% - as.matrix() + ts <- forecast_tbl %>% + tibble::as_tibble() %>% + dplyr::select(tidyselect::all_of(hts_combo_list)) %>% + stats::ts() - tryCatch( - { - ts_combined <- NULL + residual_multiplier <- 10 # shrink extra large residuals to prevent recon issues + + residuals_tbl <- model_tbl %>% + dplyr::filter(Run_Type == "Back_Test") %>% + dplyr::mutate( + Forecast_Adj = ifelse((abs(Target) + 1) * residual_multiplier < abs(Forecast), (Target + 1) * residual_multiplier, Forecast), # prevent hts recon issues + Residual = Target - Forecast_Adj + ) %>% + dplyr::select(Combo, Date, Train_Test_ID, Residual) %>% + tidyr::pivot_wider(names_from = Combo, values_from = Residual) %>% + tibble::as_tibble() %>% + dplyr::select(tidyselect::all_of(hts_combo_list)) %>% + as.matrix() if (forecast_approach == "standard_hierarchy") { ts_combined <- data.frame(hts::combinef(ts, @@ -668,10 +670,12 @@ reconcile_hierarchical_data <- function(run_info, } ) + # return if there was an error in reconciling a non best-model if (is.null(ts_combined)) { return(tibble::tibble()) } + # final transformations before writing to disk reconciled_tbl <- ts_combined %>% tibble::add_column( Train_Test_ID = date_tbl$Train_Test_ID,