From 88fbb8a9bfe4ce086055b76c56ce429b27fa2589 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 30 Jul 2024 13:30:55 -0700 Subject: [PATCH 1/8] multistep horizon bug fix --- DESCRIPTION | 2 +- NEWS.md | 3 ++- R/ensemble_models.R | 5 ++++- R/multistep_cubist.R | 4 ++-- R/multistep_glmnet.R | 4 ++-- R/multistep_mars.R | 4 ++-- R/multistep_svm_poly.R | 6 +++--- R/multistep_svm_rbf.R | 4 ++-- R/multistep_xgboost.R | 4 ++-- R/prep_data.R | 2 +- R/train_models.R | 13 ++++++++++--- 11 files changed, 31 insertions(+), 20 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9354060e..953f3270 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.4.0.9005 +Version: 0.4.0.9006 Authors@R: c(person(given = "Mike", family = "Tokic", diff --git a/NEWS.md b/NEWS.md index ee1b746b..f29b3617 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# finnts 0.4.0.9005 (DEVELOPMENT VERSION) +# finnts 0.4.0.9006 (DEVELOPMENT VERSION) ## Improvements @@ -8,6 +8,7 @@ - Always save the most accurate model average, regardless if selected as best model. This allows for improved scaling with large data sets. 
- Automatically condense large forecasts (+10k time series) into smaller amount of files to make it easier to read forecast outputs - Improved weighted MAPE calculation across all time series +- Changed default for box_cox argument in `prep_data()` to FALSE ## Bug Fixes diff --git a/R/ensemble_models.R b/R/ensemble_models.R index e39b5270..21c3ce02 100644 --- a/R/ensemble_models.R +++ b/R/ensemble_models.R @@ -366,6 +366,7 @@ ensemble_models <- function(run_info, parallel_over = "everything" ) ) %>% + base::suppressMessages() %>% base::suppressWarnings() best_param <- tune::select_best(tune_results, metric = "rmse") @@ -397,7 +398,9 @@ ensemble_models <- function(run_info, pkgs = inner_packages, parallel_over = "everything" ) - ) + ) %>% + base::suppressMessages() %>% + base::suppressWarnings() final_fcst <- tune::collect_predictions(refit_tbl) %>% dplyr::rename( diff --git a/R/multistep_cubist.R b/R/multistep_cubist.R index e03c047f..08f849a2 100644 --- a/R/multistep_cubist.R +++ b/R/multistep_cubist.R @@ -469,8 +469,8 @@ cubist_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl_final <- xreg_tbl %>% dplyr::filter( - Run_Number >= start_val, - Run_Number <= lag_number + Run_Number >= as.numeric(start_val), + Run_Number <= as.numeric(lag_number) ) if (!is.null(xreg_tbl)) { diff --git a/R/multistep_glmnet.R b/R/multistep_glmnet.R index b67e8217..1c290f9a 100644 --- a/R/multistep_glmnet.R +++ b/R/multistep_glmnet.R @@ -457,8 +457,8 @@ glmnet_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl_final <- xreg_tbl %>% dplyr::filter( - Run_Number >= start_val, - Run_Number <= lag_number + Run_Number >= as.numeric(start_val), + Run_Number <= as.numeric(lag_number) ) if (!is.null(xreg_tbl)) { diff --git a/R/multistep_mars.R b/R/multistep_mars.R index 68899d9e..de13e595 100644 --- a/R/multistep_mars.R +++ b/R/multistep_mars.R @@ -480,8 +480,8 @@ mars_multistep_predict_impl <- function(object, new_data, ...) 
{ xreg_tbl_final <- xreg_tbl %>% dplyr::filter( - Run_Number >= start_val, - Run_Number <= lag_number + Run_Number >= as.numeric(start_val), + Run_Number <= as.numeric(lag_number) ) if (!is.null(xreg_tbl)) { diff --git a/R/multistep_svm_poly.R b/R/multistep_svm_poly.R index 4b57a8b6..19810ab8 100644 --- a/R/multistep_svm_poly.R +++ b/R/multistep_svm_poly.R @@ -506,8 +506,8 @@ svm_poly_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl_final <- xreg_tbl %>% dplyr::filter( - Run_Number >= start_val, - Run_Number <= lag_number + Run_Number >= as.numeric(start_val), + Run_Number <= as.numeric(lag_number) ) if (!is.null(xreg_tbl)) { @@ -518,7 +518,7 @@ svm_poly_multistep_predict_impl <- function(object, new_data, ...) { preds_svm_poly <- preds_svm_poly %>% dplyr::mutate(Row_Num = xreg_tbl_final$Row_Num) - + start_val <- as.numeric(lag_number) + 1 final_prediction <- rbind(final_prediction, preds_svm_poly) } diff --git a/R/multistep_svm_rbf.R b/R/multistep_svm_rbf.R index d7480912..01e925a4 100644 --- a/R/multistep_svm_rbf.R +++ b/R/multistep_svm_rbf.R @@ -486,8 +486,8 @@ svm_rbf_multistep_predict_impl <- function(object, new_data, ...) { xreg_tbl_final <- xreg_tbl %>% dplyr::filter( - Run_Number >= start_val, - Run_Number <= lag_number + Run_Number >= as.numeric(start_val), + Run_Number <= as.numeric(lag_number) ) if (!is.null(xreg_tbl)) { diff --git a/R/multistep_xgboost.R b/R/multistep_xgboost.R index 9ce5332d..2edc9c53 100644 --- a/R/multistep_xgboost.R +++ b/R/multistep_xgboost.R @@ -568,8 +568,8 @@ xgboost_multistep_predict_impl <- function(object, new_data, ...) 
{ xreg_tbl_temp <- xreg_tbl %>% dplyr::filter( - Run_Number >= start_val, - Run_Number <= lag_number + Run_Number >= as.numeric(start_val), + Run_Number <= as.numeric(lag_number) ) xreg_tbl_final <- xreg_tbl_temp %>% diff --git a/R/prep_data.R b/R/prep_data.R index 92108afd..b9b9efac 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -83,7 +83,7 @@ prep_data <- function(run_info, fiscal_year_start = 1, clean_missing_values = TRUE, clean_outliers = FALSE, - box_cox = TRUE, + box_cox = FALSE, stationary = TRUE, forecast_approach = "bottoms_up", parallel_processing = NULL, diff --git a/R/train_models.R b/R/train_models.R index d8f406e1..88a67598 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -515,9 +515,11 @@ train_models <- function(run_info, pkgs = inner_packages, parallel_over = "everything" ) - ) %>% - tune::collect_predictions() - + ) %>% + tune::collect_predictions() %>% + base::suppressMessages() %>% + base::suppressWarnings() + # finalize forecast final_fcst <- refit_tbl %>% dplyr::rename( @@ -533,6 +535,11 @@ train_models <- function(run_info, ) %>% dplyr::mutate(Hyperparameter_ID = hyperparameter_id) %>% dplyr::select(-.row, -.config) + + # check for future forecast + if(as.numeric(min(unique(final_fcst$Train_Test_ID))) != 1) { + stop("model is missing future forecast") + } # undo differencing transformation if (stationary & model %in% list_multivariate_models()) { From 81687c873e06c4fddef6a6a34272d594f6498dba Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 30 Jul 2024 13:34:08 -0700 Subject: [PATCH 2/8] code formatting --- R/ensemble_models.R | 2 +- R/multistep_svm_poly.R | 2 +- R/train_models.R | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/ensemble_models.R b/R/ensemble_models.R index 21c3ce02..5fab7aef 100644 --- a/R/ensemble_models.R +++ b/R/ensemble_models.R @@ -398,7 +398,7 @@ ensemble_models <- function(run_info, pkgs = inner_packages, parallel_over = "everything" ) - ) %>% + ) %>% base::suppressMessages() 
%>% base::suppressWarnings() diff --git a/R/multistep_svm_poly.R b/R/multistep_svm_poly.R index 19810ab8..55e6e084 100644 --- a/R/multistep_svm_poly.R +++ b/R/multistep_svm_poly.R @@ -518,7 +518,7 @@ svm_poly_multistep_predict_impl <- function(object, new_data, ...) { preds_svm_poly <- preds_svm_poly %>% dplyr::mutate(Row_Num = xreg_tbl_final$Row_Num) - + start_val <- as.numeric(lag_number) + 1 final_prediction <- rbind(final_prediction, preds_svm_poly) } diff --git a/R/train_models.R b/R/train_models.R index 88a67598..35b98028 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -515,11 +515,11 @@ train_models <- function(run_info, pkgs = inner_packages, parallel_over = "everything" ) - ) %>% + ) %>% tune::collect_predictions() %>% base::suppressMessages() %>% base::suppressWarnings() - + # finalize forecast final_fcst <- refit_tbl %>% dplyr::rename( @@ -535,9 +535,9 @@ train_models <- function(run_info, ) %>% dplyr::mutate(Hyperparameter_ID = hyperparameter_id) %>% dplyr::select(-.row, -.config) - + # check for future forecast - if(as.numeric(min(unique(final_fcst$Train_Test_ID))) != 1) { + if (as.numeric(min(unique(final_fcst$Train_Test_ID))) != 1) { stop("model is missing future forecast") } From ebe952cee5857733902b35498513bc076f6c6558 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 30 Jul 2024 13:57:08 -0700 Subject: [PATCH 3/8] doc update --- man/prep_data.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/prep_data.Rd b/man/prep_data.Rd index d802a053..a70731a5 100644 --- a/man/prep_data.Rd +++ b/man/prep_data.Rd @@ -18,7 +18,7 @@ prep_data( fiscal_year_start = 1, clean_missing_values = TRUE, clean_outliers = FALSE, - box_cox = TRUE, + box_cox = FALSE, stationary = TRUE, forecast_approach = "bottoms_up", parallel_processing = NULL, From 1095722907fd9e594437ecc1b07c5440241cb77a Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Thu, 1 Aug 2024 08:32:47 -0700 Subject: [PATCH 4/8] ensure proper date formatting --- R/run_info.R | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/run_info.R b/R/run_info.R index bffece82..4ee4b9e1 100644 --- a/R/run_info.R +++ b/R/run_info.R @@ -202,7 +202,7 @@ set_run_info <- function(experiment_name = "finn_fcst", output_tbl <- tibble::tibble( experiment_name = experiment_name, run_name = run_name, - created = created, + created = format(created, "%Y-%m-%d %H:%M:%S"), path = path, data_output = data_output, object_output = object_output From 40a2c0b66a582e430b92368fff581a54865f5170 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Thu, 1 Aug 2024 11:34:44 -0700 Subject: [PATCH 5/8] date format updates to all major function calls --- R/final_models.R | 3 ++- R/prep_data.R | 3 ++- R/prep_models.R | 11 ++++++++--- R/run_info.R | 2 +- R/train_models.R | 3 ++- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/R/final_models.R b/R/final_models.R index c8cac696..c9a82c78 100644 --- a/R/final_models.R +++ b/R/final_models.R @@ -667,7 +667,8 @@ final_models <- function(run_info, dplyr::mutate( average_models = average_models, max_model_average = max_model_average, - weighted_mape = round(weighted_mape, digits = 4) + weighted_mape = round(weighted_mape, digits = 4), + created = format(created, "%Y-%m-%d %H:%M:%S") ) write_data( diff --git a/R/prep_data.R b/R/prep_data.R index b9b9efac..4e0ac650 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -730,7 +730,8 @@ prep_data <- function(run_info, lag_periods = ifelse(is.null(lag_periods), NA, paste(lag_periods, collapse = "---")), rolling_window_periods = ifelse(is.null(rolling_window_periods), NA, paste(rolling_window_periods, collapse = "---")), recipes_to_run = ifelse(is.null(recipes_to_run), NA, paste(recipes_to_run, collapse = "---")), - multistep_horizon = multistep_horizon + multistep_horizon = multistep_horizon, + created = format(created, "%Y-%m-%d %H:%M:%S") ) write_data( diff --git a/R/prep_models.R b/R/prep_models.R index 5602820d..62cfa62a 100644 --- a/R/prep_models.R +++
b/R/prep_models.R @@ -389,7 +389,8 @@ train_test_split <- function(run_info, dplyr::mutate( back_test_scenarios = ifelse(is.null(back_test_scenarios), NA, back_test_scenarios), back_test_spacing = ifelse(is.null(back_test_spacing), NA, back_test_spacing), - run_ensemble_models = run_ensemble_models + run_ensemble_models = run_ensemble_models, + created = format(created, "%Y-%m-%d %H:%M:%S") ) write_data( @@ -625,7 +626,8 @@ model_workflows <- function(run_info, dplyr::mutate( models_to_run = ifelse(is.null(models_to_run), NA, paste(models_to_run, collapse = "---")), models_not_to_run = ifelse(is.null(models_not_to_run), NA, paste(models_not_to_run, collapse = "---")), - pca = ifelse(is.null(pca), NA, pca) + pca = ifelse(is.null(pca), NA, pca), + created = format(created, "%Y-%m-%d %H:%M:%S") ) write_data( @@ -804,7 +806,10 @@ model_hyperparameters <- function(run_info, # update logging file log_df <- log_df %>% - dplyr::mutate(num_hyperparameters = num_hyperparameters) + dplyr::mutate( + num_hyperparameters = num_hyperparameters, + created = format(created, "%Y-%m-%d %H:%M:%S") + ) write_data( x = log_df, diff --git a/R/run_info.R b/R/run_info.R index 4ee4b9e1..c831d3cd 100644 --- a/R/run_info.R +++ b/R/run_info.R @@ -192,7 +192,7 @@ set_run_info <- function(experiment_name = "finn_fcst", output_list <- list( experiment_name = experiment_name, run_name = run_name, - created = created, + created = format(created, "%Y-%m-%d %H:%M:%S"), storage_object = storage_object, path = path, data_output = data_output, diff --git a/R/train_models.R b/R/train_models.R index 35b98028..2d8e1ea3 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -741,7 +741,8 @@ train_models <- function(run_info, feature_selection = feature_selection, seed = seed, negative_forecast = negative_forecast, - inner_parallel = inner_parallel + inner_parallel = inner_parallel, + created = format(created, "%Y-%m-%d %H:%M:%S") ) write_data( From 9c99089304e190d4645a10fbae96de6547627f79 Mon Sep 17 
00:00:00 2001 From: Mike Tokic Date: Fri, 2 Aug 2024 09:06:45 -0700 Subject: [PATCH 6/8] date formatting in csv files --- R/final_models.R | 3 +-- R/prep_data.R | 3 +-- R/prep_models.R | 9 +++------ R/read_write_data.R | 7 +++++++ R/run_info.R | 4 ++-- R/train_models.R | 3 +-- 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/R/final_models.R b/R/final_models.R index c9a82c78..c8cac696 100644 --- a/R/final_models.R +++ b/R/final_models.R @@ -667,8 +667,7 @@ final_models <- function(run_info, dplyr::mutate( average_models = average_models, max_model_average = max_model_average, - weighted_mape = round(weighted_mape, digits = 4), - created = format(created, "%Y-%m-%d %H:%M:%S") + weighted_mape = round(weighted_mape, digits = 4) ) write_data( diff --git a/R/prep_data.R b/R/prep_data.R index 4e0ac650..b9b9efac 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -730,8 +730,7 @@ prep_data <- function(run_info, lag_periods = ifelse(is.null(lag_periods), NA, paste(lag_periods, collapse = "---")), rolling_window_periods = ifelse(is.null(rolling_window_periods), NA, paste(rolling_window_periods, collapse = "---")), recipes_to_run = ifelse(is.null(recipes_to_run), NA, paste(recipes_to_run, collapse = "---")), - multistep_horizon = multistep_horizon, - created = format(created, "%Y-%m-%d %H:%M:%S") + multistep_horizon = multistep_horizon ) write_data( diff --git a/R/prep_models.R b/R/prep_models.R index 62cfa62a..c848a12e 100644 --- a/R/prep_models.R +++ b/R/prep_models.R @@ -389,8 +389,7 @@ train_test_split <- function(run_info, dplyr::mutate( back_test_scenarios = ifelse(is.null(back_test_scenarios), NA, back_test_scenarios), back_test_spacing = ifelse(is.null(back_test_spacing), NA, back_test_spacing), - run_ensemble_models = run_ensemble_models, - created = format(created, "%Y-%m-%d %H:%M:%S") + run_ensemble_models = run_ensemble_models ) write_data( @@ -626,8 +625,7 @@ model_workflows <- function(run_info, dplyr::mutate( models_to_run = 
ifelse(is.null(models_to_run), NA, paste(models_to_run, collapse = "---")), models_not_to_run = ifelse(is.null(models_not_to_run), NA, paste(models_not_to_run, collapse = "---")), - pca = ifelse(is.null(pca), NA, pca), - created = format(created, "%Y-%m-%d %H:%M:%S") + pca = ifelse(is.null(pca), NA, pca) ) write_data( @@ -807,8 +805,7 @@ model_hyperparameters <- function(run_info, # update logging file log_df <- log_df %>% dplyr::mutate( - num_hyperparameters = num_hyperparameters, - created = format(created, "%Y-%m-%d %H:%M:%S") + num_hyperparameters = num_hyperparameters ) write_data( diff --git a/R/read_write_data.R b/R/read_write_data.R index 2e43572a..403debac 100644 --- a/R/read_write_data.R +++ b/R/read_write_data.R @@ -407,10 +407,17 @@ write_data <- function(x, write_data_type <- function(x, path, type) { + if (type == "csv") { + if (nrow(x) == 1) { + type <- "log" + } + } + switch(type, rds = saveRDS(x, path), parquet = arrow::write_parquet(x, path), csv = vroom::vroom_write(x, path, delim = ",", progress = FALSE), + log = write.csv(x, path, row.names = FALSE), qs = qs::qsave(x, path) ) } diff --git a/R/run_info.R b/R/run_info.R index c831d3cd..bffece82 100644 --- a/R/run_info.R +++ b/R/run_info.R @@ -192,7 +192,7 @@ set_run_info <- function(experiment_name = "finn_fcst", output_list <- list( experiment_name = experiment_name, run_name = run_name, - created = format(created, "%Y-%m-%d %H:%M:%S"), + created = created, storage_object = storage_object, path = path, data_output = data_output, @@ -202,7 +202,7 @@ set_run_info <- function(experiment_name = "finn_fcst", output_tbl <- tibble::tibble( experiment_name = experiment_name, run_name = run_name, - created = format(created, "%Y-%m-%d %H:%M:%S"), + created = created, path = path, data_output = data_output, object_output = object_output diff --git a/R/train_models.R b/R/train_models.R index 2d8e1ea3..35b98028 100644 --- a/R/train_models.R +++ b/R/train_models.R @@ -741,8 +741,7 @@ train_models <- 
function(run_info, feature_selection = feature_selection, seed = seed, negative_forecast = negative_forecast, - inner_parallel = inner_parallel, - created = format(created, "%Y-%m-%d %H:%M:%S") + inner_parallel = inner_parallel ) write_data( From 6e493fb26e540d995937ca75588acfffd891cbfc Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Fri, 2 Aug 2024 09:14:37 -0700 Subject: [PATCH 7/8] attach package --- R/read_write_data.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/read_write_data.R b/R/read_write_data.R index 403debac..d37e2ecc 100644 --- a/R/read_write_data.R +++ b/R/read_write_data.R @@ -417,7 +417,7 @@ write_data_type <- function(x, rds = saveRDS(x, path), parquet = arrow::write_parquet(x, path), csv = vroom::vroom_write(x, path, delim = ",", progress = FALSE), - log = write.csv(x, path, row.names = FALSE), + log = base::write.csv(x, path, row.names = FALSE), qs = qs::qsave(x, path) ) } From 97aae40236683955cacd3e4299b80493c702b679 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Fri, 2 Aug 2024 09:30:18 -0700 Subject: [PATCH 8/8] attach correct package --- R/read_write_data.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/read_write_data.R b/R/read_write_data.R index d37e2ecc..b0587a55 100644 --- a/R/read_write_data.R +++ b/R/read_write_data.R @@ -417,7 +417,7 @@ write_data_type <- function(x, rds = saveRDS(x, path), parquet = arrow::write_parquet(x, path), csv = vroom::vroom_write(x, path, delim = ",", progress = FALSE), - log = base::write.csv(x, path, row.names = FALSE), + log = utils::write.csv(x, path, row.names = FALSE), qs = qs::qsave(x, path) ) }