Mitokic/07172024/best model scaling #163

Merged 17 commits on Jul 29, 2024
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: finnts
Title: Microsoft Finance Time Series Forecasting Framework
Version: 0.4.0.9004
Version: 0.4.0.9005
Authors@R:
c(person(given = "Mike",
family = "Tokic",
@@ -24,7 +24,7 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.3.1
Imports:
cli,
Cubist,
5 changes: 4 additions & 1 deletion NEWS.md
@@ -1,10 +1,13 @@
# finnts 0.4.0.9004 (DEVELOPMENT VERSION)
# finnts 0.4.0.9005 (DEVELOPMENT VERSION)

## Improvements

- Added support for hierarchical forecasting with external regressors
- Allow global models for hierarchical forecasts
- Multistep horizon forecasts for the R1 recipe, available as `multistep_horizon` within `prep_data()`
- Always save the most accurate model average, regardless of whether it is selected as the best model. This improves scaling with large data sets.
- Automatically condense large forecast runs (10k+ time series) into a smaller number of files, making forecast outputs easier to read
- Improved weighted MAPE calculation across all time series (a sketch of the calculation follows this list)
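
The weighted MAPE change is easiest to see in isolation. Below is a minimal sketch of a target-weighted MAPE on a toy back-test table; the `Target`/`Forecast` column names and the zero-target guard mirror the `final_models()` hunk further down, while the toy data and its values are purely illustrative.

```r
library(dplyr)

# Toy back-test results; values are illustrative only.
back_test <- tibble::tibble(
  Combo    = c("A", "A", "B", "B"),
  Target   = c(100, 80, 50, 200),
  Forecast = c(110, 76, 45, 180)
)

weighted_mape <- back_test %>%
  mutate(
    Target = ifelse(Target == 0, 0.1, Target),   # guard against divide-by-zero
    MAPE   = abs((Forecast - Target) / Target),  # per-row absolute percentage error
    Weight = Target / sum(Target, na.rm = TRUE)  # larger targets get more weight
  ) %>%
  summarise(weighted_mape = sum(MAPE * Weight, na.rm = TRUE)) %>%
  pull(weighted_mape)

round(weighted_mape, digits = 4)
```

Weighting each row by the size of its target keeps very small series from dominating the run-level accuracy number, which appears to be the intent of the change.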

## Bug Fixes

31 changes: 16 additions & 15 deletions R/ensemble_models.R
@@ -181,27 +181,29 @@ ensemble_models <- function(run_info,
# model forecasts
single_model_tbl <- NULL
if (run_local_models) {
suppressWarnings(try(single_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-single_models.", run_info$data_output
suppressWarnings(try(
single_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-single_models.", run_info$data_output
),
return_type = "df"
),
return_type = "df"
),
silent = TRUE
silent = TRUE
))
}

global_model_tbl <- NULL
if (run_global_models) {
suppressWarnings(try(global_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-global_models.", run_info$data_output
suppressWarnings(try(
global_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-global_models.", run_info$data_output
),
return_type = "df"
),
return_type = "df"
),
silent = TRUE
silent = TRUE
))
}
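
The hunk above only re-indents an existing idiom, so for readers skimming the diff, here is a standalone sketch of that "optional read" pattern: try to load a file that may not exist yet and fall back to `NULL` instead of erroring. The `read_optional()` helper, the use of `readRDS()`, and the path are assumptions for illustration, not finnts API.

```r
# Standalone sketch of the suppressWarnings(try(...)) idiom used above.
read_optional <- function(path) {
  result <- NULL
  suppressWarnings(try(
    result <- readRDS(path),  # any reader that errors on a missing file
    silent = TRUE
  ))
  result                      # stays NULL when the read failed
}

single_model_tbl <- read_optional("forecasts/demo-combo-single_models.rds")
if (is.null(single_model_tbl)) {
  message("No single-model forecasts for this combo yet; skipping.")
}
```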

@@ -336,7 +338,6 @@ ensemble_models <- function(run_info,
.multicombine = TRUE,
.noexport = NULL
) %do% {

# get initial run info
model <- model_run %>%
dplyr::pull(Model_Name)
4 changes: 0 additions & 4 deletions R/feature_selection.R
@@ -23,10 +23,8 @@ run_feature_selection <- function(input_data,
forecast_horizon,
external_regressors,
multistep_horizon = FALSE) {

# check for more than one unique target value
if (input_data %>% tidyr::drop_na(Target) %>% dplyr::pull(Target) %>% unique() %>% length() < 2) {

# just return the date features
fs_list <- input_data %>%
dplyr::select(tidyselect::contains("Date"))
@@ -83,7 +81,6 @@ run_feature_selection <- function(input_data,

# run feature selection
if (date_type %in% c("day", "week")) {

# number of votes needed for feature to be selected
votes_needed <- 3

@@ -410,7 +407,6 @@ lofo_fn <- function(run_info,
parallel_processing,
pca = FALSE,
seed = 123) {

# parallel run info
par_info <- par_start(
run_info = run_info,
133 changes: 96 additions & 37 deletions R/final_models.R
@@ -3,7 +3,8 @@
#' Select Best Models and Prep Final Outputs
#'
#' @param run_info run info using the [set_run_info()] function.
#' @param average_models If TRUE, create simple averages of individual models.
#' @param average_models If TRUE, create simple averages of individual models
#' and save the most accurate one.
#' @param max_model_average Max number of models to average together. Will
#' create model averages for 2 models up until the input value or the max
#' number of models run.
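
As context for the two documented arguments, here is a hedged usage sketch of `final_models()`. Only `run_info`, `average_models`, and `max_model_average` come from this diff; the `set_run_info()` argument names and values are assumptions for illustration, and the upstream data-prep and model-training steps are elided.

```r
library(finnts)

# Illustrative run setup; the argument names passed to set_run_info() are assumptions.
run_info <- set_run_info(
  experiment_name = "finance_demo",
  run_name        = "best_model_scaling"
)

# ... prepare data and train the individual models for this run_info ...

final_models(
  run_info          = run_info,
  average_models    = TRUE, # build simple averages of individual models and save the most accurate one
  max_model_average = 3     # average combinations of 2 models up to 3 models
)
```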
@@ -124,7 +125,8 @@ final_models <- function(run_info,
current_combo_list_final <- setdiff(
current_combo_list,
prev_combo_list
)
) %>%
sample()

prev_log_df <- read_file(run_info,
path = paste0("logs/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), ".csv"),
@@ -138,8 +140,7 @@
run_local_models <- prev_log_df$run_local_models
run_ensemble_models <- prev_log_df$run_ensemble_models

if ((length(current_combo_list_final) == 0 & length(prev_combo_list) > 0) | sum(colnames(prev_log_df) %in% "weighted_mape")) {

if (sum(colnames(prev_log_df) %in% "weighted_mape")) {
# check if input values have changed
current_log_df <- tibble::tibble(
average_models = average_models,
@@ -175,7 +176,7 @@ final_models <- function(run_info,

# submit tasks
best_model_tbl <- foreach::foreach(
x = current_combo_list,
x = current_combo_list_final,
.combine = "rbind",
.packages = packages,
.errorhandling = "stop",
@@ -262,31 +263,7 @@

# check if model averaging already happened
if ("Best_Model" %in% colnames(local_model_tbl %>% rbind(global_model_tbl))) {
# see if average models file exists and add to model tbl
average_model_tbl <- tryCatch(
{
read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-average_models.", run_info$data_output
),
return_type = "df"
)
},
warning = function(w) {
# do nothing
},
error = function(e) {
NULL
}
)

local_model_tbl <- local_model_tbl %>%
rbind(average_model_tbl)

best_model_check <- TRUE
} else {
best_model_check <- FALSE
return(data.frame(Combo_Hash = combo))
}

# combine all forecasts
@@ -315,8 +292,7 @@
final_model_list <- c(local_model_list, global_model_list)

# simple model averaging
if (average_models & length(final_model_list) > 1 & !best_model_check) {

if (average_models & length(final_model_list) > 1) {
# create model combinations list
model_combinations <- tibble::tibble()

@@ -360,7 +336,6 @@
.noexport = NULL
) %op%
{

# get list of models to average
model_list <- strsplit(x, "_")[[1]]

@@ -385,7 +360,34 @@
averages_tbl <- NULL
}

# choose best model
# choose best average model
if (!is.null(averages_tbl)) {
avg_back_test_mape <- averages_tbl %>%
dplyr::mutate(
Train_Test_ID = as.numeric(Train_Test_ID),
Target = ifelse(Target == 0, 0.1, Target)
) %>%
dplyr::filter(Train_Test_ID != 1) %>%
dplyr::mutate(MAPE = round(abs((Forecast - Target) / Target), digits = 4))

avg_best_model_mape <- avg_back_test_mape %>%
dplyr::group_by(Model_ID, Combo) %>%
dplyr::mutate(
Combo_Total = sum(abs(Target), na.rm = TRUE),
weighted_MAPE = (abs(Target) / Combo_Total) * MAPE
) %>%
dplyr::summarise(Rolling_MAPE = sum(weighted_MAPE, na.rm = TRUE)) %>%
dplyr::arrange(Rolling_MAPE) %>%
dplyr::ungroup() %>%
dplyr::group_by(Combo) %>%
dplyr::slice(1) %>%
dplyr::ungroup()

avg_best_model_tbl <- avg_best_model_mape %>%
dplyr::select(Combo, Model_ID)
}

# choose best overall model
final_predictions_tbl <- predictions_tbl %>%
dplyr::select(Combo, Model_ID, Train_Test_ID, Date, Forecast, Target) %>%
rbind(averages_tbl)
@@ -513,7 +515,6 @@
)
}
} else { # choose the most accurate individual model and write outputs

final_model_tbl <- tibble::tibble(Model_ID = final_model_list) %>%
dplyr::left_join(
best_model_final_tbl %>%
@@ -522,6 +523,35 @@
) %>%
dplyr::mutate(Best_Model = ifelse(!is.na(Best_Model), "Yes", "No"))

if (!is.null(averages_tbl)) {
avg_model_final_tbl <- averages_tbl %>%
dplyr::right_join(avg_best_model_tbl,
by = c("Combo", "Model_ID")
) %>%
dplyr::mutate(
Combo_ID = Combo,
Model_Name = "NA",
Model_Type = "local",
Recipe_ID = "simple_average",
Hyperparameter_ID = "NA",
Best_Model = "No"
) %>%
dplyr::group_by(Combo_ID, Model_ID, Train_Test_ID) %>%
dplyr::mutate(Horizon = dplyr::row_number()) %>%
dplyr::ungroup() %>%
create_prediction_intervals(model_train_test_tbl) %>%
convert_weekly_to_daily(date_type, weekly_to_daily)

write_data(
x = avg_model_final_tbl,
combo = unique(avg_model_final_tbl$Combo),
run_info = run_info,
output_type = "data",
folder = "forecasts",
suffix = "-average_models"
)
}

if (!is.null(single_model_tbl)) {
single_model_final_tbl <- single_model_tbl %>%
remove_best_model() %>%
@@ -580,13 +610,24 @@
}
}

return(best_model_mape)
return(data.frame(Combo_Hash = combo))
} %>%
base::suppressPackageStartupMessages()

# clean up any parallel run process
par_end(cl)

# condense outputs into less files for larger runs
if (length(combo_list) > 10000) {
cli::cli_progress_step("Condensing Forecasts")

condense_data(
run_info,
parallel_processing,
num_cores
)
}

# reconcile hierarchical forecasts
if (forecast_approach != "bottoms_up") {
cli::cli_progress_step("Reconciling Hierarchical Forecasts")
@@ -600,6 +641,24 @@
)
}

# calculate weighted mape
weighted_mape <- get_forecast_data(run_info = run_info) %>%
dplyr::filter(
Run_Type == "Back_Test",
Best_Model == "Yes"
) %>%
dplyr::mutate(
Target = ifelse(Target == 0, 0.1, Target)
) %>%
dplyr::mutate(
MAPE = round(abs((Forecast - Target) / Target), digits = 4),
Total = sum(Target, na.rm = TRUE),
Weight = (MAPE * Target) / Total
) %>%
dplyr::pull(Weight) %>%
sum() %>%
round(digits = 4)

# update logging file
log_df <- read_file(run_info,
path = paste0("logs/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), ".csv"),
@@ -608,7 +667,7 @@
dplyr::mutate(
average_models = average_models,
max_model_average = max_model_average,
weighted_mape = base::mean(best_model_tbl$Rolling_MAPE, na.rm = TRUE)
weighted_mape = round(weighted_mape, digits = 4)
)

write_data(
1 change: 0 additions & 1 deletion R/forecast_time_series.R
@@ -323,7 +323,6 @@ forecast_backwards_compatibility <- function(run_info,
dplyr::select(Combo, Model, Best_Model) %>%
dplyr::distinct()
} else {

# read in unreconciled results
best_model_tbl <- read_file(run_info,
path = paste0(