Mitokic/07172024/best model scaling #163

Merged 17 commits on Jul 29, 2024
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: finnts
Title: Microsoft Finance Time Series Forecasting Framework
Version: 0.4.0.9004
Version: 0.4.0.9005
Authors@R:
c(person(given = "Mike",
family = "Tokic",
@@ -24,7 +24,7 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.3.1
Imports:
cli,
Cubist,
5 changes: 4 additions & 1 deletion NEWS.md
@@ -1,10 +1,13 @@
# finnts 0.4.0.9004 (DEVELOPMENT VERSION)
# finnts 0.4.0.9005 (DEVELOPMENT VERSION)

## Improvements

- Added support for hierarchical forecasting with external regressors
- Allow global models for hierarchical forecasts
- Multistep horizon forecasts for the R1 recipe, available as `multistep_horizon` within `prep_data()`
- Always save the most accurate model average, regardless of whether it is selected as the best model. This improves scaling with large data sets.
- Automatically condense large forecast runs (10k+ time series) into a smaller number of files, making forecast outputs easier to read
- Improved weighted MAPE calculation across all time series (a sketch of the calculation follows this list)
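
The weighted MAPE change is easiest to see in isolation. Below is a minimal sketch of a target-weighted MAPE on a toy back-test table; the `Target`/`Forecast` column names and the zero-target guard mirror the `final_models()` hunk further down, while the toy data and its values are purely illustrative.

```r
library(dplyr)

# Toy back-test results; values are illustrative only.
back_test <- tibble::tibble(
  Combo    = c("A", "A", "B", "B"),
  Target   = c(100, 80, 50, 200),
  Forecast = c(110, 76, 45, 180)
)

weighted_mape <- back_test %>%
  mutate(
    Target = ifelse(Target == 0, 0.1, Target),   # guard against divide-by-zero
    MAPE   = abs((Forecast - Target) / Target),  # per-row absolute percentage error
    Weight = Target / sum(Target, na.rm = TRUE)  # larger targets get more weight
  ) %>%
  summarise(weighted_mape = sum(MAPE * Weight, na.rm = TRUE)) %>%
  pull(weighted_mape)

round(weighted_mape, digits = 4)
```

Weighting each row by the size of its target keeps very small series from dominating the run-level accuracy number, which appears to be the intent of the change.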

## Bug Fixes

31 changes: 16 additions & 15 deletions R/ensemble_models.R
@@ -181,27 +181,29 @@ ensemble_models <- function(run_info,
# model forecasts
single_model_tbl <- NULL
if (run_local_models) {
suppressWarnings(try(single_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-single_models.", run_info$data_output
suppressWarnings(try(
single_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-single_models.", run_info$data_output
),
return_type = "df"
),
return_type = "df"
),
silent = TRUE
silent = TRUE
))
}

global_model_tbl <- NULL
if (run_global_models) {
suppressWarnings(try(global_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-global_models.", run_info$data_output
suppressWarnings(try(
global_model_tbl <- read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-global_models.", run_info$data_output
),
return_type = "df"
),
return_type = "df"
),
silent = TRUE
silent = TRUE
))
}
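
The hunk above only re-indents an existing idiom, so for readers skimming the diff, here is a standalone sketch of that "optional read" pattern: try to load a file that may not exist yet and fall back to `NULL` instead of erroring. The `read_optional()` helper, the use of `readRDS()`, and the path are assumptions for illustration, not finnts API.

```r
# Standalone sketch of the suppressWarnings(try(...)) idiom used above.
read_optional <- function(path) {
  result <- NULL
  suppressWarnings(try(
    result <- readRDS(path),  # any reader that errors on a missing file
    silent = TRUE
  ))
  result                      # stays NULL when the read failed
}

single_model_tbl <- read_optional("forecasts/demo-combo-single_models.rds")
if (is.null(single_model_tbl)) {
  message("No single-model forecasts for this combo yet; skipping.")
}
```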

@@ -336,7 +338,6 @@ ensemble_models <- function(run_info,
.multicombine = TRUE,
.noexport = NULL
) %do% {

# get initial run info
model <- model_run %>%
dplyr::pull(Model_Name)
4 changes: 0 additions & 4 deletions R/feature_selection.R
@@ -23,10 +23,8 @@ run_feature_selection <- function(input_data,
forecast_horizon,
external_regressors,
multistep_horizon = FALSE) {

# check for more than one unique target value
if (input_data %>% tidyr::drop_na(Target) %>% dplyr::pull(Target) %>% unique() %>% length() < 2) {

# just return the date features
fs_list <- input_data %>%
dplyr::select(tidyselect::contains("Date"))
@@ -83,7 +81,6 @@ run_feature_selection <- function(input_data,

# run feature selection
if (date_type %in% c("day", "week")) {

# number of votes needed for feature to be selected
votes_needed <- 3

@@ -410,7 +407,6 @@ lofo_fn <- function(run_info,
parallel_processing,
pca = FALSE,
seed = 123) {

# parallel run info
par_info <- par_start(
run_info = run_info,
133 changes: 96 additions & 37 deletions R/final_models.R
@@ -3,7 +3,8 @@
#' Select Best Models and Prep Final Outputs
#'
#' @param run_info run info using the [set_run_info()] function.
#' @param average_models If TRUE, create simple averages of individual models.
#' @param average_models If TRUE, create simple averages of individual models
#' and save the most accurate one.
#' @param max_model_average Max number of models to average together. Will
#' create model averages for 2 models up until the input value or the max
#' number of models run.
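
As context for the two documented arguments, here is a hedged usage sketch of `final_models()`. Only `run_info`, `average_models`, and `max_model_average` come from this diff; the `set_run_info()` argument names and values are assumptions for illustration, and the upstream data-prep and model-training steps are elided.

```r
library(finnts)

# Illustrative run setup; the argument names passed to set_run_info() are assumptions.
run_info <- set_run_info(
  experiment_name = "finance_demo",
  run_name        = "best_model_scaling"
)

# ... prepare data and train the individual models for this run_info ...

final_models(
  run_info          = run_info,
  average_models    = TRUE, # build simple averages of individual models and save the most accurate one
  max_model_average = 3     # average combinations of 2 models up to 3 models
)
```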
@@ -124,7 +125,8 @@ final_models <- function(run_info,
current_combo_list_final <- setdiff(
current_combo_list,
prev_combo_list
)
) %>%
sample()

prev_log_df <- read_file(run_info,
path = paste0("logs/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), ".csv"),
@@ -138,8 +140,7 @@
run_local_models <- prev_log_df$run_local_models
run_ensemble_models <- prev_log_df$run_ensemble_models

if ((length(current_combo_list_final) == 0 & length(prev_combo_list) > 0) | sum(colnames(prev_log_df) %in% "weighted_mape")) {

if (sum(colnames(prev_log_df) %in% "weighted_mape")) {
# check if input values have changed
current_log_df <- tibble::tibble(
average_models = average_models,
@@ -175,7 +176,7 @@ final_models <- function(run_info,

# submit tasks
best_model_tbl <- foreach::foreach(
x = current_combo_list,
x = current_combo_list_final,
.combine = "rbind",
.packages = packages,
.errorhandling = "stop",
@@ -262,31 +263,7 @@

# check if model averaging already happened
if ("Best_Model" %in% colnames(local_model_tbl %>% rbind(global_model_tbl))) {
# see if average models file exists and add to model tbl
average_model_tbl <- tryCatch(
{
read_file(run_info,
path = paste0(
"/forecasts/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name),
"-", combo, "-average_models.", run_info$data_output
),
return_type = "df"
)
},
warning = function(w) {
# do nothing
},
error = function(e) {
NULL
}
)

local_model_tbl <- local_model_tbl %>%
rbind(average_model_tbl)

best_model_check <- TRUE
} else {
best_model_check <- FALSE
return(data.frame(Combo_Hash = combo))
}

# combine all forecasts
@@ -315,8 +292,7 @@
final_model_list <- c(local_model_list, global_model_list)

# simple model averaging
if (average_models & length(final_model_list) > 1 & !best_model_check) {

if (average_models & length(final_model_list) > 1) {
# create model combinations list
model_combinations <- tibble::tibble()

@@ -360,7 +336,6 @@
.noexport = NULL
) %op%
{

# get list of models to average
model_list <- strsplit(x, "_")[[1]]

@@ -385,7 +360,34 @@
averages_tbl <- NULL
}

# choose best model
# choose best average model
if (!is.null(averages_tbl)) {
avg_back_test_mape <- averages_tbl %>%
dplyr::mutate(
Train_Test_ID = as.numeric(Train_Test_ID),
Target = ifelse(Target == 0, 0.1, Target)
) %>%
dplyr::filter(Train_Test_ID != 1) %>%
dplyr::mutate(MAPE = round(abs((Forecast - Target) / Target), digits = 4))

avg_best_model_mape <- avg_back_test_mape %>%
dplyr::group_by(Model_ID, Combo) %>%
dplyr::mutate(
Combo_Total = sum(abs(Target), na.rm = TRUE),
weighted_MAPE = (abs(Target) / Combo_Total) * MAPE
) %>%
dplyr::summarise(Rolling_MAPE = sum(weighted_MAPE, na.rm = TRUE)) %>%
dplyr::arrange(Rolling_MAPE) %>%
dplyr::ungroup() %>%
dplyr::group_by(Combo) %>%
dplyr::slice(1) %>%
dplyr::ungroup()

avg_best_model_tbl <- avg_best_model_mape %>%
dplyr::select(Combo, Model_ID)
}

# choose best overall model
final_predictions_tbl <- predictions_tbl %>%
dplyr::select(Combo, Model_ID, Train_Test_ID, Date, Forecast, Target) %>%
rbind(averages_tbl)
@@ -513,7 +515,6 @@
)
}
} else { # choose the most accurate individual model and write outputs

final_model_tbl <- tibble::tibble(Model_ID = final_model_list) %>%
dplyr::left_join(
best_model_final_tbl %>%
@@ -522,6 +523,35 @@
) %>%
dplyr::mutate(Best_Model = ifelse(!is.na(Best_Model), "Yes", "No"))

if (!is.null(averages_tbl)) {
avg_model_final_tbl <- averages_tbl %>%
dplyr::right_join(avg_best_model_tbl,
by = c("Combo", "Model_ID")
) %>%
dplyr::mutate(
Combo_ID = Combo,
Model_Name = "NA",
Model_Type = "local",
Recipe_ID = "simple_average",
Hyperparameter_ID = "NA",
Best_Model = "No"
) %>%
dplyr::group_by(Combo_ID, Model_ID, Train_Test_ID) %>%
dplyr::mutate(Horizon = dplyr::row_number()) %>%
dplyr::ungroup() %>%
create_prediction_intervals(model_train_test_tbl) %>%
convert_weekly_to_daily(date_type, weekly_to_daily)

write_data(
x = avg_model_final_tbl,
combo = unique(avg_model_final_tbl$Combo),
run_info = run_info,
output_type = "data",
folder = "forecasts",
suffix = "-average_models"
)
}

if (!is.null(single_model_tbl)) {
single_model_final_tbl <- single_model_tbl %>%
remove_best_model() %>%
@@ -580,13 +610,24 @@
}
}

return(best_model_mape)
return(data.frame(Combo_Hash = combo))
} %>%
base::suppressPackageStartupMessages()

# clean up any parallel run process
par_end(cl)

# condense outputs into less files for larger runs
if (length(combo_list) > 10000) {
cli::cli_progress_step("Condensing Forecasts")

condense_data(
run_info,
parallel_processing,
num_cores
)
}

# reconcile hierarchical forecasts
if (forecast_approach != "bottoms_up") {
cli::cli_progress_step("Reconciling Hierarchical Forecasts")
@@ -600,6 +641,24 @@
)
}

# calculate weighted mape
weighted_mape <- get_forecast_data(run_info = run_info) %>%
dplyr::filter(
Run_Type == "Back_Test",
Best_Model == "Yes"
) %>%
dplyr::mutate(
Target = ifelse(Target == 0, 0.1, Target)
) %>%
dplyr::mutate(
MAPE = round(abs((Forecast - Target) / Target), digits = 4),
Total = sum(Target, na.rm = TRUE),
Weight = (MAPE * Target) / Total
) %>%
dplyr::pull(Weight) %>%
sum() %>%
round(digits = 4)

# update logging file
log_df <- read_file(run_info,
path = paste0("logs/", hash_data(run_info$experiment_name), "-", hash_data(run_info$run_name), ".csv"),
@@ -608,7 +667,7 @@
dplyr::mutate(
average_models = average_models,
max_model_average = max_model_average,
weighted_mape = base::mean(best_model_tbl$Rolling_MAPE, na.rm = TRUE)
weighted_mape = round(weighted_mape, digits = 4)
)

write_data(
1 change: 0 additions & 1 deletion R/forecast_time_series.R
@@ -323,7 +323,6 @@ forecast_backwards_compatibility <- function(run_info,
dplyr::select(Combo, Model, Best_Model) %>%
dplyr::distinct()
} else {

# read in unreconciled results
best_model_tbl <- read_file(run_info,
path = paste0(