diff --git a/DESCRIPTION b/DESCRIPTION index eaf30d1e..8228925b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.4.0.9000 +Version: 0.4.0.9001 Authors@R: c(person(given = "Mike", family = "Tokic", diff --git a/NEWS.md b/NEWS.md index 4ded76cf..0198b1cd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,8 @@ -# finnts 0.4.0.9000 (DEVELOPMENT VERSION) +# finnts 0.4.0.9001 (DEVELOPMENT VERSION) + +## Improvements + +- Added support for hierarchical forecasting with external regressors ## Bug Fixes diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R index ad57c33d..3177edd2 100644 --- a/R/forecast_time_series.R +++ b/R/forecast_time_series.R @@ -266,11 +266,11 @@ forecast_backwards_compatibility <- function(run_info, dplyr::collect() %>% tidyr::unite( col = "Combo", - combo_variables, + tidyselect::all_of(combo_variables), sep = "---", remove = FALSE ) %>% - dplyr::rename(Target = target_variable) %>% + dplyr::rename(Target = tidyselect::all_of(target_variable)) %>% dplyr::mutate( Model_ID = NA, lo_95 = Target, @@ -278,7 +278,7 @@ forecast_backwards_compatibility <- function(run_info, hi_80 = Target, hi_95 = Target ) %>% - dplyr::select(Combo, combo_variables, Model_ID, Date, Target, lo_95, lo_80, hi_80, hi_95) + dplyr::select(Combo, tidyselect::all_of(combo_variables), Model_ID, Date, Target, lo_95, lo_80, hi_80, hi_95) ) %>% dplyr::mutate(Type = ifelse(is.na(Model_ID), "Historical", "Forecast")) %>% dplyr::relocate(Type, .before = Date) %>% @@ -298,7 +298,7 @@ forecast_backwards_compatibility <- function(run_info, dplyr::filter(Run_Type == "Back_Test") %>% dplyr::mutate(MAPE = abs((Forecast - Target) / Target)) %>% dplyr::select( - Combo, combo_variables, Train_Test_ID, Date, Model_ID, Horizon, + Combo, tidyselect::all_of(combo_variables), Train_Test_ID, Date, Model_ID, Horizon, Forecast, Target, MAPE, Best_Model ) %>% dplyr::rename( diff --git a/R/hierarchy.R b/R/hierarchy.R index 5ebfffda..dbf2523d 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -3,6 +3,7 @@ #' @param input_data initial historical data #' @param run_info run info #' @param combo_variables combo variables +#' @param external_regressors external regressors #' @param forecast_approach whether it's a bottoms up or hierarchical forecast #' @param frequency_number frequency of time series #' @@ -11,6 +12,7 @@ prep_hierarchical_data <- function(input_data, run_info, combo_variables, + external_regressors, forecast_approach, frequency_number) { if (forecast_approach == "bottoms_up") { @@ -42,40 +44,163 @@ prep_hierarchical_data <- function(input_data, dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% base::suppressWarnings() - # create aggregations - Date <- bottom_level_tbl$Date + # create aggregations for target variable + hierarchical_tbl <- sum_hts_data( + bottom_level_tbl, + hts_nodes, + "Target", + forecast_approach, + frequency_number + ) - hierarchical_object <- bottom_level_tbl %>% - dplyr::select(-Date) %>% - stats::ts(frequency = frequency_number) %>% - get_hts( - hts_nodes, - forecast_approach + # create aggregations for external regressors + if (!is.null(external_regressors)) { + regressor_mapping <- external_regressor_mapping( + input_data_adj, + combo_variables, + external_regressors ) - hts_nodes_final <- get_hts_nodes( - hierarchical_object, - forecast_approach - ) + regressor_agg <- foreach::foreach( + regressor_tbl = regressor_mapping %>% + dplyr::group_split(dplyr::row_number(), .keep = FALSE), + .combine = "rbind", + .errorhandling = "stop", + .verbose = FALSE, + .inorder = FALSE, + .multicombine = TRUE, + .noexport = NULL + ) %do% { + regressor_var <- regressor_tbl$Regressor + value_level <- regressor_tbl$Var + + if (value_level == "Global") { + temp_tbl <- input_data_adj %>% + dplyr::select(Date, tidyselect::all_of(regressor_var)) %>% + dplyr::distinct() + + hierarchical_tbl <- hierarchical_tbl %>% + dplyr::left_join(temp_tbl, by = c("Date")) + } else if (value_level != "All") { + + # agg by lowest level + bottom_tbl <- input_data_adj %>% + tidyr::unite("Combo", + tidyselect::all_of(combo_variables), + sep = "_", + remove = F + ) %>% + dplyr::select(Date, Combo, tidyselect::all_of(regressor_var)) %>% + dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + + bottom_combos <- unique(bottom_tbl$Combo) + + hier_temp_tbl_1 <- hierarchical_tbl %>% + dplyr::select(Combo, Date) %>% + dplyr::filter(Combo %in% bottom_combos) %>% + dplyr::left_join(bottom_tbl, by = c("Combo", "Date")) + + # agg by specific combo variable level + value_level <- strsplit(value_level, split = "---")[[1]] + + hier_temp_tbl_2 <- foreach::foreach( + value_level_iter = value_level, + .combine = "rbind", + .errorhandling = "stop", + .verbose = FALSE, + .inorder = FALSE, + .multicombine = TRUE, + .noexport = NULL + ) %do% { + temp_tbl <- input_data_adj %>% + dplyr::select(Date, tidyselect::all_of(value_level_iter), tidyselect::all_of(regressor_var)) %>% + dplyr::distinct() + + if (length(value_level) > 1) { + temp_tbl <- temp_tbl %>% + dplyr::group_by(dplyr::across(tidyselect::all_of(c("Date", value_level_iter)))) %>% + dplyr::summarise(Value = sum(.data[[regressor_var]], na.rm = TRUE)) %>% + dplyr::ungroup() + + names(temp_tbl)[names(temp_tbl) == "Value"] <- regressor_var + } - hierarchical_tbl <- hierarchical_object %>% - hts::allts() %>% - data.frame() %>% - tibble::add_column( - Date = Date, - .before = 1 - ) %>% - tidyr::pivot_longer(!Date, - names_to = "Combo", - values_to = "Target" - ) %>% - dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + colnames(temp_tbl) <- c("Date", "Combo", regressor_var) + + temp_tbl$Combo <- paste0(value_level_iter, "_", temp_tbl$Combo) + + temp_tbl <- temp_tbl %>% + dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + + temp_combos <- unique(temp_tbl$Combo) + + hier_temp_tbl <- hierarchical_tbl %>% + dplyr::select(Combo, Date) %>% + dplyr::filter(Combo %in% temp_combos) %>% + dplyr::distinct() %>% + dplyr::left_join(temp_tbl, by = c("Combo", "Date")) + + return(hier_temp_tbl) + } + + # agg by total + total_tbl <- input_data_adj %>% + dplyr::select(Date, value_level[[1]], tidyselect::all_of(regressor_var)) %>% + dplyr::distinct() %>% + dplyr::group_by(Date) %>% + dplyr::rename("Agg" = tidyselect::all_of(regressor_var)) %>% + dplyr::summarise(Agg = sum(Agg, na.rm = TRUE)) + + colnames(total_tbl)[colnames(total_tbl) == "Agg"] <- regressor_var + + hier_temp_tbl_3 <- hierarchical_tbl %>% + dplyr::select(Combo, Date) %>% + dplyr::filter(Combo %in% setdiff(unique(hierarchical_tbl$Combo), c(unique(hier_temp_tbl_2$Combo), bottom_combos))) %>% + dplyr::left_join(total_tbl, by = c("Date")) + + # combine together + hierarchical_tbl <- hierarchical_tbl %>% + dplyr::left_join( + rbind(hier_temp_tbl_1, hier_temp_tbl_2, hier_temp_tbl_3), + by = c("Combo", "Date") + ) + } else if (value_level == "All") { + + bottom_level_temp_tbl <- input_data_adj %>% + dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% + tidyr::pivot_wider( + names_from = Combo, + values_from = tidyselect::all_of(regressor_var) + ) %>% + dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% + base::suppressWarnings() + + temp_tbl <- sum_hts_data( + bottom_level_temp_tbl, + hts_nodes, + regressor_var, + forecast_approach, + frequency_number + ) + + hierarchical_tbl <- hierarchical_tbl %>% + dplyr::left_join(temp_tbl, by = c("Date", "Combo")) + } + return(regressor_tbl) + } + } # write hierarchy structure to disk hts_list <- list( original_combos = colnames(bottom_level_tbl %>% dplyr::select(-Date)), hts_combos = hierarchical_tbl %>% dplyr::pull(Combo) %>% unique(), - nodes = hts_nodes_final + nodes = sum_hts_data(bottom_level_tbl, + hts_nodes, + "Target", + forecast_approach, + frequency_number, + return_type = "nodes" + ) ) write_data( @@ -99,7 +224,7 @@ prep_hierarchical_data <- function(input_data, return_data <- hierarchical_tbl %>% adjust_df(return_type = df_return_type) %>% - dplyr::select(Combo, Date, Target) + dplyr::select(Combo, Date, Target, tidyselect::any_of(external_regressors)) return(return_data) } @@ -412,7 +537,7 @@ reconcile_hierarchical_data <- function(run_info, ts <- forecast_tbl %>% dplyr::select(-Date, -Train_Test_ID) %>% - dplyr::select(hts_combo_list) %>% + dplyr::select(tidyselect::all_of(hts_combo_list)) %>% stats::ts() residual_multiplier <- 10 # shrink extra large residuals to prevent recon issues @@ -429,7 +554,7 @@ reconcile_hierarchical_data <- function(run_info, dplyr::select(Combo, Date, Train_Test_ID, Residual) %>% tidyr::pivot_wider(names_from = Combo, values_from = Residual) %>% dplyr::select(-Date, -Train_Test_ID) %>% - dplyr::select(hts_combo_list) %>% + dplyr::select(tidyselect::all_of(hts_combo_list)) %>% as.matrix() if (forecast_approach == "standard_hierarchy") { @@ -737,3 +862,162 @@ reconcile_hierarchical_data <- function(run_info, par_end(cl) } } + +#' Determine how external regressors should be aggregated +#' +#' @param data data +#' @param combo_variables combo variables +#' @param external_regressors external regressors +#' +#' @return data frame of regressor mappings +#' @noRd +external_regressor_mapping <- function(data, + combo_variables, + external_regressors) { + + # create var combinations list + var_combinations <- tibble::tibble() + + for (number in 2:min(length(combo_variables), 10)) { + temp <- data.frame(gtools::combinations(v = combo_variables, n = length(combo_variables), r = number)) + + temp <- temp %>% + tidyr::unite(Var_Combo, tidyselect::all_of(colnames(temp)), sep = "---") %>% + dplyr::select(Var_Combo) %>% + tibble::tibble() + + var_combinations <- rbind(var_combinations, temp) + } + + iter_list <- var_combinations %>% + dplyr::pull(Var_Combo) %>% + c(combo_variables) + + # get final mapping of regressor to combo var level + regressor_mapping_tbl <- foreach::foreach( + regressor = external_regressors, + .combine = "rbind", + .errorhandling = "stop", + .verbose = FALSE, + .inorder = FALSE, + .multicombine = TRUE, + .noexport = NULL + ) %do% { + + # get unique values of regressor per combo variable iteration + var_unique_tbl <- foreach::foreach( + var = iter_list, + .combine = "rbind", + .errorhandling = "stop", + .verbose = FALSE, + .inorder = FALSE, + .multicombine = TRUE, + .noexport = NULL + ) %do% { + var_list <- strsplit(var, split = "---")[[1]] + + if (length(var_list) == length(combo_variables)) { + var <- "All" + } + + temp_unique <- data %>% + tidyr::unite(Unique, tidyselect::all_of(c(var_list, "Date", regressor)), sep = "_") %>% + dplyr::pull(Unique) %>% + unique() %>% + length() + + return(data.frame(Var = var, Unique = temp_unique)) + } + + # determine regressor mappings + if (length(unique(var_unique_tbl$Unique)) > 1) { + all_unique <- var_unique_tbl %>% + dplyr::filter(Var == "All") %>% + dplyr::pull(Unique) + + regressor_test <- var_unique_tbl %>% + dplyr::filter(Unique < all_unique) %>% + dplyr::pull(Var) + + if (length(unique(data$Date)) == data %>% + dplyr::select(Date, tidyselect::all_of(regressor)) %>% + dplyr::distinct() %>% + nrow()) { + regressor_test <- "Global" + } else if (length(regressor_test) > 1) { + combo_unique <- var_unique_tbl %>% + dplyr::filter(Var %in% combo_variables) + + min_val <- min(unique(combo_unique$Unique)) + + regressor_test <- combo_unique %>% + dplyr::filter(Unique == min_val) %>% + dplyr::pull(Var) + } + + if (length(regressor_test) > 1) { + regressor_test <- paste0(regressor_test, collapse = "---") + } + + return(data.frame(Regressor = regressor, Var = regressor_test)) + } else { + return(data.frame(Regressor = regressor, Var = "All")) + } + } + + return(regressor_mapping_tbl) +} + +#' Create hierarchical aggregations +#' +#' @param bottom_level_tbl bottom level table +#' @param hts_nodes hts nodes +#' @param sum_var column to get aggregated +#' @param forecast_approach forecast approach +#' @param frequency_number frequency number +#' @param return_type return type +#' +#' @return data frame of hierarchical aggregations +#' @noRd +sum_hts_data <- function(bottom_level_tbl, + hts_nodes, + sum_var, + forecast_approach, + frequency_number, + return_type = "data") { + + # create aggregations for target variable + Date <- bottom_level_tbl$Date + + hierarchical_object <- bottom_level_tbl %>% + dplyr::select(-Date) %>% + stats::ts(frequency = frequency_number) %>% + get_hts( + hts_nodes, + forecast_approach + ) + + hts_nodes_final <- get_hts_nodes( + hierarchical_object, + forecast_approach + ) + + if (return_type == "nodes") { + return(hts_nodes_final) + } + + hierarchical_tbl <- hierarchical_object %>% + hts::allts() %>% + data.frame() %>% + tibble::add_column( + Date = Date, + .before = 1 + ) %>% + tidyr::pivot_longer(!Date, + names_to = "Combo", + values_to = sum_var + ) %>% + dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + + return(hierarchical_tbl) +} diff --git a/R/prep_data.R b/R/prep_data.R index 3d3533d0..eac2a483 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -158,11 +158,11 @@ prep_data <- function(run_info, # prep initial data before feature engineering initial_prep_tbl <- input_data %>% tidyr::unite("Combo", - combo_variables, + tidyselect::all_of(combo_variables), sep = "--", remove = F ) %>% - dplyr::rename("Target" = target_variable) %>% + dplyr::rename("Target" = tidyselect::all_of(target_variable)) %>% dplyr::select(c( "Combo", tidyselect::all_of(combo_variables), @@ -175,6 +175,7 @@ prep_data <- function(run_info, ) %>% prep_hierarchical_data(run_info, combo_variables, + external_regressors, forecast_approach, frequency_number = get_frequency_number(date_type) ) @@ -1043,8 +1044,8 @@ apply_box_cox <- function(df) { # Only check numeric columns with more than 2 unique values if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) { temp_tbl <- df %>% - dplyr::select(Date, column_name) %>% - dplyr::rename(Column = column_name) + dplyr::select(Date, tidyselect::all_of(column_name)) %>% + dplyr::rename(Column = tidyselect::all_of(column_name)) # get lambda value lambda_value <- timetk::auto_lambda(temp_tbl$Column) @@ -1064,10 +1065,10 @@ apply_box_cox <- function(df) { # clean up names and add to final df colnames(temp_tbl)[colnames(temp_tbl) == "Column"] <- column_name - final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(column_name)) + final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(tidyselect::all_of(column_name))) } else { if (column_name != "Date") { - final_tbl <- cbind(final_tbl, df %>% dplyr::select(column_name)) + final_tbl <- cbind(final_tbl, df %>% dplyr::select(tidyselect::all_of(column_name))) } } } @@ -1095,8 +1096,8 @@ make_stationary <- function(df) { # Only check numeric columns with more than 2 unique values if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) { temp_tbl <- df %>% - dplyr::select(Date, column_name) %>% - dplyr::rename(Column = column_name) + dplyr::select(Date, tidyselect::all_of(column_name)) %>% + dplyr::rename(Column = tidyselect::all_of(column_name)) # check for standard difference ndiffs <- temp_tbl %>% @@ -1123,10 +1124,10 @@ make_stationary <- function(df) { colnames(temp_tbl)[colnames(temp_tbl) == "Column"] <- column_name - final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(column_name)) + final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(tidyselect::all_of(column_name))) } else { if (column_name != "Date") { - final_tbl <- cbind(final_tbl, df %>% dplyr::select(column_name)) + final_tbl <- cbind(final_tbl, df %>% dplyr::select(tidyselect::all_of(column_name))) } } } @@ -1162,7 +1163,7 @@ multivariate_prep_recipe_1 <- function(data, df_poly <- data for (column in c("Target", external_regressors)) { - if (is.numeric(dplyr::select(data, column)[[1]])) { + if (is.numeric(dplyr::select(data, tidyselect::all_of(column))[[1]])) { column_names_final <- c(column) if ((column %in% external_regressors) & !(column %in% xregs_future_values_list)) { @@ -1172,7 +1173,7 @@ multivariate_prep_recipe_1 <- function(data, if (column %in% external_regressors) { df_poly_column <- data %>% - dplyr::select(column) %>% + dplyr::select(tidyselect::all_of(column)) %>% dplyr::rename(Col = column) temp_squared <- df_poly_column^2 @@ -1240,7 +1241,7 @@ multivariate_prep_recipe_1 <- function(data, ) %>% timetk::tk_augment_fourier(Date, .periods = fourier_periods, .K = 2) %>% # add fourier series tidyr::fill(tidyselect::contains("_roll"), .direction = "down") %>% - dplyr::select(-numeric_xregs) + dplyr::select(-tidyselect::all_of(numeric_xregs)) is.na(data_lag_window) <- sapply( data_lag_window, @@ -1284,7 +1285,7 @@ multivariate_prep_recipe_2 <- function(data, df_poly <- data for (column in c("Target", external_regressors)) { - if (is.numeric(dplyr::select(data, column)[[1]])) { + if (is.numeric(dplyr::select(data, tidyselect::all_of(column))[[1]])) { column_names_final <- c(column) if ((column %in% external_regressors) & !(column %in% xregs_future_values_list)) { @@ -1294,7 +1295,7 @@ multivariate_prep_recipe_2 <- function(data, if (column %in% external_regressors) { df_poly_column <- data %>% - dplyr::select(column) %>% + dplyr::select(tidyselect::all_of(column)) %>% dplyr::rename(Col = column) temp_squared <- df_poly_column^2 @@ -1376,7 +1377,7 @@ multivariate_prep_recipe_2 <- function(data, ) %>% tidyr::fill(tidyselect::contains("_roll"), .direction = "down") %>% timetk::tk_augment_fourier(Date, .periods = fourier_periods, .K = 2) %>% # add fourier series - dplyr::select(-numeric_xregs) # drop xregs that do not contain future values + dplyr::select(-tidyselect::all_of(numeric_xregs)) # drop xregs that do not contain future values is.na(data_lag_window) <- sapply( data_lag_window, diff --git a/R/utility.R b/R/utility.R index 75b8fdba..384bb420 100644 --- a/R/utility.R +++ b/R/utility.R @@ -13,7 +13,8 @@ utils::globalVariables(c( "x", "num_cores", "run_info", "negative_forecast", "Forecast_Adj", "Final_Col", "lag_val", "libs", ".config", "Forecast_Tbl", "Model_Workflow", "id", "model_run", "Auto_Accept", "Feature", "Imp", "Importance", "LOFO_Var", "Var_RMSE", "Vote", "Votes", "desc", - "term", "Column", "Box_Cox_Lambda", "get_recipie_configurable" + "term", "Column", "Box_Cox_Lambda", "get_recipie_configurable", "Agg", "Unique", "Var", + "Var_Combo", "regressor", "regressor_tbl", "value_level_iter" )) #' @importFrom magrittr %>% diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R new file mode 100644 index 00000000..1b6ada93 --- /dev/null +++ b/tests/testthat/test-hierarchical.R @@ -0,0 +1,124 @@ + +test_that("prep_hierarchical_data returns correct grouped hierarchies", { + + skip_if(getRversion() < "3.7.0", "Skipping for R 3.6.0 and below") + + # Mock data setup + data <- tibble::tibble( + Segment = as.character(c( + "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", + "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", + "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", + "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer" + )), + Country = as.character(c( + "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", + "UK", "UK", "UK", "UK", "UK", "UK", + "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", + "UK", "UK", "UK", "UK", "UK", "UK" + )), + Product = as.character(c( + "Office", "Office", "Office", "Excel", "Excel", "Excel", + "Office", "Office", "Office", "Excel", "Excel", "Excel", + "Office", "Office", "Office", "Excel", "Excel", "Excel", + "Office", "Office", "Office", "Excel", "Excel", "Excel" + )), + Date = as.Date(c( + "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", + "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", + "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", + "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020" + ), format = "%m/%d/%Y"), + Target = c(1, 2, 3, 13, 14, 15, 25, 26, 27, 37, 38, 39, 1, 2, 3, 13, 14, 15, 25, 26, 27, 37, 38, 39), + Value_Country = c(1, 2, 3, 1, 2, 3, 10, 20, 30, 10, 20, 30, 1, 2, 3, 1, 2, 3, 10, 20, 30, 10, 20, 30), + Value_All = c(1, 2, 3, 37, 38, 39, 73, 74, 75, 109, 110, 111, 145, 146, 147, 181, 182, 183, 217, 218, 219, 253, 254, 255), + Value_Product = c(1, 2, 3, 10, 11, 12, 1, 2, 3, 10, 11, 12, 1, 2, 3, 10, 11, 12, 1, 2, 3, 10, 11, 12), + Value_Global = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), + Value_Segment_Product = c(100, 101, 102, 200, 201, 202, 100, 101, 102, 200, 201, 202, 300, 301, 302, 400, 401, 402, 300, 301, 302, 400, 401, 402) + ) %>% + tidyr::unite("Combo", + c("Segment", "Country", "Product"), + sep = "--", + remove = F + ) + + # run prep hts function + result_data <- prep_hierarchical_data( + input_data = data, + run_info = set_run_info(), + combo_variables = c("Segment", "Country", "Product"), + external_regressors = c("Value_Country", "Value_All", "Value_Product", "Value_Global", "Value_Segment_Product"), + forecast_approach = "grouped_hierarchy", + frequency_number = 12 + ) %>% + dplyr::filter(Date == "2020-01-01") + + # Expected output setup + expected_data <- tibble::tibble( + Combo = as.character(c( + "Total", "Segment_Commercial", "Segment_Consumer", "Country_United_States", "Country_UK", + "Product_Office", "Product_Excel", "Commercial_United_States_Office", "Commercial_United_States_Excel", + "Commercial_UK_Office", "Commercial_UK_Excel", "Consumer_United_States_Office", "Consumer_United_States_Excel", + "Consumer_UK_Office", "Consumer_UK_Excel" + )), + Date = as.Date(c( + "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01" + )), + Target = c(152, 76, 76, 28, 124, 52, 100, 1, 13, 25, 37, 1, 13, 25, 37), + Value_Country = c(11, 11, 11, 1, 10, 11, 11, 1, 1, 10, 10, 1, 1, 10, 10), + Value_All = c(1016, 220, 796, 364, 652, 436, 580, 1, 37, 73, 109, 145, 181, 217, 253), + Value_Product = c(11, 11, 11, 11, 11, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10), + Value_Global = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), + Value_Segment_Product = c(1000, 300, 700, 1000, 1000, 400, 600, 100, 200, 100, 200, 300, 400, 300, 400) + ) + + # Assertions + expect_equal(result_data, expected_data) +}) + +test_that("prep_hierarchical_data returns correct standard hierarchies", { + + skip_if(getRversion() < "3.7.0", "Skipping for R 3.6.0 and below") + + # Mock data setup + data <- tibble::tibble( + Area = as.character(c("EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "United States", "United States", "United States", "United States")), + Country = as.character(c("Croatia", "Croatia", "Croatia", "Croatia", "Greece", "Greece", "Greece", "Greece", "United States", "United States", "United States", "United States")), + Date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01")), + Target = c(1, 2, 3, 4, 100, 101, 102, 103, 1000, 1001, 1002, 1003), + Value_All = c(10, 11, 12, 13, 46, 47, 48, 49, 82, 83, 84, 85), + Value_Global = c(50, 51, 52, 53, 50, 51, 52, 53, 50, 51, 52, 53), + Value_Area = c(20, 21, 22, 23, 20, 21, 22, 23, 70, 71, 72, 73) + ) %>% + tidyr::unite("Combo", + c("Area", "Country"), + sep = "--", + remove = F + ) + + # run prep hts function for standard hierarchy + result_data <- prep_hierarchical_data( + input_data = data, + run_info = set_run_info(), + combo_variables = c("Area", "Country"), + external_regressors = c("Value_All", "Value_Global", "Value_Area"), + forecast_approach = "standard_hierarchy", + frequency_number = 12 + ) %>% + dplyr::filter(Date == "2020-01-01") + + # Expected output setup for a standard hierarchical forecast + expected_data <- tibble::tibble( + Combo = as.character(c("Total", "A", "B", "EMEA_Croatia", "EMEA_Greece", "United_States_United_States")), + Date = as.Date(c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01")), + Target = c(1101, 101, 1000, 1, 100, 1000), + Value_All = c(138, 56, 82, 10, 46, 82), + Value_Global = c(50, 50, 50, 50, 50, 50), + Value_Area = c(90, 90, 90, 20, 20, 70) + ) + + # Assertions + expect_equal(result_data, expected_data) +}) diff --git a/vignettes/hierarchical-forecasting.Rmd b/vignettes/hierarchical-forecasting.Rmd index 4c596dd3..f358456d 100644 --- a/vignettes/hierarchical-forecasting.Rmd +++ b/vignettes/hierarchical-forecasting.Rmd @@ -71,7 +71,11 @@ It would be hard to aggregate the above data in a traditional hierarchy. The sam ### External Regressors -Currently finnts does not allow for external regressors to be used in a hierarchical forecast. This is something we plan to implement in a future release. If you select a hierarchical forecast method and include external regressor input values, Finn will ignore the external regressors and will not use them in modeling. +The aggregation process is automatically calculated for each external regressor, depending on how the regressor maps to the combo variables. If a regressor is unique for each time series, then the standard aggregation process is implemented (same as the process for the target variable). If the regressor repeats the same values across all time series, then only one global copy is retained and applied to all new aggregated time series. If the regressor specifically maps to one or more combo variables, then the relationship is respected while still being able to aggregate to the total level. Aggregations of drivers are always summed. + +The only limitation is when an external regressor maps to the middle of a standard hierarchy (not at the global level or bottom level), in this case the regressor will be summed at the global level across the hierarchy. This is due to limitations of aggregation naming in the hts package used in finnts. + +Explore the final results of the aggregations by seeing the end result using `get_prepped_data()`. ### Spark Parallel Processing