From dce1f8e33916097f7585ba05dc66e1d12f4fe000 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 28 Jan 2024 09:38:10 -0800 Subject: [PATCH 01/18] added external regressor support for hierarchical forecast --- DESCRIPTION | 2 +- NEWS.md | 6 + R/hierarchy.R | 298 ++++++++++++++++++++++--- R/prep_data.R | 1 + vignettes/hierarchical-forecasting.Rmd | 4 +- 5 files changed, 279 insertions(+), 32 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index de0f58dc..eaf30d1e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.4.0 +Version: 0.4.0.9000 Authors@R: c(person(given = "Mike", family = "Tokic", diff --git a/NEWS.md b/NEWS.md index 7745b630..b21935f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# finnts 0.4.0.9000 (DEVELOPMENT VERSION) + +## Improvements + +- Added support for hierarchical forecasting with external regressors + # finnts 0.4.0 ## Improvements diff --git a/R/hierarchy.R b/R/hierarchy.R index 5ebfffda..b9092e97 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -3,6 +3,7 @@ #' @param input_data initial historical data #' @param run_info run info #' @param combo_variables combo variables +#' @param external_regressors external regressors #' @param forecast_approach whether it's a bottoms up or hierarchical forecast #' @param frequency_number frequency of time series #' @@ -11,6 +12,7 @@ prep_hierarchical_data <- function(input_data, run_info, combo_variables, + external_regressors, forecast_approach, frequency_number) { if (forecast_approach == "bottoms_up") { @@ -42,40 +44,134 @@ prep_hierarchical_data <- function(input_data, dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% base::suppressWarnings() - # create aggregations - Date <- bottom_level_tbl$Date - - hierarchical_object <- bottom_level_tbl %>% - dplyr::select(-Date) %>% - stats::ts(frequency = frequency_number) %>% - get_hts( - hts_nodes, - forecast_approach - ) - - hts_nodes_final <- get_hts_nodes( - hierarchical_object, - forecast_approach - ) - - hierarchical_tbl <- hierarchical_object %>% - hts::allts() %>% - data.frame() %>% - tibble::add_column( - Date = Date, - .before = 1 - ) %>% - tidyr::pivot_longer(!Date, - names_to = "Combo", - values_to = "Target" - ) %>% - dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + # create aggregations for target variable + hierarchical_tbl <- sum_hts_data(bottom_level_tbl, + hts_nodes, + "Target", + forecast_approach, + frequency_number) + + # create aggregations for external regressors + if(!is.null(external_regressors)) { + regressor_mapping <- external_regressor_mapping(input_data_adj, + combo_variables, + external_regressors) + + regressor_agg <- foreach::foreach( + regressor_tbl = regressor_mapping %>% + dplyr::group_split(dplyr::row_number(), .keep = FALSE), + .combine = "rbind", + .errorhandling = "stop", + .verbose = FALSE, + .inorder = FALSE, + .multicombine = TRUE, + .noexport = NULL + ) %do% { + + regressor_var <- regressor_tbl$Regressor + value_level <- regressor_tbl$Var + + if(value_level == "Global") { + + temp_tbl <- input_data_adj %>% + dplyr::select(Date, regressor_var) %>% + dplyr::distinct() + + hierarchical_tbl <- hierarchical_tbl %>% + dplyr::left_join(temp_tbl, by = c("Date")) + + } else if(value_level != "All") { + + # agg by lowest level + bottom_tbl <- input_data_adj %>% + tidyr::unite("Combo", + combo_variables, + sep = "_", + remove = F + ) %>% + dplyr::select(Date, Combo, regressor_var) %>% + dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + + bottom_combos <- unique(bottom_tbl$Combo) + + hier_temp_tbl_1 <- hierarchical_tbl %>% + dplyr::select(Combo, Date) %>% + dplyr::filter(Combo %in% bottom_combos) %>% + dplyr::left_join(bottom_tbl, by = c("Combo", "Date")) + + # agg by value level + temp_tbl <- input_data_adj %>% + dplyr::select(Date, value_level, regressor_var) %>% + dplyr::distinct() + + colnames(temp_tbl) <- c("Date", "Combo", regressor_var) + + temp_tbl$Combo <- paste0(value_level, "_", temp_tbl$Combo) + + temp_tbl <- temp_tbl %>% + dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + + temp_combos <- unique(temp_tbl$Combo) + + hier_temp_tbl_2 <- hierarchical_tbl %>% + dplyr::select(Combo, Date) %>% + dplyr::filter(Combo %in% temp_combos) %>% + dplyr::left_join(temp_tbl, by = c("Combo", "Date")) + + # agg by total + total_tbl <- temp_tbl %>% + dplyr::group_by(Date) %>% + dplyr::rename("Agg" = regressor_var) %>% + dplyr::summarise(Agg = sum(Agg)) + + colnames(total_tbl)[colnames(total_tbl) == "Agg"] <- regressor_var + + hier_temp_tbl_3 <- hierarchical_tbl %>% + dplyr::select(Combo, Date) %>% + dplyr::filter(Combo %in% setdiff(unique(hierarchical_tbl$Combo), c(temp_combos, bottom_combos))) %>% + dplyr::left_join(total_tbl, by = c("Date")) + + # combine together + hierarchical_tbl <- hierarchical_tbl %>% + dplyr::left_join( + rbind(hier_temp_tbl_1, hier_temp_tbl_2, hier_temp_tbl_3), + by = c("Combo", "Date") + ) + + } else if(value_level == "All") { + + bottom_level_temp_tbl <- input_data_adj %>% + dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% + tidyr::pivot_wider( + names_from = Combo, + values_from = regressor_var + ) %>% + dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% + base::suppressWarnings() + + temp_tbl <- sum_hts_data(bottom_level_temp_tbl, + hts_nodes, + regressor_var, + forecast_approach, + frequency_number) + + hierarchical_tbl <- hierarchical_tbl %>% + dplyr::left_join(temp_tbl, by = c("Date", "Combo")) + } + return(regressor_tbl) + } + } # write hierarchy structure to disk hts_list <- list( original_combos = colnames(bottom_level_tbl %>% dplyr::select(-Date)), hts_combos = hierarchical_tbl %>% dplyr::pull(Combo) %>% unique(), - nodes = hts_nodes_final + nodes = sum_hts_data(bottom_level_tbl, + hts_nodes, + "Target", + forecast_approach, + frequency_number, + return_type = "nodes") ) write_data( @@ -99,7 +195,7 @@ prep_hierarchical_data <- function(input_data, return_data <- hierarchical_tbl %>% adjust_df(return_type = df_return_type) %>% - dplyr::select(Combo, Date, Target) + dplyr::select(Combo, Date, Target, tidyselect::any_of(external_regressors)) return(return_data) } @@ -737,3 +833,145 @@ reconcile_hierarchical_data <- function(run_info, par_end(cl) } } + +#' Determine how external regressors should be aggregated +#' +#' @param data data +#' @param combo_variables combo variables +#' @param external_regressors external regressors +#' +#' @return data frame of regressor mappings +#' @noRd +external_regressor_mapping <- function(data, + combo_variables, + external_regressors) { + + # create var combinations list + var_combinations <- tibble::tibble() + + for (number in 2:min(length(combo_variables), 10)) { + temp <- data.frame(gtools::combinations(v = combo_variables, n = length(combo_variables), r = number)) + + temp <- temp %>% + tidyr::unite(Var_Combo, colnames(temp), sep = "---") %>% + dplyr::select(Var_Combo) %>% + tibble::tibble() + + var_combinations <- rbind(var_combinations, temp) + } + + iter_list <- var_combinations %>% + dplyr::pull(Var_Combo) %>% + c(combo_variables) + + # get final counts per var per regressor + regressor_unique_tbl <- foreach::foreach( + regressor = external_regressors, + .combine = "rbind", + .errorhandling = "stop", + .verbose = FALSE, + .inorder = FALSE, + .multicombine = TRUE, + .noexport = NULL + ) %do% { + + var_unique_tbl <- foreach::foreach( + var = iter_list, + .combine = "rbind", + .errorhandling = "stop", + .verbose = FALSE, + .inorder = FALSE, + .multicombine = TRUE, + .noexport = NULL + ) %do% + { + var_list <- strsplit(var, split = "---")[[1]] + + if(length(var_list) == length(combo_variables)) { + var <- "All" + } + + temp_unique <- data %>% + tidyr::unite(Unique, tidyselect::all_of(c(var_list, "Date", regressor)), sep = "_") %>% + dplyr::pull(Unique) %>% + unique() %>% + length() + + return(data.frame(Var = var, Unique = temp_unique)) + } + + if(length(unique(var_unique_tbl$Unique)) > 1) { + all_unique <- var_unique_tbl %>% + dplyr::filter(Var == "All") %>% + dplyr::pull(Unique) + + regressor_test <- var_unique_tbl %>% + dplyr::filter(Unique < all_unique) %>% + dplyr::pull(Var) + + if(length(regressor_test) > 1) { + regressor_test <- "Global" + } + + return(data.frame(Regressor = regressor, Var = regressor_test)) + } else { + return(data.frame(Regressor = regressor, Var = "All")) + } + } + + return(regressor_unique_tbl) +} + +#' Create hierarchical aggregations +#' +#' @param bottom_level_tbl bottom level table +#' @param hts_nodes hts nodes +#' @param sum_var column to get aggregated +#' @param forecast_approach forecast approach +#' @param frequency_number frequency number +#' @param return_type return type +#' +#' @return data frame of hierarchical aggregations +#' @noRd +sum_hts_data <- function(bottom_level_tbl, + hts_nodes, + sum_var, + forecast_approach, + frequency_number, + return_type = "data") { + + # create aggregations for target variable + Date <- bottom_level_tbl$Date + + hierarchical_object <- bottom_level_tbl %>% + dplyr::select(-Date) %>% + stats::ts(frequency = frequency_number) %>% + get_hts( + hts_nodes, + forecast_approach + ) + + hts_nodes_final <- get_hts_nodes( + hierarchical_object, + forecast_approach + ) + + if(return_type == "nodes") { + return(hts_nodes_final) + } + + hierarchical_tbl <- hierarchical_object %>% + hts::allts() %>% + data.frame() %>% + tibble::add_column( + Date = Date, + .before = 1 + ) %>% + tidyr::pivot_longer(!Date, + names_to = "Combo", + values_to = sum_var + ) %>% + dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + + return(hierarchical_tbl) +} diff --git a/R/prep_data.R b/R/prep_data.R index 3d3533d0..90fa6b1e 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -175,6 +175,7 @@ prep_data <- function(run_info, ) %>% prep_hierarchical_data(run_info, combo_variables, + external_regressors, forecast_approach, frequency_number = get_frequency_number(date_type) ) diff --git a/vignettes/hierarchical-forecasting.Rmd b/vignettes/hierarchical-forecasting.Rmd index 4c596dd3..bc7027d2 100644 --- a/vignettes/hierarchical-forecasting.Rmd +++ b/vignettes/hierarchical-forecasting.Rmd @@ -71,7 +71,9 @@ It would be hard to aggregate the above data in a traditional hierarchy. The sam ### External Regressors -Currently finnts does not allow for external regressors to be used in a hierarchical forecast. This is something we plan to implement in a future release. If you select a hierarchical forecast method and include external regressor input values, Finn will ignore the external regressors and will not use them in modeling. +The aggregation process is automatically calculated for each external regressor, depending on how the regressor maps to the combo variables. If a regressor is unique for each time series, then the standard aggregation process is implemented (same as the process for the target variable). If the regressor repeats the same values across all time series, then only one global copy is retained and applied to all new aggregated time series. If the regressor specifically maps to one regressor, then the relationship is respected while still being able to aggregate to the total level. + +Explore the final results of the aggregations by seeing the end result using `get_prepped_data()`. ### Spark Parallel Processing From 8ca6c3b50b8d9db2fb09e3efe4df9b255c1d0ef3 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Thu, 22 Feb 2024 10:18:18 -0800 Subject: [PATCH 02/18] update hts driver aggregations for regressors that map to multiple combo variables --- DESCRIPTION | 2 +- NEWS.md | 6 +- R/hierarchy.R | 96 +++++++++++++++++++------- vignettes/hierarchical-forecasting.Rmd | 2 +- 4 files changed, 78 insertions(+), 28 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index eaf30d1e..8228925b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: finnts Title: Microsoft Finance Time Series Forecasting Framework -Version: 0.4.0.9000 +Version: 0.4.0.9001 Authors@R: c(person(given = "Mike", family = "Tokic", diff --git a/NEWS.md b/NEWS.md index b21935f0..0198b1cd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,13 @@ -# finnts 0.4.0.9000 (DEVELOPMENT VERSION) +# finnts 0.4.0.9001 (DEVELOPMENT VERSION) ## Improvements - Added support for hierarchical forecasting with external regressors +## Bug Fixes + +- Error in run_type column join in final forecast output + # finnts 0.4.0 ## Improvements diff --git a/R/hierarchy.R b/R/hierarchy.R index b9092e97..588c1101 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -98,37 +98,64 @@ prep_hierarchical_data <- function(input_data, dplyr::select(Combo, Date) %>% dplyr::filter(Combo %in% bottom_combos) %>% dplyr::left_join(bottom_tbl, by = c("Combo", "Date")) - - # agg by value level - temp_tbl <- input_data_adj %>% - dplyr::select(Date, value_level, regressor_var) %>% - dplyr::distinct() - - colnames(temp_tbl) <- c("Date", "Combo", regressor_var) - - temp_tbl$Combo <- paste0(value_level, "_", temp_tbl$Combo) - temp_tbl <- temp_tbl %>% - dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) - - temp_combos <- unique(temp_tbl$Combo) + # agg by specific combo variable level + value_level <- strsplit(value_level, split = "---")[[1]] + + hier_temp_tbl_2 <- foreach::foreach( + value_level_iter = value_level, + .combine = "rbind", + .errorhandling = "stop", + .verbose = FALSE, + .inorder = FALSE, + .multicombine = TRUE, + .noexport = NULL + ) %do% { + + temp_tbl <- input_data_adj %>% + dplyr::select(Date, value_level_iter, regressor_var) %>% + dplyr::distinct() + + if(length(value_level) > 1) { + temp_tbl <- temp_tbl %>% + dplyr::group_by(dplyr::across(tidyselect::all_of(c("Date", value_level_iter)))) %>% + dplyr::summarise(Value = sum(.data[[regressor_var]], na.rm = TRUE)) %>% + dplyr::ungroup() + + names(temp_tbl)[names(temp_tbl) == "Value"] <- regressor_var + } + + colnames(temp_tbl) <- c("Date", "Combo", regressor_var) + + temp_tbl$Combo <- paste0(value_level_iter, "_", temp_tbl$Combo) + + temp_tbl <- temp_tbl %>% + dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) + + temp_combos <- unique(temp_tbl$Combo) + + hier_temp_tbl <- hierarchical_tbl %>% + dplyr::select(Combo, Date) %>% + dplyr::filter(Combo %in% temp_combos) %>% + dplyr::distinct() %>% + dplyr::left_join(temp_tbl, by = c("Combo", "Date")) - hier_temp_tbl_2 <- hierarchical_tbl %>% - dplyr::select(Combo, Date) %>% - dplyr::filter(Combo %in% temp_combos) %>% - dplyr::left_join(temp_tbl, by = c("Combo", "Date")) + return(hier_temp_tbl) + } # agg by total - total_tbl <- temp_tbl %>% + total_tbl <- input_data_adj %>% + dplyr::select(Date, value_level[[1]], regressor_var) %>% + dplyr::distinct() %>% dplyr::group_by(Date) %>% dplyr::rename("Agg" = regressor_var) %>% - dplyr::summarise(Agg = sum(Agg)) + dplyr::summarise(Agg = sum(Agg, na.rm = TRUE)) colnames(total_tbl)[colnames(total_tbl) == "Agg"] <- regressor_var hier_temp_tbl_3 <- hierarchical_tbl %>% dplyr::select(Combo, Date) %>% - dplyr::filter(Combo %in% setdiff(unique(hierarchical_tbl$Combo), c(temp_combos, bottom_combos))) %>% + dplyr::filter(Combo %in% setdiff(unique(hierarchical_tbl$Combo), c(unique(hier_temp_tbl_2$Combo), bottom_combos))) %>% dplyr::left_join(total_tbl, by = c("Date")) # combine together @@ -864,8 +891,8 @@ external_regressor_mapping <- function(data, dplyr::pull(Var_Combo) %>% c(combo_variables) - # get final counts per var per regressor - regressor_unique_tbl <- foreach::foreach( + # get final mapping of regressor to combo var level + regressor_mapping_tbl <- foreach::foreach( regressor = external_regressors, .combine = "rbind", .errorhandling = "stop", @@ -875,6 +902,7 @@ external_regressor_mapping <- function(data, .noexport = NULL ) %do% { + # get unique values of regressor per combo variable iteration var_unique_tbl <- foreach::foreach( var = iter_list, .combine = "rbind", @@ -899,7 +927,8 @@ external_regressor_mapping <- function(data, return(data.frame(Var = var, Unique = temp_unique)) } - + + # determine regressor mappings if(length(unique(var_unique_tbl$Unique)) > 1) { all_unique <- var_unique_tbl %>% dplyr::filter(Var == "All") %>% @@ -910,7 +939,24 @@ external_regressor_mapping <- function(data, dplyr::pull(Var) if(length(regressor_test) > 1) { - regressor_test <- "Global" + + combo_unique <- var_unique_tbl %>% + dplyr::filter(Var %in% combo_variables) + + if(length(unique(combo_unique$Unique)) == 1) { + regressor_test <- "Global" + } else { + + min_val <- min(unique(combo_unique$Unique)) + + regressor_test <- combo_unique %>% + dplyr::filter(Unique == min_val) %>% + dplyr::pull(Var) + } + } + + if(length(regressor_test) > 1) { + regressor_test <- paste0(regressor_test, collapse = "---") } return(data.frame(Regressor = regressor, Var = regressor_test)) @@ -919,7 +965,7 @@ external_regressor_mapping <- function(data, } } - return(regressor_unique_tbl) + return(regressor_mapping_tbl) } #' Create hierarchical aggregations diff --git a/vignettes/hierarchical-forecasting.Rmd b/vignettes/hierarchical-forecasting.Rmd index bc7027d2..0bfd8dfb 100644 --- a/vignettes/hierarchical-forecasting.Rmd +++ b/vignettes/hierarchical-forecasting.Rmd @@ -71,7 +71,7 @@ It would be hard to aggregate the above data in a traditional hierarchy. The sam ### External Regressors -The aggregation process is automatically calculated for each external regressor, depending on how the regressor maps to the combo variables. If a regressor is unique for each time series, then the standard aggregation process is implemented (same as the process for the target variable). If the regressor repeats the same values across all time series, then only one global copy is retained and applied to all new aggregated time series. If the regressor specifically maps to one regressor, then the relationship is respected while still being able to aggregate to the total level. +The aggregation process is automatically calculated for each external regressor, depending on how the regressor maps to the combo variables. If a regressor is unique for each time series, then the standard aggregation process is implemented (same as the process for the target variable). If the regressor repeats the same values across all time series, then only one global copy is retained and applied to all new aggregated time series. If the regressor specifically maps to one or more combo variables, then the relationship is respected while still being able to aggregate to the total level. Aggregations of drivers are always summed. Explore the final results of the aggregations by seeing the end result using `get_prepped_data()`. From e61a9f424af941c0c01b011d290b4605d88d5177 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Thu, 22 Feb 2024 10:55:44 -0800 Subject: [PATCH 03/18] tidyselect fixes --- R/hierarchy.R | 6 +++--- R/prep_data.R | 32 ++++++++++++++++---------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/R/hierarchy.R b/R/hierarchy.R index 588c1101..8556ef77 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -85,7 +85,7 @@ prep_hierarchical_data <- function(input_data, # agg by lowest level bottom_tbl <- input_data_adj %>% tidyr::unite("Combo", - combo_variables, + tidyselect::all_of(combo_variables), sep = "_", remove = F ) %>% @@ -535,7 +535,7 @@ reconcile_hierarchical_data <- function(run_info, ts <- forecast_tbl %>% dplyr::select(-Date, -Train_Test_ID) %>% - dplyr::select(hts_combo_list) %>% + dplyr::select(tidyselect::all_of(hts_combo_list)) %>% stats::ts() residual_multiplier <- 10 # shrink extra large residuals to prevent recon issues @@ -552,7 +552,7 @@ reconcile_hierarchical_data <- function(run_info, dplyr::select(Combo, Date, Train_Test_ID, Residual) %>% tidyr::pivot_wider(names_from = Combo, values_from = Residual) %>% dplyr::select(-Date, -Train_Test_ID) %>% - dplyr::select(hts_combo_list) %>% + dplyr::select(tidyselect::all_of(hts_combo_list)) %>% as.matrix() if (forecast_approach == "standard_hierarchy") { diff --git a/R/prep_data.R b/R/prep_data.R index 90fa6b1e..eac2a483 100644 --- a/R/prep_data.R +++ b/R/prep_data.R @@ -158,11 +158,11 @@ prep_data <- function(run_info, # prep initial data before feature engineering initial_prep_tbl <- input_data %>% tidyr::unite("Combo", - combo_variables, + tidyselect::all_of(combo_variables), sep = "--", remove = F ) %>% - dplyr::rename("Target" = target_variable) %>% + dplyr::rename("Target" = tidyselect::all_of(target_variable)) %>% dplyr::select(c( "Combo", tidyselect::all_of(combo_variables), @@ -1044,8 +1044,8 @@ apply_box_cox <- function(df) { # Only check numeric columns with more than 2 unique values if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) { temp_tbl <- df %>% - dplyr::select(Date, column_name) %>% - dplyr::rename(Column = column_name) + dplyr::select(Date, tidyselect::all_of(column_name)) %>% + dplyr::rename(Column = tidyselect::all_of(column_name)) # get lambda value lambda_value <- timetk::auto_lambda(temp_tbl$Column) @@ -1065,10 +1065,10 @@ apply_box_cox <- function(df) { # clean up names and add to final df colnames(temp_tbl)[colnames(temp_tbl) == "Column"] <- column_name - final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(column_name)) + final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(tidyselect::all_of(column_name))) } else { if (column_name != "Date") { - final_tbl <- cbind(final_tbl, df %>% dplyr::select(column_name)) + final_tbl <- cbind(final_tbl, df %>% dplyr::select(tidyselect::all_of(column_name))) } } } @@ -1096,8 +1096,8 @@ make_stationary <- function(df) { # Only check numeric columns with more than 2 unique values if (is.numeric(df[[column_name]]) & length(unique(df[[column_name]])) > 2) { temp_tbl <- df %>% - dplyr::select(Date, column_name) %>% - dplyr::rename(Column = column_name) + dplyr::select(Date, tidyselect::all_of(column_name)) %>% + dplyr::rename(Column = tidyselect::all_of(column_name)) # check for standard difference ndiffs <- temp_tbl %>% @@ -1124,10 +1124,10 @@ make_stationary <- function(df) { colnames(temp_tbl)[colnames(temp_tbl) == "Column"] <- column_name - final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(column_name)) + final_tbl <- cbind(final_tbl, temp_tbl %>% dplyr::select(tidyselect::all_of(column_name))) } else { if (column_name != "Date") { - final_tbl <- cbind(final_tbl, df %>% dplyr::select(column_name)) + final_tbl <- cbind(final_tbl, df %>% dplyr::select(tidyselect::all_of(column_name))) } } } @@ -1163,7 +1163,7 @@ multivariate_prep_recipe_1 <- function(data, df_poly <- data for (column in c("Target", external_regressors)) { - if (is.numeric(dplyr::select(data, column)[[1]])) { + if (is.numeric(dplyr::select(data, tidyselect::all_of(column))[[1]])) { column_names_final <- c(column) if ((column %in% external_regressors) & !(column %in% xregs_future_values_list)) { @@ -1173,7 +1173,7 @@ multivariate_prep_recipe_1 <- function(data, if (column %in% external_regressors) { df_poly_column <- data %>% - dplyr::select(column) %>% + dplyr::select(tidyselect::all_of(column)) %>% dplyr::rename(Col = column) temp_squared <- df_poly_column^2 @@ -1241,7 +1241,7 @@ multivariate_prep_recipe_1 <- function(data, ) %>% timetk::tk_augment_fourier(Date, .periods = fourier_periods, .K = 2) %>% # add fourier series tidyr::fill(tidyselect::contains("_roll"), .direction = "down") %>% - dplyr::select(-numeric_xregs) + dplyr::select(-tidyselect::all_of(numeric_xregs)) is.na(data_lag_window) <- sapply( data_lag_window, @@ -1285,7 +1285,7 @@ multivariate_prep_recipe_2 <- function(data, df_poly <- data for (column in c("Target", external_regressors)) { - if (is.numeric(dplyr::select(data, column)[[1]])) { + if (is.numeric(dplyr::select(data, tidyselect::all_of(column))[[1]])) { column_names_final <- c(column) if ((column %in% external_regressors) & !(column %in% xregs_future_values_list)) { @@ -1295,7 +1295,7 @@ multivariate_prep_recipe_2 <- function(data, if (column %in% external_regressors) { df_poly_column <- data %>% - dplyr::select(column) %>% + dplyr::select(tidyselect::all_of(column)) %>% dplyr::rename(Col = column) temp_squared <- df_poly_column^2 @@ -1377,7 +1377,7 @@ multivariate_prep_recipe_2 <- function(data, ) %>% tidyr::fill(tidyselect::contains("_roll"), .direction = "down") %>% timetk::tk_augment_fourier(Date, .periods = fourier_periods, .K = 2) %>% # add fourier series - dplyr::select(-numeric_xregs) # drop xregs that do not contain future values + dplyr::select(-tidyselect::all_of(numeric_xregs)) # drop xregs that do not contain future values is.na(data_lag_window) <- sapply( data_lag_window, From f0f7cdcb7c0449e17281e6be37e14d5ecbbe8b30 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 26 Feb 2024 16:30:36 -0800 Subject: [PATCH 04/18] fix issue with standard hierarchy --- R/hierarchy.R | 20 +++++++++----------- vignettes/hierarchical-forecasting.Rmd | 2 ++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/hierarchy.R b/R/hierarchy.R index 8556ef77..6bd223b0 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -901,7 +901,7 @@ external_regressor_mapping <- function(data, .multicombine = TRUE, .noexport = NULL ) %do% { - + # get unique values of regressor per combo variable iteration var_unique_tbl <- foreach::foreach( var = iter_list, @@ -930,6 +930,7 @@ external_regressor_mapping <- function(data, # determine regressor mappings if(length(unique(var_unique_tbl$Unique)) > 1) { + all_unique <- var_unique_tbl %>% dplyr::filter(Var == "All") %>% dplyr::pull(Unique) @@ -938,21 +939,18 @@ external_regressor_mapping <- function(data, dplyr::filter(Unique < all_unique) %>% dplyr::pull(Var) - if(length(regressor_test) > 1) { + if(length(unique(data$Date)) == data %>% dplyr::select(Date, tidyselect::all_of(regressor)) %>% dplyr::distinct() %>% nrow()) { + regressor_test <- "Global" + } else if(length(regressor_test) > 1) { combo_unique <- var_unique_tbl %>% dplyr::filter(Var %in% combo_variables) - - if(length(unique(combo_unique$Unique)) == 1) { - regressor_test <- "Global" - } else { - min_val <- min(unique(combo_unique$Unique)) + min_val <- min(unique(combo_unique$Unique)) - regressor_test <- combo_unique %>% - dplyr::filter(Unique == min_val) %>% - dplyr::pull(Var) - } + regressor_test <- combo_unique %>% + dplyr::filter(Unique == min_val) %>% + dplyr::pull(Var) } if(length(regressor_test) > 1) { diff --git a/vignettes/hierarchical-forecasting.Rmd b/vignettes/hierarchical-forecasting.Rmd index 0bfd8dfb..f358456d 100644 --- a/vignettes/hierarchical-forecasting.Rmd +++ b/vignettes/hierarchical-forecasting.Rmd @@ -73,6 +73,8 @@ It would be hard to aggregate the above data in a traditional hierarchy. The sam The aggregation process is automatically calculated for each external regressor, depending on how the regressor maps to the combo variables. If a regressor is unique for each time series, then the standard aggregation process is implemented (same as the process for the target variable). If the regressor repeats the same values across all time series, then only one global copy is retained and applied to all new aggregated time series. If the regressor specifically maps to one or more combo variables, then the relationship is respected while still being able to aggregate to the total level. Aggregations of drivers are always summed. +The only limitation is when an external regressor maps to the middle of a standard hierarchy (not at the global level or bottom level), in this case the regressor will be summed at the global level across the hierarchy. This is due to limitations of aggregation naming in the hts package used in finnts. + Explore the final results of the aggregations by seeing the end result using `get_prepped_data()`. ### Spark Parallel Processing From 0eba0c8df06612370ac800b66a1ba6bf6db90535 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 26 Feb 2024 17:03:47 -0800 Subject: [PATCH 05/18] tidyselect updates --- R/forecast_time_series.R | 8 ++++---- R/hierarchy.R | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/forecast_time_series.R b/R/forecast_time_series.R index ad57c33d..3177edd2 100644 --- a/R/forecast_time_series.R +++ b/R/forecast_time_series.R @@ -266,11 +266,11 @@ forecast_backwards_compatibility <- function(run_info, dplyr::collect() %>% tidyr::unite( col = "Combo", - combo_variables, + tidyselect::all_of(combo_variables), sep = "---", remove = FALSE ) %>% - dplyr::rename(Target = target_variable) %>% + dplyr::rename(Target = tidyselect::all_of(target_variable)) %>% dplyr::mutate( Model_ID = NA, lo_95 = Target, @@ -278,7 +278,7 @@ forecast_backwards_compatibility <- function(run_info, hi_80 = Target, hi_95 = Target ) %>% - dplyr::select(Combo, combo_variables, Model_ID, Date, Target, lo_95, lo_80, hi_80, hi_95) + dplyr::select(Combo, tidyselect::all_of(combo_variables), Model_ID, Date, Target, lo_95, lo_80, hi_80, hi_95) ) %>% dplyr::mutate(Type = ifelse(is.na(Model_ID), "Historical", "Forecast")) %>% dplyr::relocate(Type, .before = Date) %>% @@ -298,7 +298,7 @@ forecast_backwards_compatibility <- function(run_info, dplyr::filter(Run_Type == "Back_Test") %>% dplyr::mutate(MAPE = abs((Forecast - Target) / Target)) %>% dplyr::select( - Combo, combo_variables, Train_Test_ID, Date, Model_ID, Horizon, + Combo, tidyselect::all_of(combo_variables), Train_Test_ID, Date, Model_ID, Horizon, Forecast, Target, MAPE, Best_Model ) %>% dplyr::rename( diff --git a/R/hierarchy.R b/R/hierarchy.R index 6bd223b0..b4c247ad 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -880,7 +880,7 @@ external_regressor_mapping <- function(data, temp <- data.frame(gtools::combinations(v = combo_variables, n = length(combo_variables), r = number)) temp <- temp %>% - tidyr::unite(Var_Combo, colnames(temp), sep = "---") %>% + tidyr::unite(Var_Combo, tidyselect::all_of(colnames(temp)), sep = "---") %>% dplyr::select(Var_Combo) %>% tibble::tibble() From ba95ef4cff982668e48679acaf5a5559cf533e13 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 26 Feb 2024 17:35:46 -0800 Subject: [PATCH 06/18] code formatting --- R/hierarchy.R | 197 +++++++++++++++++++++++++------------------------- 1 file changed, 99 insertions(+), 98 deletions(-) diff --git a/R/hierarchy.R b/R/hierarchy.R index b4c247ad..c392b0bd 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -45,17 +45,21 @@ prep_hierarchical_data <- function(input_data, base::suppressWarnings() # create aggregations for target variable - hierarchical_tbl <- sum_hts_data(bottom_level_tbl, - hts_nodes, - "Target", - forecast_approach, - frequency_number) - + hierarchical_tbl <- sum_hts_data( + bottom_level_tbl, + hts_nodes, + "Target", + forecast_approach, + frequency_number + ) + # create aggregations for external regressors - if(!is.null(external_regressors)) { - regressor_mapping <- external_regressor_mapping(input_data_adj, - combo_variables, - external_regressors) + if (!is.null(external_regressors)) { + regressor_mapping <- external_regressor_mapping( + input_data_adj, + combo_variables, + external_regressors + ) regressor_agg <- foreach::foreach( regressor_tbl = regressor_mapping %>% @@ -67,38 +71,35 @@ prep_hierarchical_data <- function(input_data, .multicombine = TRUE, .noexport = NULL ) %do% { - regressor_var <- regressor_tbl$Regressor value_level <- regressor_tbl$Var - - if(value_level == "Global") { - + + if (value_level == "Global") { temp_tbl <- input_data_adj %>% dplyr::select(Date, regressor_var) %>% dplyr::distinct() - + hierarchical_tbl <- hierarchical_tbl %>% dplyr::left_join(temp_tbl, by = c("Date")) - - } else if(value_level != "All") { - + } else if (value_level != "All") { + # agg by lowest level bottom_tbl <- input_data_adj %>% tidyr::unite("Combo", - tidyselect::all_of(combo_variables), - sep = "_", - remove = F + tidyselect::all_of(combo_variables), + sep = "_", + remove = F ) %>% dplyr::select(Date, Combo, regressor_var) %>% dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) - + bottom_combos <- unique(bottom_tbl$Combo) - + hier_temp_tbl_1 <- hierarchical_tbl %>% dplyr::select(Combo, Date) %>% dplyr::filter(Combo %in% bottom_combos) %>% dplyr::left_join(bottom_tbl, by = c("Combo", "Date")) - + # agg by specific combo variable level value_level <- strsplit(value_level, split = "---")[[1]] @@ -111,29 +112,28 @@ prep_hierarchical_data <- function(input_data, .multicombine = TRUE, .noexport = NULL ) %do% { - temp_tbl <- input_data_adj %>% dplyr::select(Date, value_level_iter, regressor_var) %>% dplyr::distinct() - - if(length(value_level) > 1) { + + if (length(value_level) > 1) { temp_tbl <- temp_tbl %>% dplyr::group_by(dplyr::across(tidyselect::all_of(c("Date", value_level_iter)))) %>% dplyr::summarise(Value = sum(.data[[regressor_var]], na.rm = TRUE)) %>% dplyr::ungroup() - + names(temp_tbl)[names(temp_tbl) == "Value"] <- regressor_var } - + colnames(temp_tbl) <- c("Date", "Combo", regressor_var) - + temp_tbl$Combo <- paste0(value_level_iter, "_", temp_tbl$Combo) - + temp_tbl <- temp_tbl %>% dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) temp_combos <- unique(temp_tbl$Combo) - + hier_temp_tbl <- hierarchical_tbl %>% dplyr::select(Combo, Date) %>% dplyr::filter(Combo %in% temp_combos) %>% @@ -150,7 +150,7 @@ prep_hierarchical_data <- function(input_data, dplyr::group_by(Date) %>% dplyr::rename("Agg" = regressor_var) %>% dplyr::summarise(Agg = sum(Agg, na.rm = TRUE)) - + colnames(total_tbl)[colnames(total_tbl) == "Agg"] <- regressor_var hier_temp_tbl_3 <- hierarchical_tbl %>% @@ -161,12 +161,10 @@ prep_hierarchical_data <- function(input_data, # combine together hierarchical_tbl <- hierarchical_tbl %>% dplyr::left_join( - rbind(hier_temp_tbl_1, hier_temp_tbl_2, hier_temp_tbl_3), + rbind(hier_temp_tbl_1, hier_temp_tbl_2, hier_temp_tbl_3), by = c("Combo", "Date") ) - - } else if(value_level == "All") { - + } else if (value_level == "All") { bottom_level_temp_tbl <- input_data_adj %>% dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% tidyr::pivot_wider( @@ -175,30 +173,33 @@ prep_hierarchical_data <- function(input_data, ) %>% dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% base::suppressWarnings() - - temp_tbl <- sum_hts_data(bottom_level_temp_tbl, - hts_nodes, - regressor_var, - forecast_approach, - frequency_number) + + temp_tbl <- sum_hts_data( + bottom_level_temp_tbl, + hts_nodes, + regressor_var, + forecast_approach, + frequency_number + ) hierarchical_tbl <- hierarchical_tbl %>% dplyr::left_join(temp_tbl, by = c("Date", "Combo")) } return(regressor_tbl) - } + } } # write hierarchy structure to disk hts_list <- list( original_combos = colnames(bottom_level_tbl %>% dplyr::select(-Date)), hts_combos = hierarchical_tbl %>% dplyr::pull(Combo) %>% unique(), - nodes = sum_hts_data(bottom_level_tbl, - hts_nodes, - "Target", - forecast_approach, - frequency_number, - return_type = "nodes") + nodes = sum_hts_data(bottom_level_tbl, + hts_nodes, + "Target", + forecast_approach, + frequency_number, + return_type = "nodes" + ) ) write_data( @@ -869,28 +870,28 @@ reconcile_hierarchical_data <- function(run_info, #' #' @return data frame of regressor mappings #' @noRd -external_regressor_mapping <- function(data, - combo_variables, +external_regressor_mapping <- function(data, + combo_variables, external_regressors) { - + # create var combinations list var_combinations <- tibble::tibble() - + for (number in 2:min(length(combo_variables), 10)) { temp <- data.frame(gtools::combinations(v = combo_variables, n = length(combo_variables), r = number)) - + temp <- temp %>% tidyr::unite(Var_Combo, tidyselect::all_of(colnames(temp)), sep = "---") %>% dplyr::select(Var_Combo) %>% tibble::tibble() - + var_combinations <- rbind(var_combinations, temp) } - + iter_list <- var_combinations %>% dplyr::pull(Var_Combo) %>% c(combo_variables) - + # get final mapping of regressor to combo var level regressor_mapping_tbl <- foreach::foreach( regressor = external_regressors, @@ -911,58 +912,58 @@ external_regressor_mapping <- function(data, .inorder = FALSE, .multicombine = TRUE, .noexport = NULL - ) %do% - { - var_list <- strsplit(var, split = "---")[[1]] - - if(length(var_list) == length(combo_variables)) { - var <- "All" - } - - temp_unique <- data %>% - tidyr::unite(Unique, tidyselect::all_of(c(var_list, "Date", regressor)), sep = "_") %>% - dplyr::pull(Unique) %>% - unique() %>% - length() - - return(data.frame(Var = var, Unique = temp_unique)) + ) %do% { + var_list <- strsplit(var, split = "---")[[1]] + + if (length(var_list) == length(combo_variables)) { + var <- "All" } + temp_unique <- data %>% + tidyr::unite(Unique, tidyselect::all_of(c(var_list, "Date", regressor)), sep = "_") %>% + dplyr::pull(Unique) %>% + unique() %>% + length() + + return(data.frame(Var = var, Unique = temp_unique)) + } + # determine regressor mappings - if(length(unique(var_unique_tbl$Unique)) > 1) { - + if (length(unique(var_unique_tbl$Unique)) > 1) { all_unique <- var_unique_tbl %>% dplyr::filter(Var == "All") %>% dplyr::pull(Unique) - + regressor_test <- var_unique_tbl %>% dplyr::filter(Unique < all_unique) %>% dplyr::pull(Var) - - if(length(unique(data$Date)) == data %>% dplyr::select(Date, tidyselect::all_of(regressor)) %>% dplyr::distinct() %>% nrow()) { + + if (length(unique(data$Date)) == data %>% + dplyr::select(Date, tidyselect::all_of(regressor)) %>% + dplyr::distinct() %>% + nrow()) { regressor_test <- "Global" - } else if(length(regressor_test) > 1) { - + } else if (length(regressor_test) > 1) { combo_unique <- var_unique_tbl %>% dplyr::filter(Var %in% combo_variables) - + min_val <- min(unique(combo_unique$Unique)) - + regressor_test <- combo_unique %>% dplyr::filter(Unique == min_val) %>% dplyr::pull(Var) } - - if(length(regressor_test) > 1) { + + if (length(regressor_test) > 1) { regressor_test <- paste0(regressor_test, collapse = "---") } - + return(data.frame(Regressor = regressor, Var = regressor_test)) } else { return(data.frame(Regressor = regressor, Var = "All")) } } - + return(regressor_mapping_tbl) } @@ -977,16 +978,16 @@ external_regressor_mapping <- function(data, #' #' @return data frame of hierarchical aggregations #' @noRd -sum_hts_data <- function(bottom_level_tbl, +sum_hts_data <- function(bottom_level_tbl, hts_nodes, sum_var, - forecast_approach, - frequency_number, + forecast_approach, + frequency_number, return_type = "data") { - + # create aggregations for target variable Date <- bottom_level_tbl$Date - + hierarchical_object <- bottom_level_tbl %>% dplyr::select(-Date) %>% stats::ts(frequency = frequency_number) %>% @@ -994,16 +995,16 @@ sum_hts_data <- function(bottom_level_tbl, hts_nodes, forecast_approach ) - + hts_nodes_final <- get_hts_nodes( hierarchical_object, forecast_approach ) - - if(return_type == "nodes") { + + if (return_type == "nodes") { return(hts_nodes_final) } - + hierarchical_tbl <- hierarchical_object %>% hts::allts() %>% data.frame() %>% @@ -1012,10 +1013,10 @@ sum_hts_data <- function(bottom_level_tbl, .before = 1 ) %>% tidyr::pivot_longer(!Date, - names_to = "Combo", - values_to = sum_var + names_to = "Combo", + values_to = sum_var ) %>% dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) - + return(hierarchical_tbl) } From 82dbbd1cd7d2a18dbd906a3e230ffa11e4f48740 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 27 Feb 2024 18:31:50 -0800 Subject: [PATCH 07/18] add hts drivers unit tests --- R/hierarchy.R | 10 ++--- tests/testthat/test-hierarchical.R | 61 ++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 tests/testthat/test-hierarchical.R diff --git a/R/hierarchy.R b/R/hierarchy.R index c392b0bd..4f1f7ad3 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -76,7 +76,7 @@ prep_hierarchical_data <- function(input_data, if (value_level == "Global") { temp_tbl <- input_data_adj %>% - dplyr::select(Date, regressor_var) %>% + dplyr::select(Date, tidyselect::all_of(regressor_var)) %>% dplyr::distinct() hierarchical_tbl <- hierarchical_tbl %>% @@ -90,7 +90,7 @@ prep_hierarchical_data <- function(input_data, sep = "_", remove = F ) %>% - dplyr::select(Date, Combo, regressor_var) %>% + dplyr::select(Date, Combo, tidyselect::all_of(regressor_var)) %>% dplyr::mutate(Combo = snakecase::to_any_case(Combo, case = "none")) bottom_combos <- unique(bottom_tbl$Combo) @@ -113,7 +113,7 @@ prep_hierarchical_data <- function(input_data, .noexport = NULL ) %do% { temp_tbl <- input_data_adj %>% - dplyr::select(Date, value_level_iter, regressor_var) %>% + dplyr::select(Date, tidyselect::all_of(value_level_iter), tidyselect::all_of(regressor_var)) %>% dplyr::distinct() if (length(value_level) > 1) { @@ -145,10 +145,10 @@ prep_hierarchical_data <- function(input_data, # agg by total total_tbl <- input_data_adj %>% - dplyr::select(Date, value_level[[1]], regressor_var) %>% + dplyr::select(Date, value_level[[1]], tidyselect::all_of(regressor_var)) %>% dplyr::distinct() %>% dplyr::group_by(Date) %>% - dplyr::rename("Agg" = regressor_var) %>% + dplyr::rename("Agg" = tidyselect::all_of(regressor_var)) %>% dplyr::summarise(Agg = sum(Agg, na.rm = TRUE)) colnames(total_tbl)[colnames(total_tbl) == "Agg"] <- regressor_var diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R new file mode 100644 index 00000000..13744669 --- /dev/null +++ b/tests/testthat/test-hierarchical.R @@ -0,0 +1,61 @@ + +test_that("prep_hierarchical_data returns correct grouped hierarchies", { + # Mock data setup + # Sample data creation with 2 regions and 4 products + set.seed(123) # For reproducibility + data <- tibble::tibble( + Region = rep(c("North", "South"), each = 50), # 2 regions + Product = rep(c("A", "B", "C", "D"), times = 25), # 4 products repeated across regions + Date = rep(seq(as.Date("2023-01-01"), length = 25, by = "month"), 4), + Target = round(runif(100, 100, 500)), # Random target data + All_Driver = round(runif(100, 100, 500)) # Random driver data for every time series + ) + + # Generate unique combinations of Region and Date with Region_Driver + region_data <- data %>% + dplyr::select(Region, Date) %>% + dplyr::distinct() %>% + dplyr::mutate(Region_Driver = round(runif(dplyr::n(), 500000, 2500000))) # Random region-specific driver + + # Generate global driver data unique for each Date + global_driver_data <- data %>% + dplyr::select(Date) %>% + dplyr::distinct() %>% + dplyr::mutate(Global_Driver = round(runif(dplyr::n(), 100, 500))) # Random global driver + + # Join the region driver data back to the main dataset + data_with_region_driver <- data %>% + dplyr::left_join(region_data, by = c("Region", "Date")) + + # Join the global driver data back to the dataset + final_data <- data_with_region_driver %>% + dplyr::left_join(global_driver_data, by = "Date") %>% + tidyr::unite("Combo", + c("Region", "Product"), + sep = "--", + remove = F + ) + + # run prep hts function + result_data <- prep_hierarchical_data(input_data = final_data, + run_info = set_run_info(), + combo_variables = c("Region", "Product"), + external_regressors = c("All_Driver", "Region_Driver", "Global_Driver"), + forecast_approach = "grouped_hierarchy", + frequency_number = 12) %>% + dplyr::filter(Date == "2023-01-01") + + # Expected output setup + expected_data <- tibble::tibble( + Combo = c("Total", "Region_North", "Region_South", "Product_A", "Product_B", "Product_C", "Product_D", + "North_A", "North_B", "North_C", "North_D", "South_C", "South_D", "South_A", "South_B"), + Date = as.Date(rep("2023-01-01", 15)), + Target = c(904, 598, 306, 215, 383, 118, 188, 215, 383, 0, 0, 118, 188, 0, 0), + All_Driver = c(1620, 834, 786, 340, 494, 439, 347, 340, 494, 0, 0, 439, 347, 0, 0), + Region_Driver = c(2267892, 977452, 1290440, 2267892, 2267892, 2267892, 2267892, 977452, 977452, NA, NA, 1290440, 1290440, NA, NA), + Global_Driver = rep(203, 15) + ) + + # Assertions + expect_equal(result_data, expected_data) +}) From b32c3121d37954988bdfc4174646911968efa265 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Tue, 27 Feb 2024 19:00:34 -0800 Subject: [PATCH 08/18] code formatting --- tests/testthat/test-hierarchical.R | 40 ++++++++++++++++-------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R index 13744669..cefb8cc7 100644 --- a/tests/testthat/test-hierarchical.R +++ b/tests/testthat/test-hierarchical.R @@ -10,52 +10,56 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { Target = round(runif(100, 100, 500)), # Random target data All_Driver = round(runif(100, 100, 500)) # Random driver data for every time series ) - + # Generate unique combinations of Region and Date with Region_Driver region_data <- data %>% dplyr::select(Region, Date) %>% dplyr::distinct() %>% dplyr::mutate(Region_Driver = round(runif(dplyr::n(), 500000, 2500000))) # Random region-specific driver - + # Generate global driver data unique for each Date global_driver_data <- data %>% dplyr::select(Date) %>% dplyr::distinct() %>% dplyr::mutate(Global_Driver = round(runif(dplyr::n(), 100, 500))) # Random global driver - + # Join the region driver data back to the main dataset data_with_region_driver <- data %>% dplyr::left_join(region_data, by = c("Region", "Date")) - + # Join the global driver data back to the dataset final_data <- data_with_region_driver %>% dplyr::left_join(global_driver_data, by = "Date") %>% tidyr::unite("Combo", - c("Region", "Product"), - sep = "--", - remove = F + c("Region", "Product"), + sep = "--", + remove = F ) - + # run prep hts function - result_data <- prep_hierarchical_data(input_data = final_data, - run_info = set_run_info(), - combo_variables = c("Region", "Product"), - external_regressors = c("All_Driver", "Region_Driver", "Global_Driver"), - forecast_approach = "grouped_hierarchy", - frequency_number = 12) %>% + result_data <- prep_hierarchical_data( + input_data = final_data, + run_info = set_run_info(), + combo_variables = c("Region", "Product"), + external_regressors = c("All_Driver", "Region_Driver", "Global_Driver"), + forecast_approach = "grouped_hierarchy", + frequency_number = 12 + ) %>% dplyr::filter(Date == "2023-01-01") - + # Expected output setup expected_data <- tibble::tibble( - Combo = c("Total", "Region_North", "Region_South", "Product_A", "Product_B", "Product_C", "Product_D", - "North_A", "North_B", "North_C", "North_D", "South_C", "South_D", "South_A", "South_B"), + Combo = c( + "Total", "Region_North", "Region_South", "Product_A", "Product_B", "Product_C", "Product_D", + "North_A", "North_B", "North_C", "North_D", "South_C", "South_D", "South_A", "South_B" + ), Date = as.Date(rep("2023-01-01", 15)), Target = c(904, 598, 306, 215, 383, 118, 188, 215, 383, 0, 0, 118, 188, 0, 0), All_Driver = c(1620, 834, 786, 340, 494, 439, 347, 340, 494, 0, 0, 439, 347, 0, 0), Region_Driver = c(2267892, 977452, 1290440, 2267892, 2267892, 2267892, 2267892, 977452, 977452, NA, NA, 1290440, 1290440, NA, NA), Global_Driver = rep(203, 15) ) - + # Assertions expect_equal(result_data, expected_data) }) From 11aa84dffc7b12d5aa5bd8167905f1946a892aa3 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 3 Mar 2024 16:35:33 -0800 Subject: [PATCH 09/18] update hts driver tests --- tests/testthat/test-hierarchical.R | 125 +++++++++++++++++++---------- 1 file changed, 83 insertions(+), 42 deletions(-) diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R index cefb8cc7..13dde456 100644 --- a/tests/testthat/test-hierarchical.R +++ b/tests/testthat/test-hierarchical.R @@ -1,65 +1,106 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { # Mock data setup - # Sample data creation with 2 regions and 4 products - set.seed(123) # For reproducibility data <- tibble::tibble( - Region = rep(c("North", "South"), each = 50), # 2 regions - Product = rep(c("A", "B", "C", "D"), times = 25), # 4 products repeated across regions - Date = rep(seq(as.Date("2023-01-01"), length = 25, by = "month"), 4), - Target = round(runif(100, 100, 500)), # Random target data - All_Driver = round(runif(100, 100, 500)) # Random driver data for every time series - ) - - # Generate unique combinations of Region and Date with Region_Driver - region_data <- data %>% - dplyr::select(Region, Date) %>% - dplyr::distinct() %>% - dplyr::mutate(Region_Driver = round(runif(dplyr::n(), 500000, 2500000))) # Random region-specific driver - - # Generate global driver data unique for each Date - global_driver_data <- data %>% - dplyr::select(Date) %>% - dplyr::distinct() %>% - dplyr::mutate(Global_Driver = round(runif(dplyr::n(), 100, 500))) # Random global driver - - # Join the region driver data back to the main dataset - data_with_region_driver <- data %>% - dplyr::left_join(region_data, by = c("Region", "Date")) - - # Join the global driver data back to the dataset - final_data <- data_with_region_driver %>% - dplyr::left_join(global_driver_data, by = "Date") %>% + Segment = c('Commercial', 'Commercial', 'Commercial', 'Commercial', 'Commercial', 'Commercial', + 'Commercial', 'Commercial', 'Commercial', 'Commercial', 'Commercial', 'Commercial', + 'Consumer', 'Consumer', 'Consumer', 'Consumer', 'Consumer', 'Consumer', + 'Consumer', 'Consumer', 'Consumer', 'Consumer', 'Consumer', 'Consumer'), + Country = c('United (States)', 'United (States)', 'United (States)', 'United (States)', 'United (States)', 'United (States)', + 'UK', 'UK', 'UK', 'UK', 'UK', 'UK', + 'United (States)', 'United (States)', 'United (States)', 'United (States)', 'United (States)', 'United (States)', + 'UK', 'UK', 'UK', 'UK', 'UK', 'UK'), + Product = c('Office', 'Office', 'Office', 'Excel', 'Excel', 'Excel', + 'Office', 'Office', 'Office', 'Excel', 'Excel', 'Excel', + 'Office', 'Office', 'Office', 'Excel', 'Excel', 'Excel', + 'Office', 'Office', 'Office', 'Excel', 'Excel', 'Excel'), + Date = as.Date(c('1/1/2020', '2/1/2020', '3/1/2020', '1/1/2020', '2/1/2020', '3/1/2020', + '1/1/2020', '2/1/2020', '3/1/2020', '1/1/2020', '2/1/2020', '3/1/2020', + '1/1/2020', '2/1/2020', '3/1/2020', '1/1/2020', '2/1/2020', '3/1/2020', + '1/1/2020', '2/1/2020', '3/1/2020', '1/1/2020', '2/1/2020', '3/1/2020'), format="%m/%d/%Y"), + Target = c(1, 2, 3, 13, 14, 15, 25, 26, 27, 37, 38, 39, 1, 2, 3, 13, 14, 15, 25, 26, 27, 37, 38, 39), + Value_Country = c(1, 2, 3, 1, 2, 3, 10, 20, 30, 10, 20, 30, 1, 2, 3, 1, 2, 3, 10, 20, 30, 10, 20, 30), + Value_All = c(1, 2, 3, 37, 38, 39, 73, 74, 75, 109, 110, 111, 145, 146, 147, 181, 182, 183, 217, 218, 219, 253, 254, 255), + Value_Product = c(1, 2, 3, 10, 11, 12, 1, 2, 3, 10, 11, 12, 1, 2, 3, 10, 11, 12, 1, 2, 3, 10, 11, 12), + Value_Global = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), + Value_Segment_Product = c(100, 101, 102, 200, 201, 202, 100, 101, 102, 200, 201, 202, 300, 301, 302, 400, 401, 402, 300, 301, 302, 400, 401, 402) + ) %>% tidyr::unite("Combo", - c("Region", "Product"), + c("Segment", "Country", "Product"), sep = "--", remove = F ) # run prep hts function result_data <- prep_hierarchical_data( - input_data = final_data, + input_data = data, run_info = set_run_info(), - combo_variables = c("Region", "Product"), - external_regressors = c("All_Driver", "Region_Driver", "Global_Driver"), + combo_variables = c("Segment", "Country", "Product"), + external_regressors = c("Value_Country", "Value_All", "Value_Product", "Value_Global", "Value_Segment_Product"), forecast_approach = "grouped_hierarchy", frequency_number = 12 ) %>% - dplyr::filter(Date == "2023-01-01") + dplyr::filter(Date == "2020-01-01") # Expected output setup expected_data <- tibble::tibble( - Combo = c( - "Total", "Region_North", "Region_South", "Product_A", "Product_B", "Product_C", "Product_D", - "North_A", "North_B", "North_C", "North_D", "South_C", "South_D", "South_A", "South_B" - ), - Date = as.Date(rep("2023-01-01", 15)), - Target = c(904, 598, 306, 215, 383, 118, 188, 215, 383, 0, 0, 118, 188, 0, 0), - All_Driver = c(1620, 834, 786, 340, 494, 439, 347, 340, 494, 0, 0, 439, 347, 0, 0), - Region_Driver = c(2267892, 977452, 1290440, 2267892, 2267892, 2267892, 2267892, 977452, 977452, NA, NA, 1290440, 1290440, NA, NA), - Global_Driver = rep(203, 15) + Combo = c("Total", "Segment_Commercial", "Segment_Consumer", "Country_United_States", "Country_UK", + "Product_Office", "Product_Excel", "Commercial_United_States_Office", "Commercial_United_States_Excel", + "Commercial_UK_Office", "Commercial_UK_Excel", "Consumer_United_States_Office", "Consumer_United_States_Excel", + "Consumer_UK_Office", "Consumer_UK_Excel"), + Date = as.Date(c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01")), + Target = c(152, 76, 76, 28, 124, 52, 100, 1, 13, 25, 37, 1, 13, 25, 37), + Value_Country = c(11, 11, 11, 1, 10, 11, 11, 1, 1, 10, 10, 1, 1, 10, 10), + Value_All = c(1016, 220, 796, 364, 652, 436, 580, 1, 37, 73, 109, 145, 181, 217, 253), + Value_Product = c(11, 11, 11, 11, 11, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10), + Value_Global = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), + Value_Segment_Product = c(1000, 300, 700, 1000, 1000, 400, 600, 100, 200, 100, 200, 300, 400, 300, 400) ) # Assertions expect_equal(result_data, expected_data) }) + +test_that("prep_hierarchical_data returns correct standard hierarchies for region and city", { + # Mock data setup + data <- tibble::tibble( + Area = c('EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'United States', 'United States', 'United States', 'United States'), + Country = c('Croatia', 'Croatia', 'Croatia', 'Croatia', 'Greece', 'Greece', 'Greece', 'Greece', 'United States', 'United States', 'United States', 'United States'), + Date = as.Date(c('2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01')), + Target = c(1, 2, 3, 4, 100, 101, 102, 103, 1000, 1001, 1002, 1003), + Value_All = c(10, 11, 12, 13, 46, 47, 48, 49, 82, 83, 84, 85), + Value_Global = c(50, 51, 52, 53, 50, 51, 52, 53, 50, 51, 52, 53), + Value_Area = c(20, 21, 22, 23, 20, 21, 22, 23, 70, 71, 72, 73) + ) %>% + tidyr::unite("Combo", + c("Area", "Country"), + sep = "--", + remove = F + ) + + # run prep hts function for standard hierarchy + result_data <- prep_hierarchical_data( + input_data = data, + run_info = set_run_info(), + combo_variables = c("Area", "Country"), + external_regressors = c("Value_All", "Value_Global", "Value_Area"), + forecast_approach = "standard_hierarchy", + frequency_number = 12 + ) %>% + dplyr::filter(Date == "2020-01-01") + + # Expected output setup for a standard hierarchical forecast + expected_data <- tibble::tibble( + Combo = c("Total", "A", "B", "EMEA_Croatia", "EMEA_Greece", "United_States_United_States"), + Date = as.Date(c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01")), + Target = c(1101, 101, 1000, 1, 100, 1000), + Value_All = c(138, 56, 82, 10, 46, 82), + Value_Global = c(50, 50, 50, 50, 50, 50), + Value_Area = c(90, 90, 90, 20, 20, 70) + ) + + # Assertions + expect_equal(result_data, expected_data) +}) From 379d0ba639a5c6f9f35dee40fc6b28db8d621843 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 3 Mar 2024 16:53:03 -0800 Subject: [PATCH 10/18] error update --- R/hierarchy.R | 2 +- tests/testthat/test-hierarchical.R | 76 +++++++++++++++++------------- 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/R/hierarchy.R b/R/hierarchy.R index 4f1f7ad3..da0ab062 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -169,7 +169,7 @@ prep_hierarchical_data <- function(input_data, dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% tidyr::pivot_wider( names_from = Combo, - values_from = regressor_var + values_from = tidyselect::all_of(regressor_var) ) %>% dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% base::suppressWarnings() diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R index 13dde456..3cb897bb 100644 --- a/tests/testthat/test-hierarchical.R +++ b/tests/testthat/test-hierarchical.R @@ -2,22 +2,30 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { # Mock data setup data <- tibble::tibble( - Segment = c('Commercial', 'Commercial', 'Commercial', 'Commercial', 'Commercial', 'Commercial', - 'Commercial', 'Commercial', 'Commercial', 'Commercial', 'Commercial', 'Commercial', - 'Consumer', 'Consumer', 'Consumer', 'Consumer', 'Consumer', 'Consumer', - 'Consumer', 'Consumer', 'Consumer', 'Consumer', 'Consumer', 'Consumer'), - Country = c('United (States)', 'United (States)', 'United (States)', 'United (States)', 'United (States)', 'United (States)', - 'UK', 'UK', 'UK', 'UK', 'UK', 'UK', - 'United (States)', 'United (States)', 'United (States)', 'United (States)', 'United (States)', 'United (States)', - 'UK', 'UK', 'UK', 'UK', 'UK', 'UK'), - Product = c('Office', 'Office', 'Office', 'Excel', 'Excel', 'Excel', - 'Office', 'Office', 'Office', 'Excel', 'Excel', 'Excel', - 'Office', 'Office', 'Office', 'Excel', 'Excel', 'Excel', - 'Office', 'Office', 'Office', 'Excel', 'Excel', 'Excel'), - Date = as.Date(c('1/1/2020', '2/1/2020', '3/1/2020', '1/1/2020', '2/1/2020', '3/1/2020', - '1/1/2020', '2/1/2020', '3/1/2020', '1/1/2020', '2/1/2020', '3/1/2020', - '1/1/2020', '2/1/2020', '3/1/2020', '1/1/2020', '2/1/2020', '3/1/2020', - '1/1/2020', '2/1/2020', '3/1/2020', '1/1/2020', '2/1/2020', '3/1/2020'), format="%m/%d/%Y"), + Segment = c( + "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", + "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", + "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", + "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer" + ), + Country = c( + "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", + "UK", "UK", "UK", "UK", "UK", "UK", + "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", + "UK", "UK", "UK", "UK", "UK", "UK" + ), + Product = c( + "Office", "Office", "Office", "Excel", "Excel", "Excel", + "Office", "Office", "Office", "Excel", "Excel", "Excel", + "Office", "Office", "Office", "Excel", "Excel", "Excel", + "Office", "Office", "Office", "Excel", "Excel", "Excel" + ), + Date = as.Date(c( + "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", + "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", + "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", + "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020" + ), format = "%m/%d/%Y"), Target = c(1, 2, 3, 13, 14, 15, 25, 26, 27, 37, 38, 39, 1, 2, 3, 13, 14, 15, 25, 26, 27, 37, 38, 39), Value_Country = c(1, 2, 3, 1, 2, 3, 10, 20, 30, 10, 20, 30, 1, 2, 3, 1, 2, 3, 10, 20, 30, 10, 20, 30), Value_All = c(1, 2, 3, 37, 38, 39, 73, 74, 75, 109, 110, 111, 145, 146, 147, 181, 182, 183, 217, 218, 219, 253, 254, 255), @@ -44,13 +52,17 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { # Expected output setup expected_data <- tibble::tibble( - Combo = c("Total", "Segment_Commercial", "Segment_Consumer", "Country_United_States", "Country_UK", - "Product_Office", "Product_Excel", "Commercial_United_States_Office", "Commercial_United_States_Excel", - "Commercial_UK_Office", "Commercial_UK_Excel", "Consumer_United_States_Office", "Consumer_United_States_Excel", - "Consumer_UK_Office", "Consumer_UK_Excel"), - Date = as.Date(c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", - "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", - "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01")), + Combo = c( + "Total", "Segment_Commercial", "Segment_Consumer", "Country_United_States", "Country_UK", + "Product_Office", "Product_Excel", "Commercial_United_States_Office", "Commercial_United_States_Excel", + "Commercial_UK_Office", "Commercial_UK_Excel", "Consumer_United_States_Office", "Consumer_United_States_Excel", + "Consumer_UK_Office", "Consumer_UK_Excel" + ), + Date = as.Date(c( + "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01" + )), Target = c(152, 76, 76, 28, 124, 52, 100, 1, 13, 25, 37, 1, 13, 25, 37), Value_Country = c(11, 11, 11, 1, 10, 11, 11, 1, 1, 10, 10, 1, 1, 10, 10), Value_All = c(1016, 220, 796, 364, 652, 436, 580, 1, 37, 73, 109, 145, 181, 217, 253), @@ -66,20 +78,20 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { test_that("prep_hierarchical_data returns correct standard hierarchies for region and city", { # Mock data setup data <- tibble::tibble( - Area = c('EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'EMEA', 'United States', 'United States', 'United States', 'United States'), - Country = c('Croatia', 'Croatia', 'Croatia', 'Croatia', 'Greece', 'Greece', 'Greece', 'Greece', 'United States', 'United States', 'United States', 'United States'), - Date = as.Date(c('2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01')), + Area = c("EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "United States", "United States", "United States", "United States"), + Country = c("Croatia", "Croatia", "Croatia", "Croatia", "Greece", "Greece", "Greece", "Greece", "United States", "United States", "United States", "United States"), + Date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01")), Target = c(1, 2, 3, 4, 100, 101, 102, 103, 1000, 1001, 1002, 1003), Value_All = c(10, 11, 12, 13, 46, 47, 48, 49, 82, 83, 84, 85), Value_Global = c(50, 51, 52, 53, 50, 51, 52, 53, 50, 51, 52, 53), Value_Area = c(20, 21, 22, 23, 20, 21, 22, 23, 70, 71, 72, 73) ) %>% tidyr::unite("Combo", - c("Area", "Country"), - sep = "--", - remove = F + c("Area", "Country"), + sep = "--", + remove = F ) - + # run prep hts function for standard hierarchy result_data <- prep_hierarchical_data( input_data = data, @@ -90,7 +102,7 @@ test_that("prep_hierarchical_data returns correct standard hierarchies for regio frequency_number = 12 ) %>% dplyr::filter(Date == "2020-01-01") - + # Expected output setup for a standard hierarchical forecast expected_data <- tibble::tibble( Combo = c("Total", "A", "B", "EMEA_Croatia", "EMEA_Greece", "United_States_United_States"), @@ -100,7 +112,7 @@ test_that("prep_hierarchical_data returns correct standard hierarchies for regio Value_Global = c(50, 50, 50, 50, 50, 50), Value_Area = c(90, 90, 90, 20, 20, 70) ) - + # Assertions expect_equal(result_data, expected_data) }) From d21c6bb05ae7b5ef9cd3ee85feef10d27b28d0f4 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 3 Mar 2024 17:24:12 -0800 Subject: [PATCH 11/18] error fix --- R/hierarchy.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/hierarchy.R b/R/hierarchy.R index da0ab062..9d1ec34b 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -165,11 +165,12 @@ prep_hierarchical_data <- function(input_data, by = c("Combo", "Date") ) } else if (value_level == "All") { + bottom_level_temp_tbl <- input_data_adj %>% dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% tidyr::pivot_wider( names_from = Combo, - values_from = tidyselect::all_of(regressor_var) + values_from = as.character(regressor_var) ) %>% dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% base::suppressWarnings() From 84c83e63903db50d6eb97faa5cb055438b10c8f3 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 3 Mar 2024 18:04:30 -0800 Subject: [PATCH 12/18] error fix --- R/hierarchy.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/hierarchy.R b/R/hierarchy.R index 9d1ec34b..0376eeb4 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -168,9 +168,10 @@ prep_hierarchical_data <- function(input_data, bottom_level_temp_tbl <- input_data_adj %>% dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% + dplyr::rename(Value = tidyselect::all_of(regressor_var)) %>% tidyr::pivot_wider( names_from = Combo, - values_from = as.character(regressor_var) + values_from = Value ) %>% dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% base::suppressWarnings() From c2ce65b0863798577af1cc0bc6776fc5d071830e Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 3 Mar 2024 18:20:10 -0800 Subject: [PATCH 13/18] error fix --- tests/testthat/test-hierarchical.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R index 3cb897bb..64252820 100644 --- a/tests/testthat/test-hierarchical.R +++ b/tests/testthat/test-hierarchical.R @@ -2,24 +2,24 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { # Mock data setup data <- tibble::tibble( - Segment = c( + Segment = as.character(c( "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Commercial", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer", "Consumer" - ), - Country = c( + )), + Country = as.character(c( "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", "UK", "UK", "UK", "UK", "UK", "UK", "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", "United (States)", "UK", "UK", "UK", "UK", "UK", "UK" - ), - Product = c( + )), + Product = as.character(c( "Office", "Office", "Office", "Excel", "Excel", "Excel", "Office", "Office", "Office", "Excel", "Excel", "Excel", "Office", "Office", "Office", "Excel", "Excel", "Excel", "Office", "Office", "Office", "Excel", "Excel", "Excel" - ), + )), Date = as.Date(c( "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", "1/1/2020", "2/1/2020", "3/1/2020", @@ -52,12 +52,12 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { # Expected output setup expected_data <- tibble::tibble( - Combo = c( + Combo = as.character(c( "Total", "Segment_Commercial", "Segment_Consumer", "Country_United_States", "Country_UK", "Product_Office", "Product_Excel", "Commercial_United_States_Office", "Commercial_United_States_Excel", "Commercial_UK_Office", "Commercial_UK_Excel", "Consumer_United_States_Office", "Consumer_United_States_Excel", "Consumer_UK_Office", "Consumer_UK_Excel" - ), + )), Date = as.Date(c( "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", @@ -75,11 +75,11 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { expect_equal(result_data, expected_data) }) -test_that("prep_hierarchical_data returns correct standard hierarchies for region and city", { +test_that("prep_hierarchical_data returns correct standard hierarchies", { # Mock data setup data <- tibble::tibble( - Area = c("EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "United States", "United States", "United States", "United States"), - Country = c("Croatia", "Croatia", "Croatia", "Croatia", "Greece", "Greece", "Greece", "Greece", "United States", "United States", "United States", "United States"), + Area = as.character(c("EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "United States", "United States", "United States", "United States")), + Country = as.character(c("Croatia", "Croatia", "Croatia", "Croatia", "Greece", "Greece", "Greece", "Greece", "United States", "United States", "United States", "United States")), Date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01")), Target = c(1, 2, 3, 4, 100, 101, 102, 103, 1000, 1001, 1002, 1003), Value_All = c(10, 11, 12, 13, 46, 47, 48, 49, 82, 83, 84, 85), @@ -105,7 +105,7 @@ test_that("prep_hierarchical_data returns correct standard hierarchies for regio # Expected output setup for a standard hierarchical forecast expected_data <- tibble::tibble( - Combo = c("Total", "A", "B", "EMEA_Croatia", "EMEA_Greece", "United_States_United_States"), + Combo = as.character(c("Total", "A", "B", "EMEA_Croatia", "EMEA_Greece", "United_States_United_States")), Date = as.Date(c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-01")), Target = c(1101, 101, 1000, 1, 100, 1000), Value_All = c(138, 56, 82, 10, 46, 82), From df46feadfcb1107578245ada575dd4ff2ce4899b Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 3 Mar 2024 18:36:51 -0800 Subject: [PATCH 14/18] error testing --- R/hierarchy.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/hierarchy.R b/R/hierarchy.R index 0376eeb4..89d98ce5 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -169,6 +169,7 @@ prep_hierarchical_data <- function(input_data, bottom_level_temp_tbl <- input_data_adj %>% dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% dplyr::rename(Value = tidyselect::all_of(regressor_var)) %>% + dplyr::mutate(Value = as.numeric(Value)) %>% tidyr::pivot_wider( names_from = Combo, values_from = Value From 43efda820344be0bf74ee120b4298db78658f548 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Sun, 3 Mar 2024 18:55:26 -0800 Subject: [PATCH 15/18] debug --- R/hierarchy.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/hierarchy.R b/R/hierarchy.R index 89d98ce5..87686c33 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -169,7 +169,7 @@ prep_hierarchical_data <- function(input_data, bottom_level_temp_tbl <- input_data_adj %>% dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% dplyr::rename(Value = tidyselect::all_of(regressor_var)) %>% - dplyr::mutate(Value = as.numeric(Value)) %>% + dplyr::mutate(Combo = as.character(Combo)) %>% tidyr::pivot_wider( names_from = Combo, values_from = Value From e36bdc236b5a4f3c3c815cc2d59b8a159cc0d720 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 4 Mar 2024 16:43:19 -0800 Subject: [PATCH 16/18] bug fixes --- R/hierarchy.R | 4 +--- tests/testthat/test-hierarchical.R | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/R/hierarchy.R b/R/hierarchy.R index 87686c33..dbf2523d 100644 --- a/R/hierarchy.R +++ b/R/hierarchy.R @@ -168,11 +168,9 @@ prep_hierarchical_data <- function(input_data, bottom_level_temp_tbl <- input_data_adj %>% dplyr::select(Combo, Date, tidyselect::all_of(regressor_var)) %>% - dplyr::rename(Value = tidyselect::all_of(regressor_var)) %>% - dplyr::mutate(Combo = as.character(Combo)) %>% tidyr::pivot_wider( names_from = Combo, - values_from = Value + values_from = tidyselect::all_of(regressor_var) ) %>% dplyr::mutate_if(is.numeric, list(~ replace(., is.na(.), 0))) %>% base::suppressWarnings() diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R index 64252820..9bb18cc6 100644 --- a/tests/testthat/test-hierarchical.R +++ b/tests/testthat/test-hierarchical.R @@ -1,5 +1,8 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { + + skip_if(getRversion() == "3.6.0", "Skipping for R 3.6.0") + # Mock data setup data <- tibble::tibble( Segment = as.character(c( @@ -76,6 +79,9 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { }) test_that("prep_hierarchical_data returns correct standard hierarchies", { + + skip_if(getRversion() == "3.6.0", "Skipping for R 3.6.0") + # Mock data setup data <- tibble::tibble( Area = as.character(c("EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "EMEA", "United States", "United States", "United States", "United States")), From 6d500de9fe971ec0ac4b5e19589a740ca3e4cf1d Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 4 Mar 2024 16:54:08 -0800 Subject: [PATCH 17/18] update tests --- tests/testthat/test-hierarchical.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R index 9bb18cc6..1b6ada93 100644 --- a/tests/testthat/test-hierarchical.R +++ b/tests/testthat/test-hierarchical.R @@ -1,7 +1,7 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { - skip_if(getRversion() == "3.6.0", "Skipping for R 3.6.0") + skip_if(getRversion() < "3.7.0", "Skipping for R 3.6.0 and below") # Mock data setup data <- tibble::tibble( @@ -80,7 +80,7 @@ test_that("prep_hierarchical_data returns correct grouped hierarchies", { test_that("prep_hierarchical_data returns correct standard hierarchies", { - skip_if(getRversion() == "3.6.0", "Skipping for R 3.6.0") + skip_if(getRversion() < "3.7.0", "Skipping for R 3.6.0 and below") # Mock data setup data <- tibble::tibble( From 840dd10d69ff11daff30f98b566fee8fba141330 Mon Sep 17 00:00:00 2001 From: Mike Tokic Date: Mon, 4 Mar 2024 17:05:03 -0800 Subject: [PATCH 18/18] update global var --- R/utility.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utility.R b/R/utility.R index 75b8fdba..384bb420 100644 --- a/R/utility.R +++ b/R/utility.R @@ -13,7 +13,8 @@ utils::globalVariables(c( "x", "num_cores", "run_info", "negative_forecast", "Forecast_Adj", "Final_Col", "lag_val", "libs", ".config", "Forecast_Tbl", "Model_Workflow", "id", "model_run", "Auto_Accept", "Feature", "Imp", "Importance", "LOFO_Var", "Var_RMSE", "Vote", "Votes", "desc", - "term", "Column", "Box_Cox_Lambda", "get_recipie_configurable" + "term", "Column", "Box_Cox_Lambda", "get_recipie_configurable", "Agg", "Unique", "Var", + "Var_Combo", "regressor", "regressor_tbl", "value_level_iter" )) #' @importFrom magrittr %>%