diff --git a/R/feature_selection.R b/R/feature_selection.R
index f416e14a..1769d0c4 100644
--- a/R/feature_selection.R
+++ b/R/feature_selection.R
@@ -6,6 +6,7 @@
 #' @param parallel_processing parallel processing
 #' @param date_type date_type
 #' @param fast turns off lofo
+#' @param seed seed
 #'
 #' @return list of best features to use
 #' @noRd
@@ -14,18 +15,15 @@ select_features <- function(input_data,
                             train_test_data,
                             parallel_processing = NULL,
                             date_type,
-                            fast = FALSE) {
-
-  # list correlated features
-  non_cor_cols <- multicolinearity_fn(input_data)
+                            fast = FALSE,
+                            seed = 123) {
 
-  # only keep historical data and drop correlated features
+  # only keep historical data
   input_data <- input_data %>%
-    tidyr::drop_na(Target) #%>%
-    #dplyr::select(Combo, Target, tidyselect::all_of(non_cor_cols))
+    tidyr::drop_na(Target)
 
+  # skip lofo if there are too many features
   if(ncol(input_data) > 250) {
-    print("skipping lofo")
     fast = TRUE
   }
 
@@ -57,10 +55,11 @@ select_features <- function(input_data,
 
     # run leave one feature out selection
     lofo_results <- lofo_fn(
-      run_info,
-      input_data,
-      train_test_data,
-      parallel_processing
+      run_info = run_info,
+      data = input_data,
+      train_test_splits = train_test_data,
+      parallel_processing = parallel_processing,
+      seed = seed
     ) %>%
       dplyr::filter(Imp >= 0) %>%
       dplyr::rename(Feature = LOFO_Var) %>%
@@ -89,14 +88,16 @@ select_features <- function(input_data,
 
     # botuta feature selection
     boruta_results <- tibble::tibble(
-      Feature = boruta_fn(input_data),
+      Feature = boruta_fn(input_data,
+                          seed),
       Vote = 1,
       Auto_Accept = 0
     )
   }
 
   # random forest feature importance
-  vip_rf_results <- vip_rf_fn(input_data) %>%
+  vip_rf_results <- vip_rf_fn(input_data,
+                              seed) %>%
     dplyr::rename(Feature = Variable) %>%
     dplyr::mutate(
       Vote = 1,
@@ -105,7 +106,8 @@ select_features <- function(input_data,
     dplyr::select(Feature, Vote, Auto_Accept)
 
   # cubist feature importance
-  vip_cubist_results <- vip_cubist_fn(input_data) %>%
+  vip_cubist_results <- vip_cubist_fn(input_data,
+                                      seed) %>%
     dplyr::rename(Feature = Variable) %>%
     dplyr::mutate(
       Vote = 1,
@@ -114,7 +116,8 @@ select_features <- function(input_data,
     dplyr::select(Feature, Vote, Auto_Accept)
 
   # lasso regression feature importance
-  vip_lm_initial <- vip_lm_fn(input_data)
+  vip_lm_initial <- vip_lm_fn(input_data,
+                              seed)
 
   missing_cols <- setdiff(
     colnames(input_data %>%
@@ -165,24 +168,6 @@ select_features <- function(input_data,
   return(fs_list)
 }
 
-#' Multicolinearity Filter
-#'
-#' @param data data
-#'
-#' @return list of features that are not correlated with one another
-#' @noRd
-multicolinearity_fn <- function(data) {
-  recipes::recipe(
-    Target ~ .,
-    data = data
-  ) %>%
-    recipes::step_zv(recipes::all_predictors()) %>%
-    recipes::step_corr(recipes::all_numeric_predictors(), threshold = .9) %>%
-    recipes::prep(training = data) %>%
-    recipes::bake(data) %>%
-    colnames()
-}
-
 #' Target Correlation Filter
 #'
 #' @param data data
@@ -304,11 +289,14 @@ vip_cubist_fn <- function(data,
 #'
 #' @param data data
 #' @param iterations iterations
+#' @param seed seed
 #'
 #' @return list of most important features in boruta selection process
 #' @noRd
 boruta_fn <- function(data,
-                      iterations = 100) {
+                      iterations = 100,
+                      seed = 123) {
+  set.seed(seed)
   Boruta::Boruta(Target ~ ., data = data, maxRuns = iterations) %>%
     Boruta::getSelectedAttributes()
 }
@@ -327,7 +315,8 @@ lofo_fn <- function(run_info,
                     data,
                     train_test_splits,
                     parallel_processing,
-                    pca = FALSE) {
+                    pca = FALSE,
+                    seed = 123) {
 
   # parallel run info
   par_info <- par_start(
@@ -448,7 +437,7 @@ lofo_fn <- function(run_info,
        Date <= test_end
      )
 
-    set.seed(123)
+    set.seed(seed)
 
    xgb_model_fit <- wflw_spec_tune_xgboost %>%
      generics::fit(train_data)
@@ -464,7 +453,7 @@ lofo_fn <- function(run_info,
      ) %>%
      dplyr::select(Target, Forecast, Train_Test_ID, LOFO_Var)
 
-    set.seed(123)
+    set.seed(seed)
 
    lr_model_fit <- wflw_spec_glmnet %>%
      generics::fit(train_data)
@@ -480,7 +469,7 @@ lofo_fn <- function(run_info,
      ) %>%
      dplyr::select(Target, Forecast, Train_Test_ID, LOFO_Var)
 
-    set.seed(123)
+    set.seed(seed)
 
    cubist_model_fit <- wflw_spec_cubist %>%
      generics::fit(train_data)
@@ -517,8 +506,7 @@ lofo_fn <- function(run_info,
     dplyr::rename(Var_RMSE = RMSE) %>%
     dplyr::rowwise() %>%
     dplyr::mutate(
-      Imp = Var_RMSE - baseline_rmse,
-      Imp_Norm = max(c(1 - (baseline_rmse / Var_RMSE), 0))
+      Imp = Var_RMSE - baseline_rmse
     ) %>%
     dplyr::ungroup()
 
diff --git a/R/train_models.R b/R/train_models.R
index 282f76ed..d14ad125 100644
--- a/R/train_models.R
+++ b/R/train_models.R
@@ -278,7 +278,6 @@ train_models <- function(run_info,
 
     if (feature_selection) {
       # ensure feature selection objects get exported
-      multicolinearity_fn <- multicolinearity_fn
       lofo_fn <- lofo_fn
       target_corr_fn <- target_corr_fn
       vip_rf_fn <- vip_rf_fn
@@ -332,7 +331,6 @@ train_models <- function(run_info,
         fs_list <- append(fs_list, list(R2 = R2_fs_list))
       }
 
-      print(fs_list)
     }
 
     # train each model
diff --git a/README.md b/README.md
index 23a90c3c..89f31601 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
 
 The Microsoft Finance Time Series Forecasting Framework, aka finnts or Finn, is an automated forecasting framework for producing financial forecasts. While it was built for corporate finance activities, it can easily expand to any time series forecasting problem!
 
-- Automated feature engineering, back testing, and model selection.
+- Automated feature engineering, feature selection, back testing, and model selection.
 - Access to 25+ models. Both univariate and multivariate models.
 - Azure integration to run thousands of time series in parallel within the cloud.
 - Supports daily, weekly, monthly, quarterly, and yearly forecasts.
diff --git a/vignettes/feature-selection.Rmd b/vignettes/feature-selection.Rmd
index 6087c6d9..453cc4ad 100644
--- a/vignettes/feature-selection.Rmd
+++ b/vignettes/feature-selection.Rmd
@@ -18,17 +18,17 @@ Finn leverages multiple techniques of feature selection to ensure only the best
 
 ## Feature Selection Techniques
 
-Below are the techniques used in the feature selection process. It's important to note that before running this process, multicolinearity is removed from each feature engineering recipe by removing features that are over 0.9 correlated with another feature.
+Below are the techniques used in the feature selection process.
 
 ### Target Correlation
 
 Removes features that are correlated with the target variable. For daily and weekly data, a correlation filter of 0.2 is applied. For all other date types, a correlation of 0.5 is applied.
 
-### Leave One Feature Out
+### Leave One Feature Out (lofo)
 
-This is a more complex process where various models (cubist, glmnet, xgboost) are trained on the validation splits of the data. Each round, one feature is held out of the data, and the change in prediction accuracy over the hold out validation data is calculated. If the accuracy gets worse by removing the feature, it gets flagged as an important feature. This is not a recursive feature elimination process, instead only one feature is ever held out at any point in time.
+This is a more complex process where various models (cubist, glmnet, xgboost) are trained on the validation splits of the data. Each round, one feature is held out of the data, and the change in prediction accuracy (RMSE) over the held-out validation data is calculated. If the accuracy gets worse by removing the feature, it gets flagged as an important feature. This is not a recursive feature elimination process; instead, only one feature is ever held out at any point in time.
 
-This technique is used for local models for yearly, quarterly, and monthly data. It's turned off for global models or daily or weekly data since it would take too long to run properly.
+This technique is used for yearly, quarterly, and monthly data. It's turned off for daily and weekly data since it would take too long to run properly. If a feature engineering recipe contains more than 250 features, lofo is also turned off to keep runtime low.
 
 ### Boruta
 
@@ -44,4 +44,4 @@ Multiple models (cubist, glmnet, ranger) are trained on the entire training data
 
 Since we use multiple techniques for feature selection, we need to determine how we will use this information to select the final features. This is where the voting process comes in. If a feature gets flagged in one of the above techniques successfully, it gets a vote. If a feature receives enough votes, it is kept and ultimately used when training individual models.
 
-Daily and weekly data have a voting threshold of 3, meaning a feature needs to get at least 3 votes from 3 separate feature selection techniques in order to be kept. Yearly, quarterly, or monthly data have a voting threshold of 4. Each feature needs to get a majority of the votes in order to be kept. This process can reduce up to 50%-90% of features. The final result is keeping all the features that contain the "signal" while discarding all other features that just contain "noise".
+Daily and weekly data have a voting threshold of 3, meaning a feature needs to get at least 3 votes from 3 separate feature selection techniques in order to be kept. Yearly, quarterly, or monthly data have a voting threshold of 4 (3 if lofo isn't run). Each feature needs to get a majority of the votes in order to be kept. This process can remove 50%-95% of the features. The final result is keeping all the features that contain the "signal" while discarding all other features that just contain "noise".
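For reference, a minimal sketch of the voting step described in the vignette above, assuming each technique returns a Feature / Vote / Auto_Accept tibble like the ones built in `select_features()`. The object names (`target_corr_results`, `vip_lm_results`, `run_lofo`), the `date_type` values, and treating `Auto_Accept` as a bypass are illustrative assumptions, not the package's actual implementation.

```r
# Hypothetical sketch of the vote-counting logic; not finnts source code.
# Assumes each technique produced a tibble with Feature, Vote, Auto_Accept columns.
library(dplyr)

votes_tbl <- dplyr::bind_rows(
  target_corr_results, # assumed names for the per-technique results
  boruta_results,
  vip_rf_results,
  vip_cubist_results,
  vip_lm_results,
  lofo_results
)

# daily/weekly data need 3 votes; yearly/quarterly/monthly need 4 (3 when lofo is skipped)
votes_needed <- if (date_type %in% c("day", "week")) {
  3
} else if (run_lofo) {
  4
} else {
  3
}

kept_features <- votes_tbl %>%
  dplyr::group_by(Feature) %>%
  dplyr::summarise(
    Votes = sum(Vote),
    Auto_Accept = max(Auto_Accept)
  ) %>%
  dplyr::filter(Votes >= votes_needed | Auto_Accept == 1) %>% # Auto_Accept bypass is an assumption
  dplyr::pull(Feature)
```

Because each technique contributes at most one vote per feature, no single selection method can dominate the final feature set on its own.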