
Commit

remove multicolinearity filtering
mitokic committed Aug 18, 2023
1 parent 30514f4 commit aa919db
Showing 4 changed files with 35 additions and 49 deletions.
70 changes: 29 additions & 41 deletions R/feature_selection.R
@@ -6,6 +6,7 @@
#' @param parallel_processing parallel processing
#' @param date_type date_type
#' @param fast turns off lofo
#' @param seed seed
#'
#' @return list of best features to use
#' @noRd
@@ -14,18 +15,15 @@ select_features <- function(input_data,
train_test_data,
parallel_processing = NULL,
date_type,
fast = FALSE) {

# list correlated features
non_cor_cols <- multicolinearity_fn(input_data)
fast = FALSE,
seed = 123) {

# only keep historical data and drop correlated features
# only keep historical data
input_data <- input_data %>%
tidyr::drop_na(Target) #%>%
#dplyr::select(Combo, Target, tidyselect::all_of(non_cor_cols))
tidyr::drop_na(Target)

# skip lofo if there are too many features
if (ncol(input_data) > 250) {
print("skipping lofo")
fast <- TRUE
}

@@ -57,10 +55,11 @@

# run leave one feature out selection
lofo_results <- lofo_fn(
run_info,
input_data,
train_test_data,
parallel_processing
run_info = run_info,
data = input_data,
train_test_splits = train_test_data,
parallel_processing = parallel_processing,
seed = seed
) %>%
dplyr::filter(Imp >= 0) %>%
dplyr::rename(Feature = LOFO_Var) %>%
@@ -89,14 +88,16 @@

# boruta feature selection
boruta_results <- tibble::tibble(
Feature = boruta_fn(input_data),
Feature = boruta_fn(input_data,
seed),
Vote = 1,
Auto_Accept = 0
)
}

# random forest feature importance
vip_rf_results <- vip_rf_fn(input_data) %>%
vip_rf_results <- vip_rf_fn(input_data,
seed) %>%
dplyr::rename(Feature = Variable) %>%
dplyr::mutate(
Vote = 1,
@@ -105,7 +106,8 @@
dplyr::select(Feature, Vote, Auto_Accept)

# cubist feature importance
vip_cubist_results <- vip_cubist_fn(input_data) %>%
vip_cubist_results <- vip_cubist_fn(input_data,
seed) %>%
dplyr::rename(Feature = Variable) %>%
dplyr::mutate(
Vote = 1,
@@ -114,7 +116,8 @@
dplyr::select(Feature, Vote, Auto_Accept)

# lasso regression feature importance
vip_lm_initial <- vip_lm_fn(input_data)
vip_lm_initial <- vip_lm_fn(input_data,
seed)

missing_cols <- setdiff(
colnames(input_data %>%
@@ -165,24 +168,6 @@
return(fs_list)
}

#' Multicolinearity Filter
#'
#' @param data data
#'
#' @return list of features that are not correlated with one another
#' @noRd
multicolinearity_fn <- function(data) {
recipes::recipe(
Target ~ .,
data = data
) %>%
recipes::step_zv(recipes::all_predictors()) %>%
recipes::step_corr(recipes::all_numeric_predictors(), threshold = .9) %>%
recipes::prep(training = data) %>%
recipes::bake(data) %>%
colnames()
}

#' Target Correlation Filter
#'
#' @param data data
@@ -304,11 +289,14 @@ vip_cubist_fn <- function(data,
#'
#' @param data data
#' @param iterations iterations
#' @param seed seed
#'
#' @return list of most important features in boruta selection process
#' @noRd
boruta_fn <- function(data,
iterations = 100) {
iterations = 100,
seed = 123) {
set.seed(seed)
Boruta::Boruta(Target ~ ., data = data, maxRuns = iterations) %>%
Boruta::getSelectedAttributes()
}
@@ -327,7 +315,8 @@ lofo_fn <- function(run_info,
data,
train_test_splits,
parallel_processing,
pca = FALSE) {
pca = FALSE,
seed = 123) {

# parallel run info
par_info <- par_start(
@@ -448,7 +437,7 @@
Date <= test_end
)

set.seed(123)
set.seed(seed)

xgb_model_fit <- wflw_spec_tune_xgboost %>%
generics::fit(train_data)
@@ -464,7 +453,7 @@
) %>%
dplyr::select(Target, Forecast, Train_Test_ID, LOFO_Var)

set.seed(123)
set.seed(seed)

lr_model_fit <- wflw_spec_glmnet %>%
generics::fit(train_data)
@@ -480,7 +469,7 @@
) %>%
dplyr::select(Target, Forecast, Train_Test_ID, LOFO_Var)

set.seed(123)
set.seed(seed)

cubist_model_fit <- wflw_spec_cubist %>%
generics::fit(train_data)
@@ -517,8 +506,7 @@
dplyr::rename(Var_RMSE = RMSE) %>%
dplyr::rowwise() %>%
dplyr::mutate(
Imp = Var_RMSE - baseline_rmse,
Imp_Norm = max(c(1 - (baseline_rmse / Var_RMSE), 0))
Imp = Var_RMSE - baseline_rmse
) %>%
dplyr::ungroup()

2 changes: 0 additions & 2 deletions R/train_models.R
@@ -278,7 +278,6 @@ train_models <- function(run_info,

if (feature_selection) {
# ensure feature selection objects get exported
multicolinearity_fn <- multicolinearity_fn
lofo_fn <- lofo_fn
target_corr_fn <- target_corr_fn
vip_rf_fn <- vip_rf_fn
@@ -332,7 +331,6 @@

fs_list <- append(fs_list, list(R2 = R2_fs_list))
}
print(fs_list)
}

# train each model
2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@

The Microsoft Finance Time Series Forecasting Framework, aka finnts or Finn, is an automated forecasting framework for producing financial forecasts. While it was built for corporate finance activities, it can easily expand to any time series forecasting problem!

- Automated feature engineering, back testing, and model selection.
- Automated feature engineering, feature selection, back testing, and model selection.
- Access to 25+ models. Both univariate and multivariate models.
- Azure integration to run thousands of time series in parallel within the cloud.
- Supports daily, weekly, monthly, quarterly, and yearly forecasts.
10 changes: 5 additions & 5 deletions vignettes/feature-selection.Rmd
@@ -18,17 +18,17 @@ Finn leverages multiple techniques of feature selection to ensure only the best

## Feature Selection Techniques

Below are the techniques used in the feature selection process. It's important to note that before running this process, multicolinearity is removed from each feature engineering recipe by removing features that are over 0.9 correlated with another feature.
Below are the techniques used in the feature selection process.

### Target Correlation

Removes features that are not correlated with the target variable. For daily and weekly data, a correlation threshold of 0.2 is applied. For all other date types, a threshold of 0.5 is applied.
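
A minimal sketch of what a filter like this could look like, assuming a data frame with a numeric `Target` column; the function name and exact call below are illustrative and not the package's internal `target_corr_fn`:

``` r
# Illustrative only: keep numeric predictors whose absolute correlation with
# Target meets a date-type specific threshold (0.2 for daily/weekly, 0.5 otherwise).
target_corr_sketch <- function(data, threshold = 0.5) {
  numeric_cols <- setdiff(
    names(data)[vapply(data, is.numeric, logical(1))],
    "Target"
  )

  cors <- vapply(
    numeric_cols,
    function(col) abs(stats::cor(data[[col]], data$Target, use = "complete.obs")),
    numeric(1)
  )

  names(cors)[cors >= threshold]
}
```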

### Leave One Feature Out
### Leave One Feature Out (lofo)

This is a more complex process where various models (cubist, glmnet, xgboost) are trained on the validation splits of the data. Each round, one feature is held out of the data, and the change in prediction accuracy over the hold out validation data is calculated. If the accuracy gets worse by removing the feature, it gets flagged as an important feature. This is not a recursive feature elimination process, instead only one feature is ever held out at any point in time.
This is a more complex process where various models (cubist, glmnet, xgboost) are trained on the validation splits of the data. Each round, one feature is held out of the data, and the change in prediction accuracy (RMSE) over the hold-out validation data is calculated. If the accuracy gets worse by removing the feature, it gets flagged as an important feature. This is not a recursive feature elimination process; instead, only one feature is ever held out at any point in time.

This technique is used for local models for yearly, quarterly, and monthly data. It's turned off for global models or daily or weekly data since it would take too long to run properly.
This technique is used for yearly, quarterly, and monthly data. It's turned off for daily or weekly data since it would take too long to run properly. If a feature engineering recipe contains more than 250 features, lofo is also turned off to keep runtime low.
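
As a rough illustration of the idea (not the package's `lofo_fn` shown in the diff above), the importance score mirrors `Imp = Var_RMSE - baseline_rmse`: retrain with each single feature dropped and compare hold-out RMSE against the full-feature baseline. The `fit_fn` and `predict_fn` arguments below are hypothetical placeholders for any model fit/predict pair:

``` r
# Hypothetical sketch of leave-one-feature-out scoring.
lofo_sketch <- function(train_data, test_data, fit_fn, predict_fn) {
  rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))

  # baseline accuracy with all features included
  baseline_fit <- fit_fn(train_data)
  baseline_rmse <- rmse(test_data$Target, predict_fn(baseline_fit, test_data))

  features <- setdiff(names(train_data), c("Combo", "Date", "Target"))

  vapply(features, function(feat) {
    keep <- setdiff(names(train_data), feat)
    fit <- fit_fn(train_data[, keep])
    var_rmse <- rmse(test_data$Target, predict_fn(fit, test_data[, keep]))
    var_rmse - baseline_rmse # positive: accuracy got worse without the feature
  }, numeric(1))
}
```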

### Boruta

@@ -44,4 +44,4 @@ Multiple models (cubist, glmnet, ranger) are trained on the entire training data

Since we use multiple techniques for feature selection, we need to determine how to use this information to select the final features. This is where the voting process comes in. If a feature gets flagged by one of the above techniques, it gets a vote. If a feature receives enough votes, it is kept and ultimately used when training individual models.

Daily and weekly data have a voting threshold of 3, meaning a feature needs to get at least 3 votes from 3 separate feature selection techniques in order to be kept. Yearly, quarterly, or monthly data have a voting threshold of 4. Each feature needs to get a majority of the votes in order to be kept. This process can reduce up to 50%-90% of features. The final result is keeping all the features that contain the "signal" while discarding all other features that just contain "noise".
Daily and weekly data have a voting threshold of 3, meaning a feature needs to get at least 3 votes from 3 separate feature selection techniques in order to be kept. Yearly, quarterly, or monthly data have a voting threshold of 4 (3 if lofo isn't run). Each feature needs to get a majority of the votes in order to be kept. This process can remove 50%-95% of features. The final result is keeping all the features that contain the "signal" while discarding all other features that just contain "noise".
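
A simplified sketch of the tallying step under these assumptions: `votes_tbl` holds one row per feature per technique with a `Vote` column of 1 (as built up in R/feature_selection.R above), the `date_type` values "day" and "week" are assumed labels, and the `Auto_Accept` override is omitted for brevity:

``` r
library(dplyr)

# Illustrative vote tally: keep features that reach the voting threshold
# described in the text (3 for daily/weekly, otherwise 4, or 3 if lofo was skipped).
tally_votes_sketch <- function(votes_tbl, date_type, lofo_ran = TRUE) {
  threshold <- if (date_type %in% c("day", "week")) {
    3
  } else if (lofo_ran) {
    4
  } else {
    3
  }

  votes_tbl %>%
    group_by(Feature) %>%
    summarise(Votes = sum(Vote), .groups = "drop") %>%
    filter(Votes >= threshold) %>%
    pull(Feature)
}
```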
