Tackle comments
achamma723 committed Jul 19, 2024
1 parent fad5cca commit 60e9955
Showing 4 changed files with 76 additions and 39 deletions.
9 changes: 1 addition & 8 deletions doc_conf/references.bib
@@ -75,11 +75,4 @@ @article{miPermutationbasedIdentificationImportant2021
copyright = {2021 The Author(s)},
langid = {english},
keywords = {Cancer,Data mining,Machine learning,Statistical methods},
annotation = {Bandiera\_abtest: a\\
Cc\_license\_type: cc\_by\\
Cg\_type: Nature Research Journals\\
Primary\_atype: Research\\
Subject\_term: Cancer;Data mining;Machine learning;Statistical methods\\
Subject\_term\_id: cancer;data-mining;machine-learning;statistical-methods},
file = {/home/ahmad/Zotero/storage/AL4ZN6J6/Mi et al. - 2021 - Permutation-based identification of important biom.pdf;/home/ahmad/Zotero/storage/MXTSY7AF/s41467-021-22756-2.html}
}
}
58 changes: 37 additions & 21 deletions examples/plot_diabetes_variable_importance_example.py
@@ -15,7 +15,7 @@
degree of correlation between the variables, thus biased towards truly
non-significant variables that are highly correlated with the significant ones,
creating falsely significant variables. They introduced a solution for the Random
Forest estimator based on the conditional sampling by performing sub-groups
Forest estimator based on conditional sampling by performing sub-groups
permutation when bisecting the space using the conditioning variables of the
building process. However, this solution is exclusive to the Random Forest and is
costly in high-dimensional settings.
@@ -51,6 +51,7 @@
from sklearn.datasets import load_diabetes

from hidimstat.BBI import BlockBasedImportance
from hidimstat import compute_loco

plt.rcParams.update({"font.size": 14})

@@ -71,10 +72,10 @@
# To apply the standard permutation, we use the implementation introduced by (Mi
# et al., Nature, 2021) where the significance is measured by the mean of
# -log10(p_value). For this example, the inference estimator is set to the
# Random Forest.
# Random Forest learner.
#

bbi_perm = BlockBasedImportance(
bbi_permutation = BlockBasedImportance(
estimator="RF",
importance_estimator="residuals_RF",
do_hypertuning=True,
@@ -84,24 +85,24 @@
problem_type="regression",
k_fold=k_fold,
variables_categories=variables_categories,
n_jobs=10,
n_jobs=2,
verbose=0,
n_permutations=100,
)
bbi_perm.fit(X, y)
bbi_permutation.fit(X, y)
print("Computing the importance scores with standard permutation")
results_perm = bbi_perm.compute_importance()
pvals_perm = -np.log10(results_perm["pval"] + 1e-10)
results_permutation = bbi_permutation.compute_importance()
pvals_permutation = -np.log10(results_permutation["pval"] + 1e-10)

#############################################################################
# Conditional Variable Importance
# -------------------------------
# In this example, for the conditional permutation with the two blocks
# processing, the inference and importance estimators are set to the Random
# Forest. The significance is measured by the mean of -log10(p_value).
# For the conditional permutation importance based on the two blocks (inference
# + importance), both estimators are set to Random Forest learners. The
# significance is measured by the mean of -log10(p_value).
#

bbi_cond = BlockBasedImportance(
bbi_conditional = BlockBasedImportance(
estimator="RF",
importance_estimator="residuals_RF",
do_hypertuning=True,
@@ -111,28 +112,42 @@
problem_type="regression",
k_fold=k_fold,
variables_categories=variables_categories,
n_jobs=10,
n_jobs=2,
verbose=0,
n_permutations=100,
)
bbi_cond.fit(X, y)
bbi_conditional.fit(X, y)
print("Computing the importance scores with conditional permutation")
results_cond = bbi_cond.compute_importance()
pvals_cond = -np.log10(results_cond["pval"] + 1e-5)
results_conditional = bbi_conditional.compute_importance()
pvals_conditional = -np.log10(results_conditional["pval"] + 1e-5)

#############################################################################
# Leave-One-Covariate-Out (LOCO)
# ------------------------------
# We compare the previous permutation-based approaches with a removal-based
# approach, LOCO (Williamson et al., Journal of the American Statistical
# Association, 2021), where the variable of interest is removed and the
# inference estimator is refitted on the remaining features; importance is
# measured by the resulting increase in loss (drop in performance).
#

results_loco = compute_loco(X, y, use_dnn=False)
pvals_loco = -np.log10(results_loco["p_value"] + 1e-5)

#############################################################################
# Plotting the comparison
# -----------------------

list_res = {"Perm": [], "Cond": []}
list_res = {"Permutation": [], "Conditional": [], "LOCO": []}
for index, _ in enumerate(diabetes.feature_names):
list_res["Perm"].append(pvals_perm[index][0])
list_res["Cond"].append(pvals_cond[index][0])
list_res["Permutation"].append(pvals_permutation[index][0])
list_res["Conditional"].append(pvals_conditional[index][0])
list_res["LOCO"].append(pvals_loco[index])

x = np.arange(len(diabetes.feature_names))
width = 0.25 # the width of the bars
multiplier = 0
fig, ax = plt.subplots(figsize=(5, 5), layout="constrained")
fig, ax = plt.subplots(figsize=(10, 10), layout="constrained")

for attribute, measurement in list_res.items():
offset = width * multiplier
@@ -141,7 +156,7 @@

ax.set_ylabel(r"$-log_{10}p_{val}$")
ax.set_xticks(x + width, diabetes.feature_names)  # center the ticks on the three grouped bars
ax.legend(loc="upper left", ncols=2)
ax.legend(loc="upper left", ncols=3)
ax.set_ylim(0, 3)
ax.axhline(y=-np.log10(0.05), color="r", linestyle="-")
plt.show()
@@ -153,5 +168,6 @@
# this prediction, the conditional permutation (the controlled alternative)
# shows an agreement for "bmi", "bp" and "s6" but also highlights the importance
# of "sex" in this prediction, thus reducing the input space to four significant
# variables.
# variables. LOCO underlines the importance of a single variable, "bp", for
# this prediction problem.
#
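# As a short illustrative follow-up (a sketch reusing the names defined in
# this example; the helper variables below are assumptions), the 0.05
# threshold drawn in the plot can be applied directly to list the variables
# deemed significant by the conditional permutation:

significance_threshold = -np.log10(0.05)
selected_variables = [
    name
    for index, name in enumerate(diabetes.feature_names)
    if pvals_conditional[index][0] > significance_threshold
]
print("Significant variables (conditional permutation):", selected_variables)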
40 changes: 30 additions & 10 deletions examples/plot_residuals_sampling.py
@@ -2,6 +2,25 @@
Conditional sampling using residuals vs sampling Random Forest
==============================================================
To deploy the Conditional Permutation Importance (CPI),
:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the
conditional scheme: 1) Instead of directly permuting the variable of interest
as in the Permutation Feature Importance (PFI), the residuals of predicting
the variable of interest x_j from the remaining variables are first computed,
along with the predicted version x_hat_j. The residuals are shuffled and added
back to the predicted version to recreate the variable of interest (preserving
the dependency between the variable of interest and the remaining variables
while breaking its relationship with the outcome). 2) Another option is to use
a sampling Random Forest: the remaining variables are used to predict the
variable of interest and, instead of returning the mean of the instances'
outcomes in the targeted leaf (or the most frequent class), a value is sampled
from that leaf, among the neighbors of the instance of interest, following the
standard path of the Random Forest.

References
----------
.. footbibliography::
"""

#############################################################################
@@ -19,7 +38,7 @@
from sklearn.metrics import roc_auc_score
import time

n, p = (1000, 12)
n, p = (100, 12)
inter_cor, intra_cor = (0, 0.85)
n_blocks = 1
n_signal = 2
@@ -32,11 +51,11 @@
# Generate the synthetic data
# ---------------------------
# The function below generates the correlation matrix between the variables
# according to the provided degrees of correlation (intra + inter). (inter_cor)
# according to the provided degrees of correlation (intra + inter). `inter_cor`
# indicates the degree of correlation between the variables/groups whereas
# (intra_cor) specifies the corresponding degree between the variables within
# each group. For the single-level case, the (n_blocks) is set to 1 and the
# (intra_cor) illustrates the correlation between the variables.
# `intra_cor` specifies the corresponding degree between the variables within
# each group. For the single-level case, `n_blocks` is set to 1 and the
# `intra_cor` is the unique correlation between variables.
#
# Next, we generate the synthetic data by randomly drawing n_signal predictors
# from the corresponding p variables and reordering the set of variables to put the
@@ -98,7 +117,7 @@ def _generate_data(seed):
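# A minimal illustrative sketch of the single-block correlation structure
# described above (a toy helper assumed for illustration, not the actual
# `_generate_data` implementation; `np` is the numpy import used in this
# script):


def _toy_block_data(n=100, p=12, intra_cor=0.85, seed=0):
    rng = np.random.default_rng(seed)
    cor_mat = np.full((p, p), intra_cor)  # constant within-block correlation
    np.fill_diagonal(cor_mat, 1.0)  # unit variance on the diagonal
    return rng.multivariate_normal(np.zeros(p), cor_mat, size=n)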
# Processing across multiple permutations
# ---------------------------------------
# In order to get statistical significance with p-values, we run the experiments
# across 100 repetitions.
# across 10 repetitions.
#


@@ -107,12 +126,12 @@ def compute_simulations(seed):
# Using the residuals
start_residuals = time.time()
bbi_residual = BlockBasedImportance(
estimator="DNN",
estimator="RF",
importance_estimator="residuals_RF",
do_hypertuning=True,
dict_hypertuning=None,
conditional=True,
n_permutations=50,
n_permutations=10,
n_jobs=2,
problem_type="regression",
k_fold=2,
@@ -133,12 +152,12 @@ def compute_simulations(seed):
# Using the sampling RF
start_sampling = time.time()
bbi_sampling = BlockBasedImportance(
estimator="DNN",
estimator="RF",
importance_estimator="sampling_RF",
do_hypertuning=True,
dict_hypertuning=None,
conditional=True,
n_permutations=50,
n_permutations=10,
n_jobs=2,
problem_type="regression",
k_fold=2,
@@ -160,6 +179,7 @@ def compute_simulations(seed):
return df_final


# Running across 10 repetitions
parallel = Parallel(n_jobs=2, verbose=1)
final_result = parallel(
delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11)
8 changes: 8 additions & 0 deletions hidimstat/importance_functions.py
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
@@ -34,7 +35,11 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed=
A dictionary containing the importance scores (importance_score) and
p-values (p_value).
"""
# Convert X to a pandas DataFrame
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
y = np.array(y)

dict_vals = {"importance_score": [], "p_value": []}
dict_encode_outcome = {"regression": False, "classification": True}

@@ -133,4 +138,7 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed=
dict_vals["importance_score"].append(np.mean(delta))
dict_vals["p_value"].append(p_value)

# Convert lists to numpy arrays
dict_vals["importance_score"] = np.array(dict_vals["importance_score"])
dict_vals["p_value"] = np.array(dict_vals["p_value"])
return dict_vals
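
A brief usage sketch of the updated return format (illustrative only; the
random data below and the one-score-per-column shape are assumptions):

import numpy as np
from hidimstat import compute_loco

X_demo = np.random.RandomState(0).randn(100, 5)
y_demo = X_demo[:, 0] + 0.1 * np.random.RandomState(1).randn(100)
results_demo = compute_loco(X_demo, y_demo, use_dnn=False)
print(results_demo["importance_score"])  # numpy array of LOCO scores
print(results_demo["p_value"])           # numpy array of the matching p-values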
