From 60e9955312fe091ae192d995428cbee7e1aaa4f8 Mon Sep 17 00:00:00 2001 From: achamma723 Date: Fri, 19 Jul 2024 13:00:40 +0200 Subject: [PATCH] Tackle comments --- doc_conf/references.bib | 9 +-- ...ot_diabetes_variable_importance_example.py | 58 ++++++++++++------- examples/plot_residuals_sampling.py | 40 +++++++++---- hidimstat/importance_functions.py | 8 +++ 4 files changed, 76 insertions(+), 39 deletions(-) diff --git a/doc_conf/references.bib b/doc_conf/references.bib index 4627487..820ac43 100644 --- a/doc_conf/references.bib +++ b/doc_conf/references.bib @@ -75,11 +75,4 @@ @article{miPermutationbasedIdentificationImportant2021 copyright = {2021 The Author(s)}, langid = {english}, keywords = {Cancer,Data mining,Machine learning,Statistical methods}, - annotation = {Bandiera\_abtest: a\\ -Cc\_license\_type: cc\_by\\ -Cg\_type: Nature Research Journals\\ -Primary\_atype: Research\\ -Subject\_term: Cancer;Data mining;Machine learning;Statistical methods\\ -Subject\_term\_id: cancer;data-mining;machine-learning;statistical-methods}, - file = {/home/ahmad/Zotero/storage/AL4ZN6J6/Mi et al. - 2021 - Permutation-based identification of important biom.pdf;/home/ahmad/Zotero/storage/MXTSY7AF/s41467-021-22756-2.html} -} \ No newline at end of file +} diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index 3d28334..6391ed6 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -15,7 +15,7 @@ degree of correlation between the variables, thus biased towards truly non-significant variables highly correlated with the significant ones and creating fake significant variables. They introduced a solution for the Random -Forest estimator based on the conditional sampling by performing sub-groups +Forest estimator based on conditional sampling by performing sub-groups permutation when bisecting the space using the conditioning variables of the buiding process. However, this solution is exclusive to the Random Forest and is costly with high-dimensional settings. @@ -51,6 +51,7 @@ from sklearn.datasets import load_diabetes from hidimstat.BBI import BlockBasedImportance +from hidimstat import compute_loco plt.rcParams.update({"font.size": 14}) @@ -71,10 +72,10 @@ # To apply the standard permutation, we use the implementation introduced by (Mi # et al., Nature, 2021) where the significance is measured by the mean of # -log10(p_value). For this example, the inference estimator is set to the -# Random Forest. +# Random Forest learner. 
 #
 
-bbi_perm = BlockBasedImportance(
+bbi_permutation = BlockBasedImportance(
     estimator="RF",
     importance_estimator="residuals_RF",
     do_hypertuning=True,
@@ -84,24 +85,24 @@
     problem_type="regression",
     k_fold=k_fold,
     variables_categories=variables_categories,
-    n_jobs=10,
+    n_jobs=2,
     verbose=0,
     n_permutations=100,
 )
-bbi_perm.fit(X, y)
+bbi_permutation.fit(X, y)
 print("Computing the importance scores with standard permutation")
-results_perm = bbi_perm.compute_importance()
-pvals_perm = -np.log10(results_perm["pval"] + 1e-10)
+results_permutation = bbi_permutation.compute_importance()
+pvals_permutation = -np.log10(results_permutation["pval"] + 1e-10)
 
 #############################################################################
 # Conditional Variable Importance
 # -------------------------------
-# In this example, for the conditional permutation with the two blocks
-# processing, the inference and importance estimators are set to the Random
-# Forest. The significance is measured by the mean of -log10(p_value).
+# For the conditional permutation importance, both blocks (the inference and the
+# importance estimators) are set to the Random Forest learner. The significance
+# is measured by the mean of -log10(p_value).
 #
 
-bbi_cond = BlockBasedImportance(
+bbi_conditional = BlockBasedImportance(
     estimator="RF",
     importance_estimator="residuals_RF",
     do_hypertuning=True,
@@ -111,28 +112,42 @@
     problem_type="regression",
     k_fold=k_fold,
     variables_categories=variables_categories,
-    n_jobs=10,
+    n_jobs=2,
     verbose=0,
     n_permutations=100,
 )
-bbi_cond.fit(X, y)
+bbi_conditional.fit(X, y)
 print("Computing the importance scores with conditional permutation")
-results_cond = bbi_cond.compute_importance()
-pvals_cond = -np.log10(results_cond["pval"] + 1e-5)
+results_conditional = bbi_conditional.compute_importance()
+pvals_conditional = -np.log10(results_conditional["pval"] + 1e-5)
+
+#############################################################################
+# Leave-One-Covariate-Out (LOCO)
+# ------------------------------
+# We compare the previous permutation-based approaches with a removal-based
+# approach, LOCO (Williamson et al., Journal of the American Statistical
+# Association, 2021), where the variable of interest is removed and the inference
+# estimator is refitted on the remaining variables. The loss of the refitted model
+# is then compared with that of the full model to quantify the drop in performance.
+#
+
+results_loco = compute_loco(X, y, use_dnn=False)
+pvals_loco = -np.log10(results_loco["p_value"] + 1e-5)
 
 #############################################################################
 # Plotting the comparison
 # -----------------------
-list_res = {"Perm": [], "Cond": []}
+list_res = {"Permutation": [], "Conditional": [], "LOCO": []}
 for index, _ in enumerate(diabetes.feature_names):
-    list_res["Perm"].append(pvals_perm[index][0])
-    list_res["Cond"].append(pvals_cond[index][0])
+    list_res["Permutation"].append(pvals_permutation[index][0])
+    list_res["Conditional"].append(pvals_conditional[index][0])
+    list_res["LOCO"].append(pvals_loco[index])
 
 x = np.arange(len(diabetes.feature_names))
 width = 0.25  # the width of the bars
 multiplier = 0
 
-fig, ax = plt.subplots(figsize=(5, 5), layout="constrained")
+fig, ax = plt.subplots(figsize=(10, 10), layout="constrained")
 
 for attribute, measurement in list_res.items():
     offset = width * multiplier
@@ -141,7 +156,7 @@
 
 ax.set_ylabel(r"$-log_{10}p_{val}$")
 ax.set_xticks(x + width / 2, diabetes.feature_names)
-ax.legend(loc="upper left", ncols=2)
+ax.legend(loc="upper left", ncols=3)
 ax.set_ylim(0, 3)
 ax.axhline(y=-np.log10(0.05), color="r", linestyle="-")
 plt.show()
@@ -153,5 +168,6 @@
 # this prediction, the conditional permutation (the controlled alternative)
 # shows an agreement for "bmi", "bp" and "s6" but also highlights the importance
 # of "sex" in this prediction, thus reducing the input space to four significant
-# variables.
+# variables. LOCO underlines the importance of a single variable, "bp", for this
+# prediction problem.
 #
diff --git a/examples/plot_residuals_sampling.py b/examples/plot_residuals_sampling.py
index 482ef75..f07e0af 100644
--- a/examples/plot_residuals_sampling.py
+++ b/examples/plot_residuals_sampling.py
@@ -2,6 +2,25 @@
 Conditional sampling using residuals vs sampling Random Forest
 ==============================================================
 
+To deploy the Conditional Permutation Importance (CPI),
+:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the
+conditional scheme: 1) Instead of directly permuting the variable of interest
+as in the Permutation Feature Importance (PFI), the variable of interest x_j
+is first predicted from the remaining variables, which provides a predicted
+version x_hat_j and the corresponding residuals. The residuals are shuffled
+and added back to x_hat_j to reconstruct the variable of interest, which
+preserves its dependency on the remaining variables while breaking its
+relationship with the outcome. 2) Another option is to use a sampling Random
+Forest: the remaining variables are used to predict the variable of interest
+but, instead of returning the mean of the outcomes (or the most frequent
+class) of the targeted leaf, we sample a value from that same leaf among the
+neighbors of the instance of interest, and we follow the standard path of the
+Random Forest.
+
+References
+----------
+.. footbibliography::
+
 """
 
 #############################################################################
@@ -19,7 +38,7 @@
 from sklearn.metrics import roc_auc_score
 import time
 
-n, p = (1000, 12)
+n, p = (100, 12)
 inter_cor, intra_cor = (0, 0.85)
 n_blocks = 1
 n_signal = 2
@@ -32,11 +51,11 @@
 # Generate the synthetic data
 # ---------------------------
 # The function below generates the correlation matrix between the variables
-# according to the provided degrees of correlation (intra + inter).
(inter_cor) +# according to the provided degrees of correlation (intra + inter). `inter_cor` # indicates the degree of correlation between the variables/groups whereas -# (intra_cor) specifies the corresponding degree between the variables within -# each group. For the single-level case, the (n_blocks) is set to 1 and the -# (intra_cor) illustrates the correlation between the variables. +# `intra_cor` specifies the corresponding degree between the variables within +# each group. For the single-level case, `n_blocks` is set to 1 and the +# `intra_cor` is the unique correlation between variables. # # Next, we generate the synthetic data by randomly drawing n_signal predictors # from the corresponding p variables and reordering the set of variables to put the @@ -98,7 +117,7 @@ def _generate_data(seed): # Processing across multiple permutations # --------------------------------------- # In order to get statistical significance with p-values, we run the experiments -# across 100 repetitions. +# across 10 repetitions. # @@ -107,12 +126,12 @@ def compute_simulations(seed): # Using the residuals start_residuals = time.time() bbi_residual = BlockBasedImportance( - estimator="DNN", + estimator="RF", importance_estimator="residuals_RF", do_hypertuning=True, dict_hypertuning=None, conditional=True, - n_permutations=50, + n_permutations=10, n_jobs=2, problem_type="regression", k_fold=2, @@ -133,12 +152,12 @@ def compute_simulations(seed): # Using the sampling RF start_sampling = time.time() bbi_sampling = BlockBasedImportance( - estimator="DNN", + estimator="RF", importance_estimator="sampling_RF", do_hypertuning=True, dict_hypertuning=None, conditional=True, - n_permutations=50, + n_permutations=10, n_jobs=2, problem_type="regression", k_fold=2, @@ -160,6 +179,7 @@ def compute_simulations(seed): return df_final +# Running across 10 repetitions parallel = Parallel(n_jobs=2, verbose=1) final_result = parallel( delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11) diff --git a/hidimstat/importance_functions.py b/hidimstat/importance_functions.py index a77e79c..cd17a5d 100644 --- a/hidimstat/importance_functions.py +++ b/hidimstat/importance_functions.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from scipy.stats import ttest_1samp from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.model_selection import GridSearchCV @@ -34,7 +35,11 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed= A dictionary containing the importance scores (importance_score) and p-values (p_value). """ + # Convert X to pandas dataframe + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) y = np.array(y) + dict_vals = {"importance_score": [], "p_value": []} dict_encode_outcome = {"regression": False, "classification": True} @@ -133,4 +138,7 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed= dict_vals["importance_score"].append(np.mean(delta)) dict_vals["p_value"].append(p_value) + # Convert lists to numpy array + dict_vals["importance_score"] = np.array(dict_vals["importance_score"]) + dict_vals["p_value"] = np.array(dict_vals["p_value"]) return dict_vals
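
The two sketches below are illustrative and not part of the patch. The first is a minimal sketch of the residual-based conditional permutation described in the plot_residuals_sampling docstring; BlockBasedImportance performs this internally, so the helper name, the Random Forest choice, and the variable names here are assumptions made for clarity, not hidimstat's actual implementation.

# Hedged sketch of approach 1): rebuild x_j as x_hat_j + shuffled residuals,
# preserving its dependency on the remaining variables while breaking its
# link with the outcome. Illustrative only.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def conditional_permutation(X, j, rng):
    """Return a copy of X whose j-th column is x_hat_j plus shuffled residuals."""
    X_minus_j = np.delete(X, j, axis=1)            # remaining variables
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    x_hat_j = model.fit(X_minus_j, X[:, j]).predict(X_minus_j)
    residuals = X[:, j] - x_hat_j
    X_perm = X.copy()
    X_perm[:, j] = x_hat_j + rng.permutation(residuals)
    return X_perm

The second sketch shows how compute_loco can be called after this change: X may now be a plain numpy array (it is converted to a pandas DataFrame internally) and the returned importance scores and p-values are numpy arrays. The synthetic data below is made up for illustration; use_dnn=False selects the Random Forest learner, as in the diabetes example.

# Minimal usage sketch of compute_loco (synthetic data, for illustration only).
import numpy as np
from hidimstat import compute_loco

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                  # numpy input, converted to a DataFrame internally
y = X[:, 0] + 0.5 * rng.randn(100)     # only the first variable carries signal

results = compute_loco(X, y, use_dnn=False)
print(results["importance_score"])     # numpy array, one LOCO score per variable
print(results["p_value"])              # numpy array of corresponding p-values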