From 303ff40a9f44fe2aac9fd9d94c76376a7c515a6d Mon Sep 17 00:00:00 2001 From: achamma723 Date: Mon, 15 Jul 2024 03:03:51 +0200 Subject: [PATCH 1/6] Fix doc (mind) + doc example vi --- doc_conf/references.bib | 62 ++++++++++++++++ examples/plot_2D_simulation_example.py | 2 +- ...ot_diabetes_variable_importance_example.py | 71 ++++++++++++++++--- 3 files changed, 125 insertions(+), 10 deletions(-) diff --git a/doc_conf/references.bib b/doc_conf/references.bib index 01a858c..4627487 100644 --- a/doc_conf/references.bib +++ b/doc_conf/references.bib @@ -20,4 +20,66 @@ @article{Chamma_AAAI2024 year={2024}, month={Mar.}, pages={11195-11203} +} + +@article{breimanRandomForests2001, + title = {Random {{Forests}}}, + author = {Breiman, Leo}, + year = {2001}, + month = oct, + journal = {Machine Learning}, + volume = {45}, + number = {1}, + pages = {5--32}, + issn = {1573-0565}, + doi = {10.1023/A:1010933404324}, + urldate = {2022-06-28}, + abstract = {Random forests are a combination of tree predictors such that each tree depends on the values of a random vector sampled independently and with the same distribution for all trees in the forest. The generalization error for forests converges a.s. to a limit as the number of trees in the forest becomes large. The generalization error of a forest of tree classifiers depends on the strength of the individual trees in the forest and the correlation between them. Using a random selection of features to split each node yields error rates that compare favorably to Adaboost (Y. Freund \& R. Schapire, Machine Learning: Proceedings of the Thirteenth International conference, ***, 148--156), but are more robust with respect to noise. Internal estimates monitor error, strength, and correlation and these are used to show the response to increasing the number of features used in the splitting. Internal estimates are also used to measure variable importance. These ideas are also applicable to regression.}, + langid = {english}, + keywords = {classification,ensemble,regression}, + file = {/home/ahmad/Zotero/storage/BYQ8Z75L/Breiman - 2001 - Random Forests.pdf} +} + +@article{stroblConditionalVariableImportance2008, + title = {Conditional Variable Importance for Random Forests}, + author = {Strobl, Carolin and Boulesteix, Anne-Laure and Kneib, Thomas and Augustin, Thomas and Zeileis, Achim}, + year = {2008}, + month = jul, + journal = {BMC Bioinformatics}, + volume = {9}, + number = {1}, + pages = {307}, + issn = {1471-2105}, + doi = {10.1186/1471-2105-9-307}, + urldate = {2022-01-12}, + abstract = {Random forests are becoming increasingly popular in many scientific fields because they can cope with "small n large p" problems, complex interactions and even highly correlated predictor variables. Their variable importance measures have recently been suggested as screening tools for, e.g., gene expression studies. However, these variable importance measures show a bias towards correlated predictor variables.}, + langid = {english}, + file = {/home/ahmad/Zotero/storage/ML7VF5ZJ/Strobl et al. 
- 2008 - Conditional variable importance for random forests.pdf} +} + + +@article{miPermutationbasedIdentificationImportant2021, + title = {Permutation-Based Identification of Important Biomarkers for Complex Diseases via Machine Learning Models}, + author = {Mi, Xinlei and Zou, Baiming and Zou, Fei and Hu, Jianhua}, + year = {2021}, + month = may, + journal = {Nature Communications}, + volume = {12}, + number = {1}, + pages = {3008}, + publisher = {Nature Publishing Group}, + issn = {2041-1723}, + doi = {10.1038/s41467-021-22756-2}, + urldate = {2022-01-12}, + abstract = {Study of human disease remains challenging due to convoluted disease etiologies and complex molecular mechanisms at genetic, genomic, and proteomic levels. Many machine learning-based methods have been developed and widely used to alleviate some analytic challenges in complex human disease studies. While enjoying the modeling flexibility and robustness, these model frameworks suffer from non-transparency and difficulty in interpreting each individual feature due to their sophisticated algorithms. However, identifying important biomarkers is a critical pursuit towards assisting researchers to establish novel hypotheses regarding prevention, diagnosis and treatment of complex human diseases. Herein, we propose a Permutation-based Feature Importance Test (PermFIT) for estimating and testing the feature importance, and for assisting interpretation of individual feature in complex frameworks, including deep neural networks, random forests, and support vector machines. PermFIT (available at https://github.com/SkadiEye/deepTL) is implemented in a computationally efficient manner, without model refitting. We conduct extensive numerical studies under various scenarios, and show that PermFIT not only yields valid statistical inference, but also improves the prediction accuracy of machine learning models. With the application to the Cancer Genome Atlas kidney tumor data and the HITChip atlas data, PermFIT demonstrates its practical usage in identifying important biomarkers and boosting model prediction performance.}, + copyright = {2021 The Author(s)}, + langid = {english}, + keywords = {Cancer,Data mining,Machine learning,Statistical methods}, + annotation = {Bandiera\_abtest: a\\ +Cc\_license\_type: cc\_by\\ +Cg\_type: Nature Research Journals\\ +Primary\_atype: Research\\ +Subject\_term: Cancer;Data mining;Machine learning;Statistical methods\\ +Subject\_term\_id: cancer;data-mining;machine-learning;statistical-methods}, + file = {/home/ahmad/Zotero/storage/AL4ZN6J6/Mi et al. - 2021 - Permutation-based identification of important biom.pdf;/home/ahmad/Zotero/storage/MXTSY7AF/s41467-021-22756-2.html} } \ No newline at end of file diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 57bee37..88e329a 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -233,7 +233,7 @@ def plot(maps, titles): # infernece method that does not leverage the data structure. This method # was introduced by Javanmard, A. et al. (2014), Zhang, C. H. et al. (2014) # and Van de Geer, S. et al.. (2014) (full references are available at -# https://Parietal-INRIA.github.io/hidimstat/). +# https://mind-inria.github.io/hidimstat/). # and referred to as Desparsified Lasso. 
 # compute desparsified lasso
diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py
index fba01dc..dc6790c 100644
--- a/examples/plot_diabetes_variable_importance_example.py
+++ b/examples/plot_diabetes_variable_importance_example.py
@@ -2,8 +2,44 @@
 Variable Importance on diabetes dataset
 =======================================
-This example compares the standard permutation approach for variable importance
-and its conditional variant on the diabetes dataset for the single-level case.
+Variable Importance estimates the influence of a given input variable on the
+prediction made by a model. To perform variable importance in a prediction
+problem, :footcite:t:`breimanRandomForests2001` introduced the permutation
+approach where the values are shuffled for one variable/column at a time. This
+permutation breaks the relationship between the variable of interest and the
+outcome. The loss score is then compared before and after this
+substitution: any significant drop in performance reflects the importance of
+this variable for predicting the outcome. This easy-to-use solution
+is demonstrated, in the work by
+:footcite:t:`stroblConditionalVariableImportance2008`, to be affected by the
+degree of correlation between the variables, thus biased towards truly
+non-significant variables highly correlated with the significant ones and
+creating fake significant variables. They introduced a solution for the Random
+Forest estimator based on the conditional sampling by performing sub-groups
+permutation when bisecting the space using the conditioning variables of the
+building process. However, this solution is exclusive to the Random Forest and is
+costly with high-dimensional settings.
+:footcite:t:`Chamma_NeurIPS2023` introduced a new model-agnostic solution to
+bypass the limitations of the permutation approach through the use of
+conditional schemes. The variable of interest contains two types of
+information: 1) its relationship with the remaining variables and 2) its
+relationship with the outcome. The standard permutation, while breaking the
+relationship with the outcome, also destroys the dependency with the
+remaining variables. Therefore, instead of directly permuting the variable of
+interest, the variable of interest is predicted by the mean of the remaining
+variables and the residuals of this prediction are permuted before
+reconstructing the new version of the variable. This solution preserves the
+dependency with the remaining variables.
+
+In this example, we compare the standard permutation approach and its
+conditional variant for variable importance on the diabetes dataset in the
+single-level case. The aim is to see whether integrating the new
+statistically-controlled solution has an impact on the results.
+
+References
+----------
+..
footbibliography:: + """ ############################################################################# @@ -25,16 +61,20 @@ # Use or not a cross-validation with the provided learner k_fold = 2 -# Identifying the categorical (nominal & ordinal) variables +# Identifying the categorical (nominal, binary & ordinal) variables variables_categories = {} ############################################################################# # Standard Variable Importance # ---------------------------- +# To apply the standard permutation, we use the implementation introduced by (Mi +# et al., Nature, 2021) where the significance is measured by the mean of +# -log10(p_value). For this example, the inference estimator is set to the +# Random Forest. +# bbi_perm = BlockBasedImportance( estimator="RF", - importance_estimator="Mod_RF", do_hypertuning=True, dict_hypertuning=None, conditional=False, @@ -54,10 +94,14 @@ ############################################################################# # Conditional Variable Importance # ------------------------------- +# In this example, for the conditional permutation with the two blocks +# processing, the inference and importance estimators are set to the Random +# Forest. The significance is measured by the mean of -log10(p_value). +# bbi_cond = BlockBasedImportance( estimator="RF", - importance_estimator="Mod_RF", + importance_estimator=None, do_hypertuning=True, dict_hypertuning=None, conditional=True, @@ -79,9 +123,9 @@ # ----------------------- list_res = {"Perm": [], "Cond": []} -for ind_el, el in enumerate(diabetes.feature_names): - list_res["Perm"].append(pvals_perm[ind_el][0]) - list_res["Cond"].append(pvals_cond[ind_el][0]) +for index, _ in enumerate(diabetes.feature_names): + list_res["Perm"].append(pvals_perm[index][0]) + list_res["Cond"].append(pvals_cond[index][0]) x = np.arange(len(diabetes.feature_names)) width = 0.25 # the width of the bars @@ -98,5 +142,14 @@ ax.legend(loc="upper left", ncols=2) ax.set_ylim(0, 3) ax.axhline(y=-np.log10(0.05), color="r", linestyle="-") - plt.show() + +############################################################################# +# Analysis of the results +# ----------------------- +# While the standard permutation flags multiple variables to be significant for +# this prediction, the conditional permutation (the controlled alternative) +# shows an agreement for "bmi", "bp" and "s6" but also highlights the importance +# of "sex" in this prediction, thus reducing the input space to four significant +# variables. 
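As a companion to the comparison above, here is a minimal, self-contained sketch of the residual-based conditional permutation described in this example's docstring. It is illustrative only: `BlockBasedImportance` performs this step internally (with cross-fitting), and the helper name below is hypothetical.

import numpy as np
from sklearn.ensemble import RandomForestRegressor


def conditionally_permute(X, j, seed=0):
    """Shuffle column j of X while preserving its dependency on the other columns."""
    rng = np.random.default_rng(seed)
    X_minus_j = np.delete(X, j, axis=1)
    # Predict the variable of interest from the remaining variables
    model_j = RandomForestRegressor(n_estimators=100, random_state=seed)
    model_j.fit(X_minus_j, X[:, j])
    x_hat_j = model_j.predict(X_minus_j)
    # Permute only the residuals, then rebuild the variable: the reconstructed
    # column keeps its link with the other variables but not with the outcome
    residuals = X[:, j] - x_hat_j
    X_new = X.copy()
    X_new[:, j] = x_hat_j + rng.permutation(residuals)
    return X_new

Scoring the fitted learner on the reconstructed matrix versus the intact one then gives the conditional importance of variable j.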
+# From 5696f6b3e8382a6343afac7f23742d160a7bfe80 Mon Sep 17 00:00:00 2001 From: achamma723 Date: Wed, 17 Jul 2024 01:08:09 +0200 Subject: [PATCH 2/6] Residuals vs Sampling example --- examples/plot_residuals_sampling.py | 210 ++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 examples/plot_residuals_sampling.py diff --git a/examples/plot_residuals_sampling.py b/examples/plot_residuals_sampling.py new file mode 100644 index 0000000..54788ed --- /dev/null +++ b/examples/plot_residuals_sampling.py @@ -0,0 +1,210 @@ +""" +Conditional sampling using residuals vs sampling Random Forest +============================================================== + +""" + +############################################################################# +# Imports needed for this script +# ------------------------------ + +from hidimstat import BlockBasedImportance +from joblib import Parallel, delayed +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from scipy.linalg import cholesky +from scipy.stats import norm +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import roc_auc_score +import time + +n, p = (1000, 50) +inter_cor, intra_cor = (0, 0.85) +n_blocks = 1 +n_signal = 20 +problem_type = "regression" +snr = 4 +rf = RandomForestRegressor(random_state=2023) +dict_hyper = {"max_depth": [2, 5, 10, 20]} + +############################################################################# +# Generate the synthetic data +# --------------------------- +# The function below generates the correlation matrix between the variables +# according to the provided degrees of correlation (intra + inter). (inter_cor) +# indicates the degree of correlation between the variables/groups whereas +# (intra_cor) specifies the corresponding degree between the variables within +# each group. For the single-level case, the (n_blocks) is set to 1 and the +# (intra_cor) illustrates the correlation between the variables. +# +# Next, we generate the synthetic data by randomly drawing n_signal predictors +# from the corresponding p variables and reordering the set of variables to put the +# n_signal predictors at the beginning. Following, the response is generated +# under a simple linear model with Gaussian noise. 
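One detail worth spelling out before the generator code: the noise level below is set to sigma = ||X beta||_2 / (snr * sqrt(n)). Since ||X beta||_2 / sqrt(n) is essentially the standard deviation of the signal term, this calibration keeps std(signal) / std(noise) close to the requested `snr`. A quick standalone check of that identity, with an arbitrary stand-in for the signal term:

import numpy as np

rng = np.random.default_rng(0)
signal = 3.0 * rng.normal(size=1000)   # stand-in for the X @ beta term
snr = 4
sigma_noise = np.linalg.norm(signal, ord=2) / (snr * np.sqrt(signal.shape[0]))
print(signal.std() / sigma_noise)      # close to 4, the requested SNR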
+ + +def generate_cor_blocks(p, inter_cor, intra_cor, n_blocks): + vars_per_grp = int(p / n_blocks) + cor_mat = np.zeros((p, p)) + cor_mat.fill(inter_cor) + for i in range(n_blocks): + cor_mat[ + (i * vars_per_grp) : ((i + 1) * vars_per_grp), + (i * vars_per_grp) : ((i + 1) * vars_per_grp), + ] = intra_cor + np.fill_diagonal(cor_mat, 1) + return cor_mat + + +def _generate_data(seed): + rng = np.random.RandomState(seed) + + cor_mat = generate_cor_blocks(p, inter_cor, intra_cor, n_blocks) + x = norm.rvs(size=(p, n), random_state=seed) + c = cholesky(cor_mat, lower=True) + X = pd.DataFrame(np.dot(c, x).T, columns=[str(i) for i in np.arange(p)]) + + data = X.copy() + + # Randomly draw n_signal predictors which are defined as signal predictors + indices_var = list(rng.choice(range(data.shape[1]), size=n_signal, replace=False)) + + # Reorder data matrix so that first n_signal predictors are the signal predictors + # List of remaining indices + indices_rem = [ind for ind in range(data.shape[1]) if ind not in indices_var] + total_indices = indices_var + indices_rem + # Before including the non-linear effects + data = data.iloc[:, total_indices] + data_signal = data.iloc[:, np.arange(n_signal)] + + # Determine beta coefficients + effectset = [-0.5, -1, -2, -3, 0.5, 1, 2, 3] + beta = rng.choice(effectset, size=data_signal.shape[1], replace=True) + + # Generate response + # The product of the signal predictors with the beta coefficients + prod_signal = np.dot(data_signal, beta) + + sigma_noise = np.linalg.norm(prod_signal, ord=2) / ( + snr * np.sqrt(data_signal.shape[0]) + ) + y = prod_signal + sigma_noise * rng.normal(size=prod_signal.shape[0]) + + return data, y + + +############################################################################# +# Processing across multiple permutations +# --------------------------------------- +# In order to get statistical significance with p-values, we run the experiments +# across 100 repetitions. 
+#
+
+
+def compute_simulations(seed):
+    X, y = _generate_data(seed)
+    # Using the residuals
+    start_residuals = time.time()
+    bbi_residual = BlockBasedImportance(
+        estimator="RF",
+        importance_estimator="residuals_RF",
+        do_hypertuning=True,
+        dict_hypertuning=None,
+        conditional=True,
+        n_permutations=50,
+        n_jobs=2,
+        problem_type="regression",
+        k_fold=2,
+        variables_categories={},
+    )
+    bbi_residual.fit(X, y)
+    results_bbi_residual = bbi_residual.compute_importance()
+
+    df_residuals = {}
+    df_residuals["method"] = ["residuals"] * X.shape[1]
+    df_residuals["score"] = [results_bbi_residual["score_R2"]] * X.shape[1]
+    df_residuals["elapsed"] = [time.time() - start_residuals] * X.shape[1]
+    df_residuals["importance"] = np.ravel(results_bbi_residual["importance"])
+    df_residuals["p-value"] = np.ravel(results_bbi_residual["pval"])
+    df_residuals["iteration"] = [seed] * X.shape[1]
+    df_residuals = pd.DataFrame(df_residuals)
+
+    # Using the sampling RF
+    start_sampling = time.time()
+    bbi_sampling = BlockBasedImportance(
+        estimator="RF",
+        importance_estimator="sampling_RF",
+        do_hypertuning=True,
+        dict_hypertuning=None,
+        conditional=True,
+        n_permutations=50,
+        n_jobs=2,
+        problem_type="regression",
+        k_fold=2,
+        variables_categories={},
+    )
+    bbi_sampling.fit(X, y)
+    results_bbi_sampling = bbi_sampling.compute_importance()
+
+    df_sampling = {}
+    df_sampling["method"] = ["sampling"] * X.shape[1]
+    df_sampling["score"] = [results_bbi_sampling["score_R2"]] * X.shape[1]
+    df_sampling["elapsed"] = [time.time() - start_sampling] * X.shape[1]
+    df_sampling["importance"] = np.ravel(results_bbi_sampling["importance"])
+    df_sampling["p-value"] = np.ravel(results_bbi_sampling["pval"])
+    df_sampling["iteration"] = [seed] * X.shape[1]
+    df_sampling = pd.DataFrame(df_sampling)
+
+    df_final = pd.concat([df_residuals, df_sampling], axis=0)
+    return df_final
+
+
+parallel = Parallel(n_jobs=10, verbose=1)
+final_result = parallel(
+    delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 101)
+)
+
+#############################################################################
+# Plotting AUC score and Type-I error
+# -----------------------------------
+# Deciding whether a variable is relevant or non-relevant against the
+# ground truth is a binary classification problem, so we measure the
+# performance in terms of the type-I error, i.e. the rate of truly non-relevant
+# variables detected as relevant, and the AUC score, which reflects how well
+# the significant variables are ranked.
+# + +df_final_result = pd.concat(final_result, axis=0).reset_index(drop=True) +df_auc = df_final_result.groupby(by=["method", "iteration"]).apply( + lambda x: roc_auc_score([1] * n_signal + [0] * (p - n_signal), -x["p-value"]) +) +df_auc = df_auc.reset_index(name="auc") +df_type_I = df_final_result.groupby(by=["method", "iteration"]).apply( + lambda x: sum(x.iloc[n_signal:, :]["p-value"] <= 0.05) / x.iloc[2:, :].shape[0] +) +df_type_I = df_type_I.reset_index(name="type-I") + +auc = [ + np.array(df_auc["auc"])[: int(df_auc.shape[0] / 2)], + np.array(df_auc["auc"])[int(df_auc.shape[0] / 2) :], +] +typeI_error = [ + np.array(df_type_I["type-I"])[: int(df_type_I.shape[0] / 2)], + np.array(df_type_I["type-I"])[int(df_type_I.shape[0] / 2) :], +] + +fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), sharey=True) + +# AUC score +axs[0].violinplot(auc, showmeans=False, showmedians=True, vert=False) +axs[0].set_title("AUC score") +axs[0].xaxis.grid(True) +axs[0].set_yticks([x + 1 for x in range(len(auc))], labels=["Residuals", "Sampling"]) +axs[0].set_ylabel("Method") + +# Type-I Error +axs[1].violinplot(typeI_error, showmeans=False, showmedians=True, vert=False) +axs[1].set_title("Type-I Error") +axs[1].axvline(x=0.05, color="r", label="Nominal Rate") +plt.show() From 72d5cdfd5ef2f0de0eed94c0d7af4a5c89925b28 Mon Sep 17 00:00:00 2001 From: achamma723 Date: Wed, 17 Jul 2024 03:19:24 +0200 Subject: [PATCH 3/6] Residuals vs Sampling example --- examples/plot_residuals_sampling.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/plot_residuals_sampling.py b/examples/plot_residuals_sampling.py index 54788ed..482ef75 100644 --- a/examples/plot_residuals_sampling.py +++ b/examples/plot_residuals_sampling.py @@ -19,10 +19,10 @@ from sklearn.metrics import roc_auc_score import time -n, p = (1000, 50) +n, p = (1000, 12) inter_cor, intra_cor = (0, 0.85) n_blocks = 1 -n_signal = 20 +n_signal = 2 problem_type = "regression" snr = 4 rf = RandomForestRegressor(random_state=2023) @@ -107,7 +107,7 @@ def compute_simulations(seed): # Using the residuals start_residuals = time.time() bbi_residual = BlockBasedImportance( - estimator="RF", + estimator="DNN", importance_estimator="residuals_RF", do_hypertuning=True, dict_hypertuning=None, @@ -133,7 +133,7 @@ def compute_simulations(seed): # Using the sampling RF start_sampling = time.time() bbi_sampling = BlockBasedImportance( - estimator="RF", + estimator="DNN", importance_estimator="sampling_RF", do_hypertuning=True, dict_hypertuning=None, @@ -160,9 +160,9 @@ def compute_simulations(seed): return df_final -parallel = Parallel(n_jobs=10, verbose=1) +parallel = Parallel(n_jobs=2, verbose=1) final_result = parallel( - delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 101) + delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11) ) ############################################################################# From 60e9955312fe091ae192d995428cbee7e1aaa4f8 Mon Sep 17 00:00:00 2001 From: achamma723 Date: Fri, 19 Jul 2024 13:00:40 +0200 Subject: [PATCH 4/6] Tackle comments --- doc_conf/references.bib | 9 +-- ...ot_diabetes_variable_importance_example.py | 58 ++++++++++++------- examples/plot_residuals_sampling.py | 40 +++++++++---- hidimstat/importance_functions.py | 8 +++ 4 files changed, 76 insertions(+), 39 deletions(-) diff --git a/doc_conf/references.bib b/doc_conf/references.bib index 4627487..820ac43 100644 --- a/doc_conf/references.bib +++ b/doc_conf/references.bib @@ -75,11 +75,4 @@ 
@article{miPermutationbasedIdentificationImportant2021 copyright = {2021 The Author(s)}, langid = {english}, keywords = {Cancer,Data mining,Machine learning,Statistical methods}, - annotation = {Bandiera\_abtest: a\\ -Cc\_license\_type: cc\_by\\ -Cg\_type: Nature Research Journals\\ -Primary\_atype: Research\\ -Subject\_term: Cancer;Data mining;Machine learning;Statistical methods\\ -Subject\_term\_id: cancer;data-mining;machine-learning;statistical-methods}, - file = {/home/ahmad/Zotero/storage/AL4ZN6J6/Mi et al. - 2021 - Permutation-based identification of important biom.pdf;/home/ahmad/Zotero/storage/MXTSY7AF/s41467-021-22756-2.html} -} \ No newline at end of file +} diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index 3d28334..6391ed6 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -15,7 +15,7 @@ degree of correlation between the variables, thus biased towards truly non-significant variables highly correlated with the significant ones and creating fake significant variables. They introduced a solution for the Random -Forest estimator based on the conditional sampling by performing sub-groups +Forest estimator based on conditional sampling by performing sub-groups permutation when bisecting the space using the conditioning variables of the buiding process. However, this solution is exclusive to the Random Forest and is costly with high-dimensional settings. @@ -51,6 +51,7 @@ from sklearn.datasets import load_diabetes from hidimstat.BBI import BlockBasedImportance +from hidimstat import compute_loco plt.rcParams.update({"font.size": 14}) @@ -71,10 +72,10 @@ # To apply the standard permutation, we use the implementation introduced by (Mi # et al., Nature, 2021) where the significance is measured by the mean of # -log10(p_value). For this example, the inference estimator is set to the -# Random Forest. +# Random Forest learner. # -bbi_perm = BlockBasedImportance( +bbi_permutation = BlockBasedImportance( estimator="RF", importance_estimator="residuals_RF", do_hypertuning=True, @@ -84,24 +85,24 @@ problem_type="regression", k_fold=k_fold, variables_categories=variables_categories, - n_jobs=10, + n_jobs=2, verbose=0, n_permutations=100, ) -bbi_perm.fit(X, y) +bbi_permutation.fit(X, y) print("Computing the importance scores with standard permutation") -results_perm = bbi_perm.compute_importance() -pvals_perm = -np.log10(results_perm["pval"] + 1e-10) +results_permutation = bbi_permutation.compute_importance() +pvals_permutation = -np.log10(results_permutation["pval"] + 1e-10) ############################################################################# # Conditional Variable Importance # ------------------------------- -# In this example, for the conditional permutation with the two blocks -# processing, the inference and importance estimators are set to the Random -# Forest. The significance is measured by the mean of -log10(p_value). +# For the conditional permutation importance based on the two blocks (inference +# + importance), the estimators are set to the Random Forest learner. The +# significance is measured by the mean of -log10(p_value). 
# -bbi_cond = BlockBasedImportance( +bbi_conditional = BlockBasedImportance( estimator="RF", importance_estimator="residuals_RF", do_hypertuning=True, @@ -111,28 +112,42 @@ problem_type="regression", k_fold=k_fold, variables_categories=variables_categories, - n_jobs=10, + n_jobs=2, verbose=0, n_permutations=100, ) -bbi_cond.fit(X, y) +bbi_conditional.fit(X, y) print("Computing the importance scores with conditional permutation") -results_cond = bbi_cond.compute_importance() -pvals_cond = -np.log10(results_cond["pval"] + 1e-5) +results_conditional = bbi_conditional.compute_importance() +pvals_conditional = -np.log10(results_conditional["pval"] + 1e-5) + +############################################################################# +# Leave-One-Covariate-Out (LOCO) +# ------------------------------ +# We compare the previous permutation-based approaches with a removal-based +# approach LOCO (Williamson et al., Journal of the American Statistical +# Association, 2021) where the variable of interest is removed and the inference +# estimator is retrained using the new features to compare the loss for any drop in the +# performance. +# + +results_loco = compute_loco(X, y, use_dnn=False) +pvals_loco = -np.log10(results_loco["p_value"] + 1e-5) ############################################################################# # Plotting the comparison # ----------------------- -list_res = {"Perm": [], "Cond": []} +list_res = {"Permutation": [], "Conditional": [], "LOCO": []} for index, _ in enumerate(diabetes.feature_names): - list_res["Perm"].append(pvals_perm[index][0]) - list_res["Cond"].append(pvals_cond[index][0]) + list_res["Permutation"].append(pvals_permutation[index][0]) + list_res["Conditional"].append(pvals_conditional[index][0]) + list_res["LOCO"].append(pvals_loco[index]) x = np.arange(len(diabetes.feature_names)) width = 0.25 # the width of the bars multiplier = 0 -fig, ax = plt.subplots(figsize=(5, 5), layout="constrained") +fig, ax = plt.subplots(figsize=(10, 10), layout="constrained") for attribute, measurement in list_res.items(): offset = width * multiplier @@ -141,7 +156,7 @@ ax.set_ylabel(r"$-log_{10}p_{val}$") ax.set_xticks(x + width / 2, diabetes.feature_names) -ax.legend(loc="upper left", ncols=2) +ax.legend(loc="upper left", ncols=3) ax.set_ylim(0, 3) ax.axhline(y=-np.log10(0.05), color="r", linestyle="-") plt.show() @@ -153,5 +168,6 @@ # this prediction, the conditional permutation (the controlled alternative) # shows an agreement for "bmi", "bp" and "s6" but also highlights the importance # of "sex" in this prediction, thus reducing the input space to four significant -# variables. +# variables. LOCO underlines the importance of one variable "bp" for this +# prediction problem. # diff --git a/examples/plot_residuals_sampling.py b/examples/plot_residuals_sampling.py index 482ef75..f07e0af 100644 --- a/examples/plot_residuals_sampling.py +++ b/examples/plot_residuals_sampling.py @@ -2,6 +2,25 @@ Conditional sampling using residuals vs sampling Random Forest ============================================================== +To deploy the Conditional Permutation Importance (CPI), +:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the +conditional scheme: 1) Instead of directly permuting the variable of interest as +in the Permutation Feature Importance (PFI), the residuals of the prediction of +the variable of interest x_j based on the remaining variables is first computed +along with a predicted version x_hat_j. 
These residuals are shuffled and added +to the predicted version to recreate the variable of interest (Preserving the +dependency between the variable of interest and the remaining variables while +breaking the relationship with the outcome). 2) Another option is to use the +sampling Random Forest. Using the remaining variables to predict the variable of +interest, and instead of predicting the variable of interest as the mean of the +instances' outcome of the targeted leaf or the class with the most occurences, +we sample from the same leaf of the instance of interest within its neighbors, +and we follow the standard path of the Random Forest. + +References +---------- +.. footbibliography:: + """ ############################################################################# @@ -19,7 +38,7 @@ from sklearn.metrics import roc_auc_score import time -n, p = (1000, 12) +n, p = (100, 12) inter_cor, intra_cor = (0, 0.85) n_blocks = 1 n_signal = 2 @@ -32,11 +51,11 @@ # Generate the synthetic data # --------------------------- # The function below generates the correlation matrix between the variables -# according to the provided degrees of correlation (intra + inter). (inter_cor) +# according to the provided degrees of correlation (intra + inter). `inter_cor` # indicates the degree of correlation between the variables/groups whereas -# (intra_cor) specifies the corresponding degree between the variables within -# each group. For the single-level case, the (n_blocks) is set to 1 and the -# (intra_cor) illustrates the correlation between the variables. +# `intra_cor` specifies the corresponding degree between the variables within +# each group. For the single-level case, `n_blocks` is set to 1 and the +# `intra_cor` is the unique correlation between variables. # # Next, we generate the synthetic data by randomly drawing n_signal predictors # from the corresponding p variables and reordering the set of variables to put the @@ -98,7 +117,7 @@ def _generate_data(seed): # Processing across multiple permutations # --------------------------------------- # In order to get statistical significance with p-values, we run the experiments -# across 100 repetitions. +# across 10 repetitions. 
# @@ -107,12 +126,12 @@ def compute_simulations(seed): # Using the residuals start_residuals = time.time() bbi_residual = BlockBasedImportance( - estimator="DNN", + estimator="RF", importance_estimator="residuals_RF", do_hypertuning=True, dict_hypertuning=None, conditional=True, - n_permutations=50, + n_permutations=10, n_jobs=2, problem_type="regression", k_fold=2, @@ -133,12 +152,12 @@ def compute_simulations(seed): # Using the sampling RF start_sampling = time.time() bbi_sampling = BlockBasedImportance( - estimator="DNN", + estimator="RF", importance_estimator="sampling_RF", do_hypertuning=True, dict_hypertuning=None, conditional=True, - n_permutations=50, + n_permutations=10, n_jobs=2, problem_type="regression", k_fold=2, @@ -160,6 +179,7 @@ def compute_simulations(seed): return df_final +# Running across 10 repetitions parallel = Parallel(n_jobs=2, verbose=1) final_result = parallel( delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11) diff --git a/hidimstat/importance_functions.py b/hidimstat/importance_functions.py index a77e79c..cd17a5d 100644 --- a/hidimstat/importance_functions.py +++ b/hidimstat/importance_functions.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from scipy.stats import ttest_1samp from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.model_selection import GridSearchCV @@ -34,7 +35,11 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed= A dictionary containing the importance scores (importance_score) and p-values (p_value). """ + # Convert X to pandas dataframe + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X) y = np.array(y) + dict_vals = {"importance_score": [], "p_value": []} dict_encode_outcome = {"regression": False, "classification": True} @@ -133,4 +138,7 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed= dict_vals["importance_score"].append(np.mean(delta)) dict_vals["p_value"].append(p_value) + # Convert lists to numpy array + dict_vals["importance_score"] = np.array(dict_vals["importance_score"]) + dict_vals["p_value"] = np.array(dict_vals["p_value"]) return dict_vals From be8d6973a9be83a446382edb1df8604811d08a2c Mon Sep 17 00:00:00 2001 From: achamma723 Date: Fri, 19 Jul 2024 13:04:37 +0200 Subject: [PATCH 5/6] Fix analysis results residuals_sampling --- examples/plot_residuals_sampling.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/plot_residuals_sampling.py b/examples/plot_residuals_sampling.py index f07e0af..ec30d6e 100644 --- a/examples/plot_residuals_sampling.py +++ b/examples/plot_residuals_sampling.py @@ -228,3 +228,10 @@ def compute_simulations(seed): axs[1].set_title("Type-I Error") axs[1].axvline(x=0.05, color="r", label="Nominal Rate") plt.show() + +############################################################################# +# Analysis of the results +# ----------------------- +# We can observe that the sampling approaches'performance is almost similar to +# that of the residuals. Sampling accelerates the conditional importance +# computation by simplifying the residuals steps. 
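To make the comparison above concrete, here is a minimal single-tree sketch of the leaf-sampling idea behind `importance_estimator="sampling_RF"`. It is illustrative only: the helper name is hypothetical, and hidimstat's actual implementation works on the full forest with cross-fitting.

import numpy as np
from sklearn.tree import DecisionTreeRegressor


def sample_within_leaves(X_minus_j, x_j, seed=0):
    """Replace x_j by values drawn from the tree leaf each sample falls into."""
    rng = np.random.default_rng(seed)
    tree = DecisionTreeRegressor(min_samples_leaf=10, random_state=seed)
    tree.fit(X_minus_j, x_j)
    leaves = tree.apply(X_minus_j)  # leaf id of every sample
    x_j_new = np.empty_like(x_j)
    for leaf in np.unique(leaves):
        members = np.flatnonzero(leaves == leaf)
        # each sample receives the x_j value of a random co-member of its leaf,
        # instead of the leaf mean used for ordinary prediction
        x_j_new[members] = x_j[rng.choice(members, size=members.size)]
    return x_j_new

No residuals are computed or permuted here, which is where the speed-up over the residual-based scheme comes from.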
From eee37513cab5046422c69f685737516362548338 Mon Sep 17 00:00:00 2001
From: achamma723
Date: Fri, 19 Jul 2024 16:02:26 +0200
Subject: [PATCH 6/6] Minor changes

---
 examples/plot_diabetes_variable_importance_example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py
index 6391ed6..98e2597 100644
--- a/examples/plot_diabetes_variable_importance_example.py
+++ b/examples/plot_diabetes_variable_importance_example.py
@@ -3,7 +3,7 @@
 =======================================
 Variable Importance estimates the influence of a given input variable on the
-prediction made by a model. To perform variable importance in a prediction
+prediction made by a model. To assess variable importance in a prediction
 problem, :footcite:t:`breimanRandomForests2001` introduced the permutation
 approach where the values are shuffled for one variable/column at a time. This
 permutation breaks the relationship between the variable of interest and the
 outcome. The loss score is then compared before and after this
@@ -26,7 +26,7 @@
 relationship with the outcome. The standard permutation, while breaking the
 relationship with the outcome, also destroys the dependency with the
 remaining variables. Therefore, instead of directly permuting the variable of
-interest, the variable of interest is predicted by the mean of the remaining
+interest, the variable of interest is predicted by the remaining
 variables and the residuals of this prediction are permuted before
 reconstructing the new version of the variable. This solution preserves the
 dependency with the remaining variables.
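For completeness, the LOCO baseline added to the diabetes example in patch 4 can be pictured with a similarly small sketch: drop the variable of interest, refit, and look at the increase in test loss. This is only an illustration of the idea (the function name below is hypothetical); the series itself relies on hidimstat's `compute_loco`.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


def loco_importance(X, y, j, seed=0):
    """Increase in test loss when column j is left out of the model."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    full = RandomForestRegressor(random_state=seed).fit(X_train, y_train)
    loss_full = mean_squared_error(y_test, full.predict(X_test))
    reduced = RandomForestRegressor(random_state=seed).fit(
        np.delete(X_train, j, axis=1), y_train
    )
    loss_reduced = mean_squared_error(
        y_test, reduced.predict(np.delete(X_test, j, axis=1))
    )
    return loss_reduced - loss_full  # > 0 when the variable carries information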