Tackle comments
achamma723 committed Jul 19, 2024
1 parent fad5cca commit 60e9955
Showing 4 changed files with 76 additions and 39 deletions.
9 changes: 1 addition & 8 deletions doc_conf/references.bib
@@ -75,11 +75,4 @@ @article{miPermutationbasedIdentificationImportant2021
copyright = {2021 The Author(s)},
langid = {english},
keywords = {Cancer,Data mining,Machine learning,Statistical methods},
annotation = {Bandiera\_abtest: a\\
Cc\_license\_type: cc\_by\\
Cg\_type: Nature Research Journals\\
Primary\_atype: Research\\
Subject\_term: Cancer;Data mining;Machine learning;Statistical methods\\
Subject\_term\_id: cancer;data-mining;machine-learning;statistical-methods},
file = {/home/ahmad/Zotero/storage/AL4ZN6J6/Mi et al. - 2021 - Permutation-based identification of important biom.pdf;/home/ahmad/Zotero/storage/MXTSY7AF/s41467-021-22756-2.html}
}
}
58 changes: 37 additions & 21 deletions examples/plot_diabetes_variable_importance_example.py
@@ -15,7 +15,7 @@
degree of correlation between the variables, thus biased towards truly
non-significant variables that are highly correlated with the significant ones,
creating falsely significant variables. They introduced a solution for the Random
Forest estimator based on the conditional sampling by performing sub-groups
Forest estimator based on conditional sampling by performing sub-groups
permutation when bisecting the space using the conditioning variables of the
building process. However, this solution is exclusive to the Random Forest and is
costly in high-dimensional settings.
@@ -51,6 +51,7 @@
from sklearn.datasets import load_diabetes

from hidimstat.BBI import BlockBasedImportance
from hidimstat import compute_loco

plt.rcParams.update({"font.size": 14})

@@ -71,10 +72,10 @@
# To apply the standard permutation, we use the implementation introduced by (Mi
# et al., Nature, 2021) where the significance is measured by the mean of
# -log10(p_value). For this example, the inference estimator is set to the
# Random Forest.
# Random Forest learner.
#

bbi_perm = BlockBasedImportance(
bbi_permutation = BlockBasedImportance(
estimator="RF",
importance_estimator="residuals_RF",
do_hypertuning=True,
@@ -84,24 +85,24 @@
problem_type="regression",
k_fold=k_fold,
variables_categories=variables_categories,
n_jobs=10,
n_jobs=2,
verbose=0,
n_permutations=100,
)
bbi_perm.fit(X, y)
bbi_permutation.fit(X, y)
print("Computing the importance scores with standard permutation")
results_perm = bbi_perm.compute_importance()
pvals_perm = -np.log10(results_perm["pval"] + 1e-10)
results_permutation = bbi_permutation.compute_importance()
pvals_permutation = -np.log10(results_permutation["pval"] + 1e-10)

#############################################################################
# Conditional Variable Importance
# -------------------------------
# In this example, for the conditional permutation with the two blocks
# processing, the inference and importance estimators are set to the Random
# Forest. The significance is measured by the mean of -log10(p_value).
# For the conditional permutation importance based on the two blocks (inference
# + importance), both estimators are set to Random Forest learners. The
# significance is measured by the mean of -log10(p_value).
#

bbi_cond = BlockBasedImportance(
bbi_conditional = BlockBasedImportance(
estimator="RF",
importance_estimator="residuals_RF",
do_hypertuning=True,
@@ -111,28 +112,42 @@
problem_type="regression",
k_fold=k_fold,
variables_categories=variables_categories,
n_jobs=10,
n_jobs=2,
verbose=0,
n_permutations=100,
)
bbi_cond.fit(X, y)
bbi_conditional.fit(X, y)
print("Computing the importance scores with conditional permutation")
results_cond = bbi_cond.compute_importance()
pvals_cond = -np.log10(results_cond["pval"] + 1e-5)
results_conditional = bbi_conditional.compute_importance()
pvals_conditional = -np.log10(results_conditional["pval"] + 1e-5)

#############################################################################
# Leave-One-Covariate-Out (LOCO)
# ------------------------------
# We compare the previous permutation-based approaches with a removal-based
# approach, LOCO (Williamson et al., Journal of the American Statistical
# Association, 2021), where the variable of interest is removed and the
# inference estimator is refitted on the remaining features; importance is
# measured by the resulting increase in loss (drop in performance).
#

results_loco = compute_loco(X, y, use_dnn=False)
pvals_loco = -np.log10(results_loco["p_value"] + 1e-5)

#############################################################################
# Plotting the comparison
# -----------------------

list_res = {"Perm": [], "Cond": []}
list_res = {"Permutation": [], "Conditional": [], "LOCO": []}
for index, _ in enumerate(diabetes.feature_names):
list_res["Perm"].append(pvals_perm[index][0])
list_res["Cond"].append(pvals_cond[index][0])
list_res["Permutation"].append(pvals_permutation[index][0])
list_res["Conditional"].append(pvals_conditional[index][0])
list_res["LOCO"].append(pvals_loco[index])

x = np.arange(len(diabetes.feature_names))
width = 0.25 # the width of the bars
multiplier = 0
fig, ax = plt.subplots(figsize=(5, 5), layout="constrained")
fig, ax = plt.subplots(figsize=(10, 10), layout="constrained")

for attribute, measurement in list_res.items():
offset = width * multiplier
@@ -141,7 +156,7 @@

ax.set_ylabel(r"$-log_{10}p_{val}$")
ax.set_xticks(x + width, diabetes.feature_names)  # center the ticks on the three grouped bars
ax.legend(loc="upper left", ncols=2)
ax.legend(loc="upper left", ncols=3)
ax.set_ylim(0, 3)
ax.axhline(y=-np.log10(0.05), color="r", linestyle="-")
plt.show()
@@ -153,5 +168,6 @@
# this prediction, the conditional permutation (the controlled alternative)
# shows an agreement for "bmi", "bp" and "s6" but also highlights the importance
# of "sex" in this prediction, thus reducing the input space to four significant
# variables.
# variables. LOCO underlines the importance of a single variable, "bp", for
# this prediction problem.
#
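# As a short illustrative follow-up (a sketch reusing the names defined in
# this example; the helper variables below are assumptions), the 0.05
# threshold drawn in the plot can be applied directly to list the variables
# deemed significant by the conditional permutation:

significance_threshold = -np.log10(0.05)
selected_variables = [
    name
    for index, name in enumerate(diabetes.feature_names)
    if pvals_conditional[index][0] > significance_threshold
]
print("Significant variables (conditional permutation):", selected_variables)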
40 changes: 30 additions & 10 deletions examples/plot_residuals_sampling.py
@@ -2,6 +2,25 @@
Conditional sampling using residuals vs sampling Random Forest
==============================================================
To deploy the Conditional Permutation Importance (CPI),
:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the
conditional scheme: 1) Instead of directly permuting the variable of interest
as in the Permutation Feature Importance (PFI), the residuals of predicting
the variable of interest x_j from the remaining variables are first computed,
along with the predicted version x_hat_j. The residuals are shuffled and added
back to the predicted version to recreate the variable of interest (preserving
the dependency between the variable of interest and the remaining variables
while breaking its relationship with the outcome). 2) Another option is to use
a sampling Random Forest: the remaining variables are used to predict the
variable of interest and, instead of returning the mean of the instances'
outcomes in the targeted leaf (or the most frequent class), a value is sampled
from that leaf, among the neighbors of the instance of interest, following the
standard path of the Random Forest.

References
----------
.. footbibliography::
"""

#############################################################################
@@ -19,7 +38,7 @@
from sklearn.metrics import roc_auc_score
import time

n, p = (1000, 12)
n, p = (100, 12)
inter_cor, intra_cor = (0, 0.85)
n_blocks = 1
n_signal = 2
@@ -32,11 +51,11 @@
# Generate the synthetic data
# ---------------------------
# The function below generates the correlation matrix between the variables
# according to the provided degrees of correlation (intra + inter). (inter_cor)
# according to the provided degrees of correlation (intra + inter). `inter_cor`
# indicates the degree of correlation between the variables/groups whereas
# (intra_cor) specifies the corresponding degree between the variables within
# each group. For the single-level case, the (n_blocks) is set to 1 and the
# (intra_cor) illustrates the correlation between the variables.
# `intra_cor` specifies the corresponding degree between the variables within
# each group. For the single-level case, `n_blocks` is set to 1 and the
# `intra_cor` is the unique correlation between variables.
#
# Next, we generate the synthetic data by randomly drawing n_signal predictors
# from the corresponding p variables and reordering the set of variables to put the
@@ -98,7 +117,7 @@ def _generate_data(seed):
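# A minimal illustrative sketch of the single-block correlation structure
# described above (a toy helper assumed for illustration, not the actual
# `_generate_data` implementation; `np` is the numpy import used in this
# script):


def _toy_block_data(n=100, p=12, intra_cor=0.85, seed=0):
    rng = np.random.default_rng(seed)
    cor_mat = np.full((p, p), intra_cor)  # constant within-block correlation
    np.fill_diagonal(cor_mat, 1.0)  # unit variance on the diagonal
    return rng.multivariate_normal(np.zeros(p), cor_mat, size=n)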
# Processing across multiple permutations
# ---------------------------------------
# In order to get statistical significance with p-values, we run the experiments
# across 100 repetitions.
# across 10 repetitions.
#


@@ -107,12 +126,12 @@ def compute_simulations(seed):
# Using the residuals
start_residuals = time.time()
bbi_residual = BlockBasedImportance(
estimator="DNN",
estimator="RF",
importance_estimator="residuals_RF",
do_hypertuning=True,
dict_hypertuning=None,
conditional=True,
n_permutations=50,
n_permutations=10,
n_jobs=2,
problem_type="regression",
k_fold=2,
@@ -133,12 +152,12 @@ def compute_simulations(seed):
# Using the sampling RF
start_sampling = time.time()
bbi_sampling = BlockBasedImportance(
estimator="DNN",
estimator="RF",
importance_estimator="sampling_RF",
do_hypertuning=True,
dict_hypertuning=None,
conditional=True,
n_permutations=50,
n_permutations=10,
n_jobs=2,
problem_type="regression",
k_fold=2,
@@ -160,6 +179,7 @@ def compute_simulations(seed):
return df_final


# Running across 10 repetitions
parallel = Parallel(n_jobs=2, verbose=1)
final_result = parallel(
delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11)
8 changes: 8 additions & 0 deletions hidimstat/importance_functions.py
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
@@ -34,7 +35,11 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed=
A dictionary containing the importance scores (importance_score) and
p-values (p_value).
"""
# Convert X to a pandas DataFrame
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
y = np.array(y)

dict_vals = {"importance_score": [], "p_value": []}
dict_encode_outcome = {"regression": False, "classification": True}

@@ -133,4 +138,7 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed=
dict_vals["importance_score"].append(np.mean(delta))
dict_vals["p_value"].append(p_value)

# Convert lists to numpy arrays
dict_vals["importance_score"] = np.array(dict_vals["importance_score"])
dict_vals["p_value"] = np.array(dict_vals["p_value"])
return dict_vals
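
A brief usage sketch of the updated return format (illustrative only; the
random data below and the one-score-per-column shape are assumptions):

import numpy as np
from hidimstat import compute_loco

X_demo = np.random.RandomState(0).randn(100, 5)
y_demo = X_demo[:, 0] + 0.1 * np.random.RandomState(1).randn(100)
results_demo = compute_loco(X_demo, y_demo, use_dnn=False)
print(results_demo["importance_score"])  # numpy array of LOCO scores
print(results_demo["p_value"])           # numpy array of the matching p-values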
