[FIX]: Diabetes example #4

Closed
wants to merge 5 commits
62 changes: 62 additions & 0 deletions doc_conf/references.bib
@@ -20,4 +20,66 @@ @article{Chamma_AAAI2024
year={2024},
month={Mar.},
pages={11195-11203}
}

@article{breimanRandomForests2001,
title = {Random {{Forests}}},
author = {Breiman, Leo},
year = {2001},
month = oct,
journal = {Machine Learning},
volume = {45},
number = {1},
pages = {5--32},
issn = {1573-0565},
doi = {10.1023/A:1010933404324},
abstract = {Random forests are a combination of tree predictors such that each tree depends on the values of a random vector sampled independently and with the same distribution for all trees in the forest. The generalization error for forests converges a.s. to a limit as the number of trees in the forest becomes large. The generalization error of a forest of tree classifiers depends on the strength of the individual trees in the forest and the correlation between them. Using a random selection of features to split each node yields error rates that compare favorably to Adaboost (Y. Freund \& R. Schapire, Machine Learning: Proceedings of the Thirteenth International conference, ***, 148--156), but are more robust with respect to noise. Internal estimates monitor error, strength, and correlation and these are used to show the response to increasing the number of features used in the splitting. Internal estimates are also used to measure variable importance. These ideas are also applicable to regression.},
langid = {english},
keywords = {classification,ensemble,regression},
}

@article{stroblConditionalVariableImportance2008,
title = {Conditional Variable Importance for Random Forests},
author = {Strobl, Carolin and Boulesteix, Anne-Laure and Kneib, Thomas and Augustin, Thomas and Zeileis, Achim},
year = {2008},
month = jul,
journal = {BMC Bioinformatics},
volume = {9},
number = {1},
pages = {307},
issn = {1471-2105},
doi = {10.1186/1471-2105-9-307},
abstract = {Random forests are becoming increasingly popular in many scientific fields because they can cope with "small n large p" problems, complex interactions and even highly correlated predictor variables. Their variable importance measures have recently been suggested as screening tools for, e.g., gene expression studies. However, these variable importance measures show a bias towards correlated predictor variables.},
langid = {english},
}


@article{miPermutationbasedIdentificationImportant2021,
title = {Permutation-Based Identification of Important Biomarkers for Complex Diseases via Machine Learning Models},
author = {Mi, Xinlei and Zou, Baiming and Zou, Fei and Hu, Jianhua},
year = {2021},
month = may,
journal = {Nature Communications},
volume = {12},
number = {1},
pages = {3008},
publisher = {Nature Publishing Group},
issn = {2041-1723},
doi = {10.1038/s41467-021-22756-2},
abstract = {Study of human disease remains challenging due to convoluted disease etiologies and complex molecular mechanisms at genetic, genomic, and proteomic levels. Many machine learning-based methods have been developed and widely used to alleviate some analytic challenges in complex human disease studies. While enjoying the modeling flexibility and robustness, these model frameworks suffer from non-transparency and difficulty in interpreting each individual feature due to their sophisticated algorithms. However, identifying important biomarkers is a critical pursuit towards assisting researchers to establish novel hypotheses regarding prevention, diagnosis and treatment of complex human diseases. Herein, we propose a Permutation-based Feature Importance Test (PermFIT) for estimating and testing the feature importance, and for assisting interpretation of individual feature in complex frameworks, including deep neural networks, random forests, and support vector machines. PermFIT (available at https://github.com/SkadiEye/deepTL) is implemented in a computationally efficient manner, without model refitting. We conduct extensive numerical studies under various scenarios, and show that PermFIT not only yields valid statistical inference, but also improves the prediction accuracy of machine learning models. With the application to the Cancer Genome Atlas kidney tumor data and the HITChip atlas data, PermFIT demonstrates its practical usage in identifying important biomarkers and boosting model prediction performance.},
copyright = {2021 The Author(s)},
langid = {english},
keywords = {Cancer,Data mining,Machine learning,Statistical methods},
}
2 changes: 1 addition & 1 deletion examples/plot_2D_simulation_example.py
@@ -233,7 +233,7 @@ def plot(maps, titles):
# inference method that does not leverage the data structure. This method
# was introduced by Javanmard, A. et al. (2014), Zhang, C. H. et al. (2014)
# and Van de Geer, S. et al. (2014) (full references are available at
- # https://Parietal-INRIA.github.io/hidimstat/).
+ # https://mind-inria.github.io/hidimstat/).
# It is referred to as the Desparsified Lasso.

# compute desparsified lasso
68 changes: 61 additions & 7 deletions examples/plot_diabetes_variable_importance_example.py
@@ -2,8 +2,44 @@
Variable Importance on diabetes dataset
=======================================

- This example compares the standard permutation approach for variable importance
- and its conditional variant on the diabetes dataset for the single-level case.
Variable importance estimates the influence of a given input variable on the
prediction made by a model. To measure variable importance in a prediction
problem, :footcite:t:`breimanRandomForests2001` introduced the permutation
approach, where the values of one variable/column are shuffled at a time. This
permutation breaks the relationship between the variable of interest and the
outcome. The loss is then compared before and after this substitution: a
significant drop in performance reflects the importance of this variable for
predicting the outcome. This easy-to-use solution was shown by
:footcite:t:`stroblConditionalVariableImportance2008` to be affected by the
degree of correlation between the variables: it is biased towards truly
non-significant variables that are highly correlated with the significant
ones, thus creating spurious significant variables. They introduced a
solution for the Random Forest estimator based on conditional sampling,
permuting within sub-groups obtained by bisecting the space along the
conditioning variables of the tree-building process. However, this solution
is exclusive to the Random Forest and is costly in high-dimensional settings.
:footcite:t:`Chamma_NeurIPS2023` introduced a new model-agnostic solution that
bypasses the limitations of the permutation approach through a conditional
scheme. The variable of interest contains two types of information: 1) its
relationship with the remaining variables and 2) its relationship with the
outcome. The standard permutation, while breaking the relationship with the
outcome, also destroys the dependency with the remaining variables. Therefore,
instead of permuting the variable of interest directly, it is predicted from
the remaining variables and the residuals of this prediction are permuted
before reconstructing the new version of the variable. This solution preserves
the dependency with the remaining variables.

In this example, we compare the standard permutation approach and its
conditional variant for variable importance on the diabetes dataset, in the
single-level case. The aim is to see whether the new statistically-controlled
solution changes the results.

References
----------
.. footbibliography::

"""

#############################################################################
@@ -25,12 +25,17 @@

# Whether to use cross-validation with the provided learner
k_fold = 2
- # Identifying the categorical (nominal & ordinal) variables
+ # Identifying the categorical (nominal, binary & ordinal) variables
variables_categories = {}

#############################################################################
# Standard Variable Importance
# ----------------------------
# To apply the standard permutation, we use the implementation introduced by
# :footcite:t:`miPermutationbasedIdentificationImportant2021`, where the
# significance is measured by the mean of -log10(p-value). For this example,
# the inference estimator is set to the Random Forest.
#

bbi_perm = BlockBasedImportance(
estimator="RF",
@@ -54,6 +54,10 @@
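
#############################################################################
# Illustration only (this snippet is not in the original example): the
# significance scale used here and in the plots below is -log10 of the
# p-value, so larger values mean stronger evidence of importance.

import numpy as np

example_pvals = np.array([0.001, 0.04, 0.3])
print(-np.log10(example_pvals))  # ~[3.0, 1.4, 0.52]; 1.3 corresponds to p = 0.05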
#############################################################################
# Conditional Variable Importance
# -------------------------------
# For the conditional permutation with two-block processing, both the
# inference and importance estimators are set to the Random Forest. The
# significance is again measured by the mean of -log10(p-value).
#

bbi_cond = BlockBasedImportance(
estimator="RF",
@@ -79,9 +79,9 @@
# -----------------------

list_res = {"Perm": [], "Cond": []}
- for ind_el, el in enumerate(diabetes.feature_names):
-     list_res["Perm"].append(pvals_perm[ind_el][0])
-     list_res["Cond"].append(pvals_cond[ind_el][0])
+ for index, _ in enumerate(diabetes.feature_names):
+     list_res["Perm"].append(pvals_perm[index][0])
+     list_res["Cond"].append(pvals_cond[index][0])

x = np.arange(len(diabetes.feature_names))
width = 0.25 # the width of the bars
@@ -98,5 +98,14 @@
ax.legend(loc="upper left", ncols=2)
ax.set_ylim(0, 3)
ax.axhline(y=-np.log10(0.05), color="r", linestyle="-")  # p = 0.05 threshold on the -log10 scale

plt.show()

#############################################################################
# Analysis of the results
# -----------------------
# While the standard permutation flags multiple variables as significant for
# this prediction, the conditional permutation (the controlled alternative)
# agrees on "bmi", "bp" and "s6" but also highlights the importance of "sex",
# thus reducing the input space to four significant variables.
#