diff --git a/docs/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip b/docs/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
index b07fd63..fc22e40 100644
Binary files a/docs/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip and b/docs/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip differ
diff --git a/docs/_downloads/117422582fb46cc8ed6549598a2d87de/plot_dcrt_example.zip b/docs/_downloads/117422582fb46cc8ed6549598a2d87de/plot_dcrt_example.zip
index f25b521..6dd860f 100644
Binary files a/docs/_downloads/117422582fb46cc8ed6549598a2d87de/plot_dcrt_example.zip and b/docs/_downloads/117422582fb46cc8ed6549598a2d87de/plot_dcrt_example.zip differ
diff --git a/docs/_downloads/19e00c6ab9a5db77483b1039457f1a16/plot_residuals_sampling.ipynb b/docs/_downloads/19e00c6ab9a5db77483b1039457f1a16/plot_residuals_sampling.ipynb
deleted file mode 100644
index 1891e4d..0000000
--- a/docs/_downloads/19e00c6ab9a5db77483b1039457f1a16/plot_residuals_sampling.ipynb
+++ /dev/null
@@ -1,111 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n# Conditional sampling using residuals vs sampling Random Forest\n\nTo deploy the Conditional Permutation Importance (CPI),\n:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the\nconditional scheme: 1) Instead of directly permuting the variable of interest as\nin the Permutation Feature Importance (PFI), the residuals of the prediction of\nthe variable of interest x_j based on the remaining variables is first computed\nalong with a predicted version x_hat_j. These residuals are shuffled and added\nto the predicted version to recreate the variable of interest (Preserving the\ndependency between the variable of interest and the remaining variables while\nbreaking the relationship with the outcome). 2) Another option is to use the\nsampling Random Forest. Using the remaining variables to predict the variable of\ninterest, and instead of predicting the variable of interest as the mean of the\ninstances' outcome of the targeted leaf or the class with the most occurences,\nwe sample from the same leaf of the instance of interest within its neighbors,\nand we follow the standard path of the Random Forest.\n\n## References\n.. footbibliography::\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Imports needed for this script\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "from hidimstat import BlockBasedImportance\nfrom joblib import Parallel, delayed\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nfrom scipy.linalg import cholesky\nfrom scipy.stats import norm\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import roc_auc_score\nimport time\n\nn, p = (100, 12)\ninter_cor, intra_cor = (0, 0.85)\nn_blocks = 1\nn_signal = 2\nproblem_type = \"regression\"\nsnr = 4\nrf = RandomForestRegressor(random_state=2023)\ndict_hyper = {\"max_depth\": [2, 5, 10, 20]}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Generate the synthetic data\nThe function below generates the correlation matrix between the variables\naccording to the provided degrees of correlation (intra + inter). `inter_cor`\nindicates the degree of correlation between the variables/groups whereas\n`intra_cor` specifies the corresponding degree between the variables within\neach group. For the single-level case, `n_blocks` is set to 1 and the\n`intra_cor` is the unique correlation between variables.\n\nNext, we generate the synthetic data by randomly drawing n_signal predictors\nfrom the corresponding p variables and reordering the set of variables to put the\nn_signal predictors at the beginning. Following, the response is generated\nunder a simple linear model with Gaussian noise.\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "def generate_cor_blocks(p, inter_cor, intra_cor, n_blocks):\n vars_per_grp = int(p / n_blocks)\n cor_mat = np.zeros((p, p))\n cor_mat.fill(inter_cor)\n for i in range(n_blocks):\n cor_mat[\n (i * vars_per_grp) : ((i + 1) * vars_per_grp),\n (i * vars_per_grp) : ((i + 1) * vars_per_grp),\n ] = intra_cor\n np.fill_diagonal(cor_mat, 1)\n return cor_mat\n\n\ndef _generate_data(seed):\n rng = np.random.RandomState(seed)\n\n cor_mat = generate_cor_blocks(p, inter_cor, intra_cor, n_blocks)\n x = norm.rvs(size=(p, n), random_state=seed)\n c = cholesky(cor_mat, lower=True)\n X = pd.DataFrame(np.dot(c, x).T, columns=[str(i) for i in np.arange(p)])\n\n data = X.copy()\n\n # Randomly draw n_signal predictors which are defined as signal predictors\n indices_var = list(rng.choice(range(data.shape[1]), size=n_signal, replace=False))\n\n # Reorder data matrix so that first n_signal predictors are the signal predictors\n # List of remaining indices\n indices_rem = [ind for ind in range(data.shape[1]) if ind not in indices_var]\n total_indices = indices_var + indices_rem\n # Before including the non-linear effects\n data = data.iloc[:, total_indices]\n data_signal = data.iloc[:, np.arange(n_signal)]\n\n # Determine beta coefficients\n effectset = [-0.5, -1, -2, -3, 0.5, 1, 2, 3]\n beta = rng.choice(effectset, size=data_signal.shape[1], replace=True)\n\n # Generate response\n # The product of the signal predictors with the beta coefficients\n prod_signal = np.dot(data_signal, beta)\n\n sigma_noise = np.linalg.norm(prod_signal, ord=2) / (\n snr * np.sqrt(data_signal.shape[0])\n )\n y = prod_signal + sigma_noise * rng.normal(size=prod_signal.shape[0])\n\n return data, y"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Processing across multiple permutations\nIn order to get statistical significance with p-values, we run the experiments\nacross 10 repetitions.\n\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "def compute_simulations(seed):\n X, y = _generate_data(seed)\n # Using the residuals\n start_residuals = time.time()\n bbi_residual = BlockBasedImportance(\n estimator=\"RF\",\n importance_estimator=\"residuals_RF\",\n do_hypertuning=True,\n dict_hypertuning=None,\n conditional=True,\n n_permutations=10,\n n_jobs=2,\n problem_type=\"regression\",\n k_fold=2,\n variables_categories={},\n )\n bbi_residual.fit(X, y)\n results_bbi_residual = bbi_residual.compute_importance()\n\n df_residuals = {}\n df_residuals[\"method\"] = [\"residuals\"] * X.shape[1]\n df_residuals[\"score\"] = [results_bbi_residual[\"score_R2\"]] * X.shape[1]\n df_residuals[\"elapsed\"] = [time.time() - start_residuals] * X.shape[1]\n df_residuals[\"importance\"] = np.ravel(results_bbi_residual[\"importance\"])\n df_residuals[\"p-value\"] = np.ravel(results_bbi_residual[\"pval\"])\n df_residuals[\"iteration\"] = [seed] * X.shape[1]\n df_residuals = pd.DataFrame(df_residuals)\n\n # Using the sampling RF\n start_sampling = time.time()\n bbi_sampling = BlockBasedImportance(\n estimator=\"RF\",\n importance_estimator=\"sampling_RF\",\n do_hypertuning=True,\n dict_hypertuning=None,\n conditional=True,\n n_permutations=10,\n n_jobs=2,\n problem_type=\"regression\",\n k_fold=2,\n variables_categories={},\n )\n bbi_sampling.fit(X, y)\n results_bbi_sampling = bbi_sampling.compute_importance()\n\n df_sampling = {}\n df_sampling[\"method\"] = [\"sampling\"] * X.shape[1]\n df_sampling[\"score\"] = [results_bbi_sampling[\"score_R2\"]] * X.shape[1]\n df_sampling[\"elapsed\"] = [time.time() - start_sampling] * X.shape[1]\n df_sampling[\"importance\"] = np.ravel(results_bbi_sampling[\"importance\"])\n df_sampling[\"p-value\"] = np.ravel(results_bbi_sampling[\"pval\"])\n df_sampling[\"iteration\"] = [seed] * X.shape[1]\n df_sampling = pd.DataFrame(df_sampling)\n\n df_final = pd.concat([df_residuals, df_sampling], axis=0)\n return df_final\n\n\n# Running across 10 repetitions\nparallel = Parallel(n_jobs=2, verbose=1)\nfinal_result = parallel(\n delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11)\n)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Plotting AUC score and Type-I error\nWith the prediction problems turns to be a binary classification problem for\nthe variables being relevant or non-relevant vs the ground-truth, we measure\nthe performance in terms of type-I error i.e. the rate of true non-relevant\nvariables detected as relevant and AUC score related to correct significant\nvariables ordering.\n\n\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": false
- },
- "outputs": [],
- "source": [
- "df_final_result = pd.concat(final_result, axis=0).reset_index(drop=True)\ndf_auc = df_final_result.groupby(by=[\"method\", \"iteration\"]).apply(\n lambda x: roc_auc_score([1] * n_signal + [0] * (p - n_signal), -x[\"p-value\"])\n)\ndf_auc = df_auc.reset_index(name=\"auc\")\ndf_type_I = df_final_result.groupby(by=[\"method\", \"iteration\"]).apply(\n lambda x: sum(x.iloc[n_signal:, :][\"p-value\"] <= 0.05) / x.iloc[2:, :].shape[0]\n)\ndf_type_I = df_type_I.reset_index(name=\"type-I\")\n\nauc = [\n np.array(df_auc[\"auc\"])[: int(df_auc.shape[0] / 2)],\n np.array(df_auc[\"auc\"])[int(df_auc.shape[0] / 2) :],\n]\ntypeI_error = [\n np.array(df_type_I[\"type-I\"])[: int(df_type_I.shape[0] / 2)],\n np.array(df_type_I[\"type-I\"])[int(df_type_I.shape[0] / 2) :],\n]\n\nfig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), sharey=True)\n\n# AUC score\naxs[0].violinplot(auc, showmeans=False, showmedians=True, vert=False)\naxs[0].set_title(\"AUC score\")\naxs[0].xaxis.grid(True)\naxs[0].set_yticks([x + 1 for x in range(len(auc))], labels=[\"Residuals\", \"Sampling\"])\naxs[0].set_ylabel(\"Method\")\n\n# Type-I Error\naxs[1].violinplot(typeI_error, showmeans=False, showmedians=True, vert=False)\naxs[1].set_title(\"Type-I Error\")\naxs[1].axvline(x=0.05, color=\"r\", label=\"Nominal Rate\")\nplt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Analysis of the results\nWe can observe that the sampling approaches'performance is almost similar to\nthat of the residuals. Sampling accelerates the conditional importance\ncomputation by simplifying the residuals steps.\n\n"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/docs/_downloads/1a99dc8cb1c22f91072d67cf26fce26c/plot_knockoff_aggregation.zip b/docs/_downloads/1a99dc8cb1c22f91072d67cf26fce26c/plot_knockoff_aggregation.zip
index 1807c76..0bc90dd 100644
Binary files a/docs/_downloads/1a99dc8cb1c22f91072d67cf26fce26c/plot_knockoff_aggregation.zip and b/docs/_downloads/1a99dc8cb1c22f91072d67cf26fce26c/plot_knockoff_aggregation.zip differ
diff --git a/docs/_downloads/5ca231767268e6cd969e65225d673650/plot_fmri_data_example.zip b/docs/_downloads/5ca231767268e6cd969e65225d673650/plot_fmri_data_example.zip
index cbcfd87..8f3fc4d 100644
Binary files a/docs/_downloads/5ca231767268e6cd969e65225d673650/plot_fmri_data_example.zip and b/docs/_downloads/5ca231767268e6cd969e65225d673650/plot_fmri_data_example.zip differ
diff --git a/docs/_downloads/642b61154cca48af8e3feb505b920e16/plot_dcrt_example.ipynb b/docs/_downloads/642b61154cca48af8e3feb505b920e16/plot_dcrt_example.ipynb
index a5fb0bf..2f27260 100644
--- a/docs/_downloads/642b61154cca48af8e3feb505b920e16/plot_dcrt_example.ipynb
+++ b/docs/_downloads/642b61154cca48af8e3feb505b920e16/plot_dcrt_example.ipynb
@@ -78,7 +78,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.6"
+ "version": "3.12.7"
}
},
"nbformat": 4,
diff --git a/docs/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip b/docs/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
index 1272a11..c3d0721 100644
Binary files a/docs/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip and b/docs/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip differ
diff --git a/docs/_downloads/6f624092537330c9f373c01828b2b9ae/plot_diabetes_variable_importance_example.py b/docs/_downloads/6f624092537330c9f373c01828b2b9ae/plot_diabetes_variable_importance_example.py
index 249d326..f5126e2 100644
--- a/docs/_downloads/6f624092537330c9f373c01828b2b9ae/plot_diabetes_variable_importance_example.py
+++ b/docs/_downloads/6f624092537330c9f373c01828b2b9ae/plot_diabetes_variable_importance_example.py
@@ -17,8 +17,8 @@
creating fake significant variables. They introduced a solution for the Random
Forest estimator based on conditional sampling by performing sub-groups
permutation when bisecting the space using the conditioning variables of the
-buiding process. However, this solution is exclusive to the Random Forest and is
-costly with high-dimensional settings.
+building process. However, this solution is exclusive to the Random Forest and
+is costly in high-dimensional settings.
:footcite:t:`Chamma_NeurIPS2023` introduced a new model-agnostic solution to
bypass the limitations of the permutation approach under the use of the
conditional schemes. The variable of interest does contain two types of
@@ -45,129 +45,222 @@
#############################################################################
# Imports needed for this script
# ------------------------------
-
import matplotlib.pyplot as plt
import numpy as np
+import pandas as pd
+from scipy.stats import norm
+from sklearn.base import clone
from sklearn.datasets import load_diabetes
+from sklearn.linear_model import RidgeCV
+from sklearn.metrics import r2_score, root_mean_squared_error
+from sklearn.model_selection import KFold
-from hidimstat.bbi import BlockBasedImportance
-from hidimstat import compute_loco
-
-plt.rcParams.update({"font.size": 14})
-
-# Fixing the random seed
-rng = np.random.RandomState(2024)
+from hidimstat.cpi import CPI
+from hidimstat.loco import LOCO
+from hidimstat.permutation_importance import PermutationImportance
+#############################################################################
+# Load the diabetes dataset
+# ------------------------------
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
-# Use or not a cross-validation with the provided learner
-k_fold = 2
-# Identifying the categorical (nominal, binary & ordinal) variables
-variables_categories = {}
+#############################################################################
+# Fit a baseline model on the diabetes dataset
+# --------------------------------------------
+# We use a Ridge regression model with a 5-fold cross-validation to fit the
+# diabetes dataset.
+
+n_folds = 5
+regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
+regressor_list = [clone(regressor) for _ in range(n_folds)]
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
+for i, (train_index, test_index) in enumerate(kf.split(X)):
+ regressor_list[i].fit(X[train_index], y[train_index])
+ score = r2_score(
+ y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
+ )
+    rmse = root_mean_squared_error(
+        y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
+    )
+    print(f"Fold {i} R2: {score:.3f}")
+    print(f"Fold {i} RMSE: {rmse:.3f}")
+
#############################################################################
-# Standard Variable Importance
-# ----------------------------
-# To apply the standard permutation, we use the implementation introduced by (Mi
-# et al., Nature, 2021) where the significance is measured by the mean of
-# -log10(p_value). For this example, the inference estimator is set to the
-# Random Forest learner.
-#
-
-bbi_permutation = BlockBasedImportance(
- estimator="RF",
- importance_estimator="residuals_RF",
- do_hypertuning=True,
- dict_hypertuning=None,
- conditional=False,
- group_stacking=False,
- problem_type="regression",
- k_fold=k_fold,
- variables_categories=variables_categories,
- n_jobs=2,
- verbose=0,
- n_permutations=100,
-)
-bbi_permutation.fit(X, y)
-print("Computing the importance scores with standard permutation")
-results_permutation = bbi_permutation.compute_importance()
-pvals_permutation = -np.log10(results_permutation["pval"] + 1e-10)
+# Refit the baseline model with 10-fold cross-validation
+# ------------------------------------------------------
+# We now refit the model with a 10-fold cross-validation.
+
+n_folds = 10
+regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
+regressor_list = [clone(regressor) for _ in range(n_folds)]
+kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
+for i, (train_index, test_index) in enumerate(kf.split(X)):
+ regressor_list[i].fit(X[train_index], y[train_index])
+ score = r2_score(
+ y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
+ )
+    rmse = root_mean_squared_error(
+        y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
+    )
+
+    print(f"Fold {i} R2: {score:.3f}")
+    print(f"Fold {i} RMSE: {rmse:.3f}")
#############################################################################
-# Conditional Variable Importance
-# -------------------------------
-# For the conditional permutation importance based on the two blocks (inference
-# + importance), the estimators are set to the Random Forest learner. The
-# significance is measured by the mean of -log10(p_value).
-#
-
-bbi_conditional = BlockBasedImportance(
- estimator="RF",
- importance_estimator="residuals_RF",
- do_hypertuning=True,
- dict_hypertuning=None,
- conditional=True,
- group_stacking=False,
- problem_type="regression",
- k_fold=k_fold,
- variables_categories=variables_categories,
- n_jobs=2,
- verbose=0,
- n_permutations=100,
-)
-bbi_conditional.fit(X, y)
-print("Computing the importance scores with conditional permutation")
-results_conditional = bbi_conditional.compute_importance()
-pvals_conditional = -np.log10(results_conditional["pval"] + 1e-5)
+# Measure the importance of variables using the CPI method
+# ---------------------------------------------------------
+
+cpi_importance_list = []
+for i, (train_index, test_index) in enumerate(kf.split(X)):
+ print(f"Fold {i}")
+ X_train, X_test = X[train_index], X[test_index]
+ y_train, y_test = y[train_index], y[test_index]
+ cpi = CPI(
+ estimator=regressor_list[i],
+ imputation_model=RidgeCV(alphas=np.logspace(-3, 3, 10)),
+ # covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
+ n_permutations=50,
+ random_state=0,
+ n_jobs=4,
+ )
+ cpi.fit(X_train, y_train)
+ importance = cpi.score(X_test, y_test)
+ cpi_importance_list.append(importance)
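+
+#############################################################################
+# For intuition, the conditional scheme behind CPI can be sketched in a few
+# lines: regress the variable of interest on the remaining variables, permute
+# only the residuals of that regression, and rebuild the variable. This is a
+# minimal sketch of the principle on one fold, not the internal CPI
+# implementation; the variable index `j` is chosen arbitrarily here.
+
+rng = np.random.RandomState(0)
+j = 0  # variable of interest, for illustration only
+X_minus_j = np.delete(X_train, j, axis=1)
+imputation = RidgeCV(alphas=np.logspace(-3, 3, 10))
+imputation.fit(X_minus_j, X_train[:, j])
+residuals = X_train[:, j] - imputation.predict(X_minus_j)
+# Shuffling only the residuals preserves the dependency of the rebuilt
+# variable on X_minus_j while breaking its relationship with the outcome.
+x_j_perturbed = imputation.predict(X_minus_j) + rng.permutation(residuals)
+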
#############################################################################
-# Leave-One-Covariate-Out (LOCO)
+# Measure the importance of variables using the LOCO method
# ------------------------------
-# We compare the previous permutation-based approaches with a removal-based
-# approach LOCO (Williamson et al., Journal of the American Statistical
-# Association, 2021) where the variable of interest is removed and the inference
-# estimator is retrained using the new features to compare the loss for any drop in the
-# performance.
-#
-results_loco = compute_loco(X, y, use_dnn=False)
-pvals_loco = -np.log10(results_loco["p_value"] + 1e-5)
+loco_importance_list = []
+
+for i, (train_index, test_index) in enumerate(kf.split(X)):
+ print(f"Fold {i}")
+ X_train, X_test = X[train_index], X[test_index]
+ y_train, y_test = y[train_index], y[test_index]
+ loco = LOCO(
+ estimator=regressor_list[i],
+ random_state=0,
+ n_jobs=4,
+ )
+ loco.fit(X_train, y_train)
+ importance = loco.score(X_test, y_test)
+ loco_importance_list.append(importance)
+
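+# Each LOCO importance compares the loss after the variable is removed and
+# the estimator is retrained without it, following the removal-based
+# approach of Williamson et al. (2021).
+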
#############################################################################
-# Plotting the comparison
-# -----------------------
-
-list_res = {"Permutation": [], "Conditional": [], "LOCO": []}
-for index, _ in enumerate(diabetes.feature_names):
- list_res["Permutation"].append(pvals_permutation[index][0])
- list_res["Conditional"].append(pvals_conditional[index][0])
- list_res["LOCO"].append(pvals_loco[index])
-
-x = np.arange(len(diabetes.feature_names))
-width = 0.25 # the width of the bars
-multiplier = 0
-fig, ax = plt.subplots(figsize=(10, 10), layout="constrained")
-
-for attribute, measurement in list_res.items():
- offset = width * multiplier
- rects = ax.bar(x + offset, measurement, width, label=attribute)
- multiplier += 1
-
-ax.set_ylabel(r"$-log_{10}p_{val}$")
-ax.set_xticks(x + width / 2, diabetes.feature_names)
-ax.legend(loc="upper left", ncols=3)
-ax.set_ylim(0, 3)
-ax.axhline(y=-np.log10(0.05), color="r", linestyle="-")
-plt.show()
+# Measure the importance of variables using the permutation method
+# -----------------------------------------------------------------
+
+pi_importance_list = []
+
+for i, (train_index, test_index) in enumerate(kf.split(X)):
+ print(f"Fold {i}")
+ X_train, X_test = X[train_index], X[test_index]
+ y_train, y_test = y[train_index], y[test_index]
+ pi = PermutationImportance(
+ estimator=regressor_list[i],
+ n_permutations=50,
+ random_state=0,
+ n_jobs=4,
+ )
+ pi.fit(X_train, y_train)
+ importance = pi.score(X_test, y_test)
+ pi_importance_list.append(importance)
+
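+# Unlike CPI, the standard permutation shuffles the variable directly, which
+# also destroys its dependency with the remaining variables; the introduction
+# describes this as a source of falsely significant variables under
+# correlation.
+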
#############################################################################
-# Analysis of the results
-# -----------------------
-# While the standard permutation flags multiple variables to be significant for
-# this prediction, the conditional permutation (the controlled alternative)
-# shows an agreement for "bmi", "bp" and "s6" but also highlights the importance
-# of "sex" in this prediction, thus reducing the input space to four significant
-# variables. LOCO underlines the importance of one variable "bp" for this
-# prediction problem.
-#
+# Define a function to compute the p-value from importance values
+# ---------------------------------------------------------------
+def compute_pval(vim):
+ mean_vim = np.mean(vim, axis=0)
+ std_vim = np.std(vim, axis=0)
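+    # One-sided test of the mean fold-wise importance against zero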
+ pval = norm.sf(mean_vim / std_vim)
+ return np.clip(pval, 1e-10, 1 - 1e-10)
+
+
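+# Quick sanity check on hypothetical fold-wise importances: the first variable
+# is consistently positive across folds and gets a small p-value, while the
+# second, centered on zero, does not.
+print(compute_pval(np.array([[0.5, 0.0], [0.6, -0.1], [0.4, 0.1]])))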
+
+#############################################################################
+# Analyze the results
+# ------------------------------
+
+
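+# The constant rescaling of the CPI importance below does not change the
+# p-values, since the z-statistic mean/std is invariant to a positive scale.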
+cpi_vim_arr = np.array([x["importance"] for x in cpi_importance_list]) / 2
+cpi_pval = compute_pval(cpi_vim_arr)
+
+vim = [
+ pd.DataFrame(
+ {
+ "var": np.arange(cpi_vim_arr.shape[1]),
+ "importance": x["importance"],
+ "fold": i,
+ "pval": cpi_pval,
+ "method": "CPI",
+ }
+ )
+    for i, x in enumerate(cpi_importance_list)
+]
+
+loco_vim_arr = np.array([x["importance"] for x in loco_importance_list])
+loco_pval = compute_pval(loco_vim_arr)
+
+vim += [
+ pd.DataFrame(
+ {
+ "var": np.arange(loco_vim_arr.shape[1]),
+ "importance": x["importance"],
+ "fold": i,
+ "pval": loco_pval,
+ "method": "LOCO",
+ }
+ )
+    for i, x in enumerate(loco_importance_list)
+]
+
+pi_vim_arr = np.array([x["importance"] for x in pi_importance_list])
+pi_pval = compute_pval(pi_vim_arr)
+
+vim += [
+ pd.DataFrame(
+ {
+ "var": np.arange(pi_vim_arr.shape[1]),
+ "importance": x["importance"],
+ "fold": i,
+ "pval": pi_pval,
+ "method": "PI",
+ }
+ )
+    for i, x in enumerate(pi_importance_list)
+]
+
+fig, ax = plt.subplots()
+df_plot = pd.concat(vim)
+df_plot["pval"] = -np.log10(df_plot["pval"])
+methods = df_plot["method"].unique()
+colors = plt.cm.get_cmap("tab10", 10)
+
+for i, method in enumerate(methods):
+ subset = df_plot[df_plot["method"] == method]
+ ax.bar(
+ subset["var"] + i * 0.2,
+ subset["pval"],
+ width=0.2,
+ label=method,
+ color=colors(i),
+ )
+
+ax.legend(title="Method")
+ax.set_ylabel(r"$-\log_{10}(\text{p-value})$")
+ax.axhline(-np.log10(0.05), color="tab:red", ls="--")
+ax.set_xlabel("Variable")
+ax.set_xticks(np.arange(len(diabetes.feature_names)) + 0.2)
+ax.set_xticklabels(diabetes.feature_names)
+plt.show()
diff --git a/docs/_downloads/707d94040f5ada342e781499193f46f1/plot_diabetes_variable_importance_example.zip b/docs/_downloads/707d94040f5ada342e781499193f46f1/plot_diabetes_variable_importance_example.zip
index 16ab338..28aaf4c 100644
Binary files a/docs/_downloads/707d94040f5ada342e781499193f46f1/plot_diabetes_variable_importance_example.zip and b/docs/_downloads/707d94040f5ada342e781499193f46f1/plot_diabetes_variable_importance_example.zip differ
diff --git a/docs/_downloads/7d2770a07fbe419760c9ac177df4f69e/plot_2D_simulation_example.ipynb b/docs/_downloads/7d2770a07fbe419760c9ac177df4f69e/plot_2D_simulation_example.ipynb
index db2015f..dd3b212 100644
--- a/docs/_downloads/7d2770a07fbe419760c9ac177df4f69e/plot_2D_simulation_example.ipynb
+++ b/docs/_downloads/7d2770a07fbe419760c9ac177df4f69e/plot_2D_simulation_example.ipynb
@@ -240,7 +240,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.6"
+ "version": "3.12.7"
}
},
"nbformat": 4,
diff --git a/docs/_downloads/8635bd4b58b2828c710e4331f35d14f6/plot_knockoff_aggregation.ipynb b/docs/_downloads/8635bd4b58b2828c710e4331f35d14f6/plot_knockoff_aggregation.ipynb
index ca697bf..36ec633 100644
--- a/docs/_downloads/8635bd4b58b2828c710e4331f35d14f6/plot_knockoff_aggregation.ipynb
+++ b/docs/_downloads/8635bd4b58b2828c710e4331f35d14f6/plot_knockoff_aggregation.ipynb
@@ -42,7 +42,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.6"
+ "version": "3.12.7"
}
},
"nbformat": 4,
diff --git a/docs/_downloads/931385a6992917f918857d6a3ee9f780/plot_fmri_data_example.ipynb b/docs/_downloads/931385a6992917f918857d6a3ee9f780/plot_fmri_data_example.ipynb
index e4d5c78..947c86f 100644
--- a/docs/_downloads/931385a6992917f918857d6a3ee9f780/plot_fmri_data_example.ipynb
+++ b/docs/_downloads/931385a6992917f918857d6a3ee9f780/plot_fmri_data_example.ipynb
@@ -301,7 +301,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.6"
+ "version": "3.12.7"
}
},
"nbformat": 4,
diff --git a/docs/_downloads/9c9d12a10e1e5ba45b342ada5a7b5181/plot_residuals_sampling.py b/docs/_downloads/9c9d12a10e1e5ba45b342ada5a7b5181/plot_residuals_sampling.py
deleted file mode 100644
index ec30d6e..0000000
--- a/docs/_downloads/9c9d12a10e1e5ba45b342ada5a7b5181/plot_residuals_sampling.py
+++ /dev/null
@@ -1,237 +0,0 @@
-"""
-Conditional sampling using residuals vs sampling Random Forest
-==============================================================
-
-To deploy the Conditional Permutation Importance (CPI),
-:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the
-conditional scheme: 1) Instead of directly permuting the variable of interest as
-in the Permutation Feature Importance (PFI), the residuals of the prediction of
-the variable of interest x_j based on the remaining variables is first computed
-along with a predicted version x_hat_j. These residuals are shuffled and added
-to the predicted version to recreate the variable of interest (Preserving the
-dependency between the variable of interest and the remaining variables while
-breaking the relationship with the outcome). 2) Another option is to use the
-sampling Random Forest. Using the remaining variables to predict the variable of
-interest, and instead of predicting the variable of interest as the mean of the
-instances' outcome of the targeted leaf or the class with the most occurences,
-we sample from the same leaf of the instance of interest within its neighbors,
-and we follow the standard path of the Random Forest.
-
-References
-----------
-.. footbibliography::
-
-"""
-
-#############################################################################
-# Imports needed for this script
-# ------------------------------
-
-from hidimstat import BlockBasedImportance
-from joblib import Parallel, delayed
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from scipy.linalg import cholesky
-from scipy.stats import norm
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.metrics import roc_auc_score
-import time
-
-n, p = (100, 12)
-inter_cor, intra_cor = (0, 0.85)
-n_blocks = 1
-n_signal = 2
-problem_type = "regression"
-snr = 4
-rf = RandomForestRegressor(random_state=2023)
-dict_hyper = {"max_depth": [2, 5, 10, 20]}
-
-#############################################################################
-# Generate the synthetic data
-# ---------------------------
-# The function below generates the correlation matrix between the variables
-# according to the provided degrees of correlation (intra + inter). `inter_cor`
-# indicates the degree of correlation between the variables/groups whereas
-# `intra_cor` specifies the corresponding degree between the variables within
-# each group. For the single-level case, `n_blocks` is set to 1 and the
-# `intra_cor` is the unique correlation between variables.
-#
-# Next, we generate the synthetic data by randomly drawing n_signal predictors
-# from the corresponding p variables and reordering the set of variables to put the
-# n_signal predictors at the beginning. Following, the response is generated
-# under a simple linear model with Gaussian noise.
-
-
-def generate_cor_blocks(p, inter_cor, intra_cor, n_blocks):
- vars_per_grp = int(p / n_blocks)
- cor_mat = np.zeros((p, p))
- cor_mat.fill(inter_cor)
- for i in range(n_blocks):
- cor_mat[
- (i * vars_per_grp) : ((i + 1) * vars_per_grp),
- (i * vars_per_grp) : ((i + 1) * vars_per_grp),
- ] = intra_cor
- np.fill_diagonal(cor_mat, 1)
- return cor_mat
-
-
-def _generate_data(seed):
- rng = np.random.RandomState(seed)
-
- cor_mat = generate_cor_blocks(p, inter_cor, intra_cor, n_blocks)
- x = norm.rvs(size=(p, n), random_state=seed)
- c = cholesky(cor_mat, lower=True)
- X = pd.DataFrame(np.dot(c, x).T, columns=[str(i) for i in np.arange(p)])
-
- data = X.copy()
-
- # Randomly draw n_signal predictors which are defined as signal predictors
- indices_var = list(rng.choice(range(data.shape[1]), size=n_signal, replace=False))
-
- # Reorder data matrix so that first n_signal predictors are the signal predictors
- # List of remaining indices
- indices_rem = [ind for ind in range(data.shape[1]) if ind not in indices_var]
- total_indices = indices_var + indices_rem
- # Before including the non-linear effects
- data = data.iloc[:, total_indices]
- data_signal = data.iloc[:, np.arange(n_signal)]
-
- # Determine beta coefficients
- effectset = [-0.5, -1, -2, -3, 0.5, 1, 2, 3]
- beta = rng.choice(effectset, size=data_signal.shape[1], replace=True)
-
- # Generate response
- # The product of the signal predictors with the beta coefficients
- prod_signal = np.dot(data_signal, beta)
-
- sigma_noise = np.linalg.norm(prod_signal, ord=2) / (
- snr * np.sqrt(data_signal.shape[0])
- )
- y = prod_signal + sigma_noise * rng.normal(size=prod_signal.shape[0])
-
- return data, y
-
-
-#############################################################################
-# Processing across multiple permutations
-# ---------------------------------------
-# In order to get statistical significance with p-values, we run the experiments
-# across 10 repetitions.
-#
-
-
-def compute_simulations(seed):
- X, y = _generate_data(seed)
- # Using the residuals
- start_residuals = time.time()
- bbi_residual = BlockBasedImportance(
- estimator="RF",
- importance_estimator="residuals_RF",
- do_hypertuning=True,
- dict_hypertuning=None,
- conditional=True,
- n_permutations=10,
- n_jobs=2,
- problem_type="regression",
- k_fold=2,
- variables_categories={},
- )
- bbi_residual.fit(X, y)
- results_bbi_residual = bbi_residual.compute_importance()
-
- df_residuals = {}
- df_residuals["method"] = ["residuals"] * X.shape[1]
- df_residuals["score"] = [results_bbi_residual["score_R2"]] * X.shape[1]
- df_residuals["elapsed"] = [time.time() - start_residuals] * X.shape[1]
- df_residuals["importance"] = np.ravel(results_bbi_residual["importance"])
- df_residuals["p-value"] = np.ravel(results_bbi_residual["pval"])
- df_residuals["iteration"] = [seed] * X.shape[1]
- df_residuals = pd.DataFrame(df_residuals)
-
- # Using the sampling RF
- start_sampling = time.time()
- bbi_sampling = BlockBasedImportance(
- estimator="RF",
- importance_estimator="sampling_RF",
- do_hypertuning=True,
- dict_hypertuning=None,
- conditional=True,
- n_permutations=10,
- n_jobs=2,
- problem_type="regression",
- k_fold=2,
- variables_categories={},
- )
- bbi_sampling.fit(X, y)
- results_bbi_sampling = bbi_sampling.compute_importance()
-
- df_sampling = {}
- df_sampling["method"] = ["sampling"] * X.shape[1]
- df_sampling["score"] = [results_bbi_sampling["score_R2"]] * X.shape[1]
- df_sampling["elapsed"] = [time.time() - start_sampling] * X.shape[1]
- df_sampling["importance"] = np.ravel(results_bbi_sampling["importance"])
- df_sampling["p-value"] = np.ravel(results_bbi_sampling["pval"])
- df_sampling["iteration"] = [seed] * X.shape[1]
- df_sampling = pd.DataFrame(df_sampling)
-
- df_final = pd.concat([df_residuals, df_sampling], axis=0)
- return df_final
-
-
-# Running across 10 repetitions
-parallel = Parallel(n_jobs=2, verbose=1)
-final_result = parallel(
- delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11)
-)
-
-#############################################################################
-# Plotting AUC score and Type-I error
-# -----------------------------------
-# With the prediction problems turns to be a binary classification problem for
-# the variables being relevant or non-relevant vs the ground-truth, we measure
-# the performance in terms of type-I error i.e. the rate of true non-relevant
-# variables detected as relevant and AUC score related to correct significant
-# variables ordering.
-#
-
-df_final_result = pd.concat(final_result, axis=0).reset_index(drop=True)
-df_auc = df_final_result.groupby(by=["method", "iteration"]).apply(
- lambda x: roc_auc_score([1] * n_signal + [0] * (p - n_signal), -x["p-value"])
-)
-df_auc = df_auc.reset_index(name="auc")
-df_type_I = df_final_result.groupby(by=["method", "iteration"]).apply(
- lambda x: sum(x.iloc[n_signal:, :]["p-value"] <= 0.05) / x.iloc[2:, :].shape[0]
-)
-df_type_I = df_type_I.reset_index(name="type-I")
-
-auc = [
- np.array(df_auc["auc"])[: int(df_auc.shape[0] / 2)],
- np.array(df_auc["auc"])[int(df_auc.shape[0] / 2) :],
-]
-typeI_error = [
- np.array(df_type_I["type-I"])[: int(df_type_I.shape[0] / 2)],
- np.array(df_type_I["type-I"])[int(df_type_I.shape[0] / 2) :],
-]
-
-fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), sharey=True)
-
-# AUC score
-axs[0].violinplot(auc, showmeans=False, showmedians=True, vert=False)
-axs[0].set_title("AUC score")
-axs[0].xaxis.grid(True)
-axs[0].set_yticks([x + 1 for x in range(len(auc))], labels=["Residuals", "Sampling"])
-axs[0].set_ylabel("Method")
-
-# Type-I Error
-axs[1].violinplot(typeI_error, showmeans=False, showmedians=True, vert=False)
-axs[1].set_title("Type-I Error")
-axs[1].axvline(x=0.05, color="r", label="Nominal Rate")
-plt.show()
-
-#############################################################################
-# Analysis of the results
-# -----------------------
-# We can observe that the sampling approaches'performance is almost similar to
-# that of the residuals. Sampling accelerates the conditional importance
-# computation by simplifying the residuals steps.
diff --git a/docs/_downloads/a70e28075a283d5e3fe675ced733c459/plot_diabetes_variable_importance_example.ipynb b/docs/_downloads/a70e28075a283d5e3fe675ced733c459/plot_diabetes_variable_importance_example.ipynb
index 3e34c08..74d8cd2 100644
--- a/docs/_downloads/a70e28075a283d5e3fe675ced733c459/plot_diabetes_variable_importance_example.ipynb
+++ b/docs/_downloads/a70e28075a283d5e3fe675ced733c459/plot_diabetes_variable_importance_example.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n# Variable Importance on diabetes dataset\n\nVariable Importance estimates the influence of a given input variable to the\nprediction made by a model. To assess variable importance in a prediction\nproblem, :footcite:t:`breimanRandomForests2001` introduced the permutation\napproach where the values are shuffled for one variable/column at a time. This\npermutation breaks the relationship between the variable of interest and the\noutcome. Following, the loss score is checked before and after this\nsubstitution for any significant drop in the performance which reflects the\nsignificance of this variable to predict the outcome. This ease-to-use solution\nis demonstrated, in the work by\n:footcite:t:`stroblConditionalVariableImportance2008`, to be affected by the\ndegree of correlation between the variables, thus biased towards truly\nnon-significant variables highly correlated with the significant ones and\ncreating fake significant variables. They introduced a solution for the Random\nForest estimator based on conditional sampling by performing sub-groups\npermutation when bisecting the space using the conditioning variables of the\nbuiding process. However, this solution is exclusive to the Random Forest and is\ncostly with high-dimensional settings.\n:footcite:t:`Chamma_NeurIPS2023` introduced a new model-agnostic solution to\nbypass the limitations of the permutation approach under the use of the\nconditional schemes. The variable of interest does contain two types of\ninformation: 1) the relationship with the remaining variables and 2) the\nrelationship with the outcome. The standard permutation, while breaking the\nrelationship with the outcome, is also destroying the dependency with the\nremaining variables. Therefore, instead of directly permuting the variable of\ninterest, the variable of interest is predicted by the remaining\nvariables and the residuals of this prediction are permuted before\nreconstructing the new version of the variable. This solution preserves the\ndependency with the remaining variables.\n\nIn this example, we compare both the standard permutation and its conditional\nvariant approaches for variable importance on the diabetes dataset for the\nsingle-level case. The aim is to see if integrating the new\nstatistically-controlled solution has an impact on the results.\n\n## References\n.. footbibliography::\n"
+        "\n# Variable Importance on diabetes dataset\n\nVariable Importance estimates the influence of a given input variable to the\nprediction made by a model. To assess variable importance in a prediction\nproblem, :footcite:t:`breimanRandomForests2001` introduced the permutation\napproach where the values are shuffled for one variable/column at a time. This\npermutation breaks the relationship between the variable of interest and the\noutcome. Following, the loss score is checked before and after this\nsubstitution for any significant drop in the performance which reflects the\nsignificance of this variable to predict the outcome. This easy-to-use solution\nis demonstrated, in the work by\n:footcite:t:`stroblConditionalVariableImportance2008`, to be affected by the\ndegree of correlation between the variables, thus biased towards truly\nnon-significant variables highly correlated with the significant ones and\ncreating fake significant variables. They introduced a solution for the Random\nForest estimator based on conditional sampling by performing sub-groups\npermutation when bisecting the space using the conditioning variables of the\nbuilding process. However, this solution is exclusive to the Random Forest and\nis costly in high-dimensional settings.\n:footcite:t:`Chamma_NeurIPS2023` introduced a new model-agnostic solution to\nbypass the limitations of the permutation approach under the use of the\nconditional schemes. The variable of interest does contain two types of\ninformation: 1) the relationship with the remaining variables and 2) the\nrelationship with the outcome. The standard permutation, while breaking the\nrelationship with the outcome, is also destroying the dependency with the\nremaining variables. Therefore, instead of directly permuting the variable of\ninterest, the variable of interest is predicted by the remaining\nvariables and the residuals of this prediction are permuted before\nreconstructing the new version of the variable. This solution preserves the\ndependency with the remaining variables.\n\nIn this example, we compare both the standard permutation and its conditional\nvariant approaches for variable importance on the diabetes dataset for the\nsingle-level case. The aim is to see if integrating the new\nstatistically-controlled solution has an impact on the results.\n\n## References\n.. footbibliography::\n"
]
},
{
@@ -22,14 +22,14 @@
},
"outputs": [],
"source": [
- "import matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn.datasets import load_diabetes\n\nfrom hidimstat.bbi import BlockBasedImportance\nfrom hidimstat import compute_loco\n\nplt.rcParams.update({\"font.size\": 14})\n\n# Fixing the random seed\nrng = np.random.RandomState(2024)\n\ndiabetes = load_diabetes()\nX, y = diabetes.data, diabetes.target\n\n# Use or not a cross-validation with the provided learner\nk_fold = 2\n# Identifying the categorical (nominal, binary & ordinal) variables\nvariables_categories = {}"
+ "import matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\nfrom scipy.stats import norm\nfrom sklearn.base import clone\nfrom sklearn.datasets import load_diabetes\nfrom sklearn.linear_model import RidgeCV\nfrom sklearn.metrics import r2_score, root_mean_squared_error\nfrom sklearn.model_selection import KFold\n\nfrom hidimstat.cpi import CPI\nfrom hidimstat.loco import LOCO\nfrom hidimstat.permutation_importance import PermutationImportance"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Standard Variable Importance\nTo apply the standard permutation, we use the implementation introduced by (Mi\net al., Nature, 2021) where the significance is measured by the mean of\n-log10(p_value). For this example, the inference estimator is set to the\nRandom Forest learner.\n\n\n"
+ "## Load the diabetes dataset\n\n"
]
},
{
@@ -40,14 +40,14 @@
},
"outputs": [],
"source": [
- "bbi_permutation = BlockBasedImportance(\n estimator=\"RF\",\n importance_estimator=\"residuals_RF\",\n do_hypertuning=True,\n dict_hypertuning=None,\n conditional=False,\n group_stacking=False,\n problem_type=\"regression\",\n k_fold=k_fold,\n variables_categories=variables_categories,\n n_jobs=2,\n verbose=0,\n n_permutations=100,\n)\nbbi_permutation.fit(X, y)\nprint(\"Computing the importance scores with standard permutation\")\nresults_permutation = bbi_permutation.compute_importance()\npvals_permutation = -np.log10(results_permutation[\"pval\"] + 1e-10)"
+ "diabetes = load_diabetes()\nX, y = diabetes.data, diabetes.target"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Conditional Variable Importance\nFor the conditional permutation importance based on the two blocks (inference\n+ importance), the estimators are set to the Random Forest learner. The\nsignificance is measured by the mean of -log10(p_value).\n\n\n"
+        "## Fit a baseline model on the diabetes dataset\nWe use a Ridge regression model with a 5-fold cross-validation to fit the\ndiabetes dataset.\n\n"
]
},
{
@@ -58,14 +58,14 @@
},
"outputs": [],
"source": [
- "bbi_conditional = BlockBasedImportance(\n estimator=\"RF\",\n importance_estimator=\"residuals_RF\",\n do_hypertuning=True,\n dict_hypertuning=None,\n conditional=True,\n group_stacking=False,\n problem_type=\"regression\",\n k_fold=k_fold,\n variables_categories=variables_categories,\n n_jobs=2,\n verbose=0,\n n_permutations=100,\n)\nbbi_conditional.fit(X, y)\nprint(\"Computing the importance scores with conditional permutation\")\nresults_conditional = bbi_conditional.compute_importance()\npvals_conditional = -np.log10(results_conditional[\"pval\"] + 1e-5)"
+        "n_folds = 5\nregressor = RidgeCV(alphas=np.logspace(-3, 3, 10))\nregressor_list = [clone(regressor) for _ in range(n_folds)]\nkf = KFold(n_splits=n_folds, shuffle=True, random_state=0)\nfor i, (train_index, test_index) in enumerate(kf.split(X)):\n    regressor_list[i].fit(X[train_index], y[train_index])\n    score = r2_score(\n        y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])\n    )\n    rmse = root_mean_squared_error(\n        y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])\n    )\n\n    print(f\"Fold {i} R2: {score:.3f}\")\n    print(f\"Fold {i} RMSE: {rmse:.3f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Leave-One-Covariate-Out (LOCO)\nWe compare the previous permutation-based approaches with a removal-based\napproach LOCO (Williamson et al., Journal of the American Statistical\nAssociation, 2021) where the variable of interest is removed and the inference\nestimator is retrained using the new features to compare the loss for any drop in the\nperformance.\n\n\n"
+        "## Refit the baseline model with 10-fold cross-validation\nWe now refit the model with a 10-fold cross-validation.\n\n"
]
},
{
@@ -76,14 +76,14 @@
},
"outputs": [],
"source": [
- "results_loco = compute_loco(X, y, use_dnn=False)\npvals_loco = -np.log10(results_loco[\"p_value\"] + 1e-5)"
+        "n_folds = 10\nregressor = RidgeCV(alphas=np.logspace(-3, 3, 10))\nregressor_list = [clone(regressor) for _ in range(n_folds)]\nkf = KFold(n_splits=n_folds, shuffle=True, random_state=0)\nfor i, (train_index, test_index) in enumerate(kf.split(X)):\n    regressor_list[i].fit(X[train_index], y[train_index])\n    score = r2_score(\n        y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])\n    )\n    rmse = root_mean_squared_error(\n        y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])\n    )\n\n    print(f\"Fold {i} R2: {score:.3f}\")\n    print(f\"Fold {i} RMSE: {rmse:.3f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Plotting the comparison\n\n"
+ "## Measure the importance of variables using the CPI method\n\n"
]
},
{
@@ -94,14 +94,79 @@
},
"outputs": [],
"source": [
- "list_res = {\"Permutation\": [], \"Conditional\": [], \"LOCO\": []}\nfor index, _ in enumerate(diabetes.feature_names):\n list_res[\"Permutation\"].append(pvals_permutation[index][0])\n list_res[\"Conditional\"].append(pvals_conditional[index][0])\n list_res[\"LOCO\"].append(pvals_loco[index])\n\nx = np.arange(len(diabetes.feature_names))\nwidth = 0.25 # the width of the bars\nmultiplier = 0\nfig, ax = plt.subplots(figsize=(10, 10), layout=\"constrained\")\n\nfor attribute, measurement in list_res.items():\n offset = width * multiplier\n rects = ax.bar(x + offset, measurement, width, label=attribute)\n multiplier += 1\n\nax.set_ylabel(r\"$-log_{10}p_{val}$\")\nax.set_xticks(x + width / 2, diabetes.feature_names)\nax.legend(loc=\"upper left\", ncols=3)\nax.set_ylim(0, 3)\nax.axhline(y=-np.log10(0.05), color=\"r\", linestyle=\"-\")\nplt.show()"
+ "cpi_importance_list = []\nfor i, (train_index, test_index) in enumerate(kf.split(X)):\n print(f\"Fold {i}\")\n X_train, X_test = X[train_index], X[test_index]\n y_train, y_test = y[train_index], y[test_index]\n cpi = CPI(\n estimator=regressor_list[i],\n imputation_model=RidgeCV(alphas=np.logspace(-3, 3, 10)),\n # covariate_estimator=HistGradientBoostingRegressor(random_state=0,),\n n_permutations=50,\n random_state=0,\n n_jobs=4,\n )\n cpi.fit(X_train, y_train)\n importance = cpi.score(X_test, y_test)\n cpi_importance_list.append(importance)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Analysis of the results\nWhile the standard permutation flags multiple variables to be significant for\nthis prediction, the conditional permutation (the controlled alternative)\nshows an agreement for \"bmi\", \"bp\" and \"s6\" but also highlights the importance\nof \"sex\" in this prediction, thus reducing the input space to four significant\nvariables. LOCO underlines the importance of one variable \"bp\" for this\nprediction problem.\n\n\n"
+ "## Measure the importance of variables using the LOCO method\n\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "loco_importance_list = []\n\nfor i, (train_index, test_index) in enumerate(kf.split(X)):\n print(f\"Fold {i}\")\n X_train, X_test = X[train_index], X[test_index]\n y_train, y_test = y[train_index], y[test_index]\n loco = LOCO(\n estimator=regressor_list[i],\n random_state=0,\n n_jobs=4,\n )\n loco.fit(X_train, y_train)\n importance = loco.score(X_test, y_test)\n loco_importance_list.append(importance)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Measure the importance of variables using the permutation method\n\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "pi_importance_list = []\n\nfor i, (train_index, test_index) in enumerate(kf.split(X)):\n print(f\"Fold {i}\")\n X_train, X_test = X[train_index], X[test_index]\n y_train, y_test = y[train_index], y[test_index]\n pi = PermutationImportance(\n estimator=regressor_list[i],\n n_permutations=50,\n random_state=0,\n n_jobs=4,\n )\n pi.fit(X_train, y_train)\n importance = pi.score(X_test, y_test)\n pi_importance_list.append(importance)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Define a function to compute the p-value from importance values\n\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+        "def compute_pval(vim):\n    mean_vim = np.mean(vim, axis=0)\n    std_vim = np.std(vim, axis=0)\n    # One-sided test of the mean fold-wise importance against zero\n    pval = norm.sf(mean_vim / std_vim)\n    return np.clip(pval, 1e-10, 1 - 1e-10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analyze the results\n\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+        "cpi_vim_arr = np.array([x[\"importance\"] for x in cpi_importance_list]) / 2\ncpi_pval = compute_pval(cpi_vim_arr)\n\nvim = [\n    pd.DataFrame(\n        {\n            \"var\": np.arange(cpi_vim_arr.shape[1]),\n            \"importance\": x[\"importance\"],\n            \"fold\": i,\n            \"pval\": cpi_pval,\n            \"method\": \"CPI\",\n        }\n    )\n    for i, x in enumerate(cpi_importance_list)\n]\n\nloco_vim_arr = np.array([x[\"importance\"] for x in loco_importance_list])\nloco_pval = compute_pval(loco_vim_arr)\n\nvim += [\n    pd.DataFrame(\n        {\n            \"var\": np.arange(loco_vim_arr.shape[1]),\n            \"importance\": x[\"importance\"],\n            \"fold\": i,\n            \"pval\": loco_pval,\n            \"method\": \"LOCO\",\n        }\n    )\n    for i, x in enumerate(loco_importance_list)\n]\n\npi_vim_arr = np.array([x[\"importance\"] for x in pi_importance_list])\npi_pval = compute_pval(pi_vim_arr)\n\nvim += [\n    pd.DataFrame(\n        {\n            \"var\": np.arange(pi_vim_arr.shape[1]),\n            \"importance\": x[\"importance\"],\n            \"fold\": i,\n            \"pval\": pi_pval,\n            \"method\": \"PI\",\n        }\n    )\n    for i, x in enumerate(pi_importance_list)\n]\n\nfig, ax = plt.subplots()\ndf_plot = pd.concat(vim)\ndf_plot[\"pval\"] = -np.log10(df_plot[\"pval\"])\nmethods = df_plot[\"method\"].unique()\ncolors = plt.cm.get_cmap(\"tab10\", 10)\n\nfor i, method in enumerate(methods):\n    subset = df_plot[df_plot[\"method\"] == method]\n    ax.bar(\n        subset[\"var\"] + i * 0.2,\n        subset[\"pval\"],\n        width=0.2,\n        label=method,\n        color=colors(i),\n    )\n\nax.legend(title=\"Method\")\nax.set_ylabel(r\"$-\\log_{10}(\\text{p-value})$\")\nax.axhline(-np.log10(0.05), color=\"tab:red\", ls=\"--\")\nax.set_xlabel(\"Variable\")\nax.set_xticks(np.arange(len(diabetes.feature_names)) + 0.2)\nax.set_xticklabels(diabetes.feature_names)\nplt.show()"
]
}
],
@@ -121,7 +186,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.6"
+ "version": "3.12.7"
}
},
"nbformat": 4,
diff --git a/docs/_downloads/e08c0f6d4aade0f0eaf8ba56dbbfd9c9/plot_2D_simulation_example.zip b/docs/_downloads/e08c0f6d4aade0f0eaf8ba56dbbfd9c9/plot_2D_simulation_example.zip
index 5a84c49..0bae569 100644
Binary files a/docs/_downloads/e08c0f6d4aade0f0eaf8ba56dbbfd9c9/plot_2D_simulation_example.zip and b/docs/_downloads/e08c0f6d4aade0f0eaf8ba56dbbfd9c9/plot_2D_simulation_example.zip differ
diff --git a/docs/_downloads/e11e159ab7d7ecf370a7f4df9ae35323/plot_residuals_sampling.zip b/docs/_downloads/e11e159ab7d7ecf370a7f4df9ae35323/plot_residuals_sampling.zip
deleted file mode 100644
index e700d2a..0000000
Binary files a/docs/_downloads/e11e159ab7d7ecf370a7f4df9ae35323/plot_residuals_sampling.zip and /dev/null differ
diff --git a/docs/_images/sphx_glr_plot_diabetes_variable_importance_example_001.png b/docs/_images/sphx_glr_plot_diabetes_variable_importance_example_001.png
index 97ebbf1..6374ad9 100644
Binary files a/docs/_images/sphx_glr_plot_diabetes_variable_importance_example_001.png and b/docs/_images/sphx_glr_plot_diabetes_variable_importance_example_001.png differ
diff --git a/docs/_images/sphx_glr_plot_diabetes_variable_importance_example_thumb.png b/docs/_images/sphx_glr_plot_diabetes_variable_importance_example_thumb.png
index 296f44a..f477b62 100644
Binary files a/docs/_images/sphx_glr_plot_diabetes_variable_importance_example_thumb.png and b/docs/_images/sphx_glr_plot_diabetes_variable_importance_example_thumb.png differ
diff --git a/docs/_images/sphx_glr_plot_residuals_sampling_001.png b/docs/_images/sphx_glr_plot_residuals_sampling_001.png
deleted file mode 100644
index 3caa165..0000000
Binary files a/docs/_images/sphx_glr_plot_residuals_sampling_001.png and /dev/null differ
diff --git a/docs/_images/sphx_glr_plot_residuals_sampling_thumb.png b/docs/_images/sphx_glr_plot_residuals_sampling_thumb.png
deleted file mode 100644
index dead343..0000000
Binary files a/docs/_images/sphx_glr_plot_residuals_sampling_thumb.png and /dev/null differ
diff --git a/docs/_sources/auto_examples/index.rst.txt b/docs/_sources/auto_examples/index.rst.txt
index c2a45bf..6e93cd3 100644
--- a/docs/_sources/auto_examples/index.rst.txt
+++ b/docs/_sources/auto_examples/index.rst.txt
@@ -34,23 +34,6 @@ Examples Gallery
-.. raw:: html
-
-
-
-.. only:: html
-
- .. image:: /auto_examples/images/thumb/sphx_glr_plot_diabetes_variable_importance_example_thumb.png
- :alt:
-
- :ref:`sphx_glr_auto_examples_plot_diabetes_variable_importance_example.py`
-
-.. raw:: html
-
-
Variable Importance on diabetes dataset
-
-
-
.. raw:: html
@@ -87,35 +70,35 @@ Examples Gallery
.. raw:: html
-
+
.. only:: html
- .. image:: /auto_examples/images/thumb/sphx_glr_plot_fmri_data_example_thumb.png
+ .. image:: /auto_examples/images/thumb/sphx_glr_plot_diabetes_variable_importance_example_thumb.png
:alt:
- :ref:`sphx_glr_auto_examples_plot_fmri_data_example.py`
+ :ref:`sphx_glr_auto_examples_plot_diabetes_variable_importance_example.py`
.. raw:: html
-
Support recovery on fMRI data
+
Variable Importance on diabetes dataset
.. raw:: html
-
+
.. only:: html
- .. image:: /auto_examples/images/thumb/sphx_glr_plot_residuals_sampling_thumb.png
+ .. image:: /auto_examples/images/thumb/sphx_glr_plot_fmri_data_example_thumb.png
:alt:
- :ref:`sphx_glr_auto_examples_plot_residuals_sampling.py`
+ :ref:`sphx_glr_auto_examples_plot_fmri_data_example.py`
.. raw:: html
-
Conditional sampling using residuals vs sampling Random Forest
+
Support recovery on fMRI data
@@ -130,11 +113,10 @@ Examples Gallery
:hidden:
/auto_examples/plot_dcrt_example
- /auto_examples/plot_diabetes_variable_importance_example
/auto_examples/plot_knockoff_aggregation
/auto_examples/plot_2D_simulation_example
+ /auto_examples/plot_diabetes_variable_importance_example
/auto_examples/plot_fmri_data_example
- /auto_examples/plot_residuals_sampling
.. only:: html
diff --git a/docs/_sources/auto_examples/plot_2D_simulation_example.rst.txt b/docs/_sources/auto_examples/plot_2D_simulation_example.rst.txt
index a568450..82f573f 100644
--- a/docs/_sources/auto_examples/plot_2D_simulation_example.rst.txt
+++ b/docs/_sources/auto_examples/plot_2D_simulation_example.rst.txt
@@ -554,9 +554,9 @@ randomization.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** (1 minutes 8.156 seconds)
+ **Total running time of the script:** (1 minutes 4.674 seconds)
-**Estimated memory usage:** 694 MB
+**Estimated memory usage:** 718 MB
.. _sphx_glr_download_auto_examples_plot_2D_simulation_example.py:
diff --git a/docs/_sources/auto_examples/plot_dcrt_example.rst.txt b/docs/_sources/auto_examples/plot_dcrt_example.rst.txt
index 7b4f063..f202cce 100644
--- a/docs/_sources/auto_examples/plot_dcrt_example.rst.txt
+++ b/docs/_sources/auto_examples/plot_dcrt_example.rst.txt
@@ -162,9 +162,9 @@ Plotting the comparison
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** (1 minutes 2.337 seconds)
+ **Total running time of the script:** (1 minutes 1.588 seconds)
-**Estimated memory usage:** 620 MB
+**Estimated memory usage:** 624 MB
.. _sphx_glr_download_auto_examples_plot_dcrt_example.py:
diff --git a/docs/_sources/auto_examples/plot_diabetes_variable_importance_example.rst.txt b/docs/_sources/auto_examples/plot_diabetes_variable_importance_example.rst.txt
index 93b1228..7c3ee7a 100644
--- a/docs/_sources/auto_examples/plot_diabetes_variable_importance_example.rst.txt
+++ b/docs/_sources/auto_examples/plot_diabetes_variable_importance_example.rst.txt
@@ -36,8 +36,8 @@ non-significant variables highly correlated with the significant ones and
creating fake significant variables. They introduced a solution for the Random
Forest estimator based on conditional sampling by performing sub-groups
permutation when bisecting the space using the conditioning variables of the
-buiding process. However, this solution is exclusive to the Random Forest and is
-costly with high-dimensional settings.
+building process. However, this solution is exclusive to the Random Forest and
+is costly in high-dimensional settings.
:footcite:t:`Chamma_NeurIPS2023` introduced a model-agnostic solution that
bypasses the limitations of the permutation approach through the use of
conditional schemes. The variable of interest contains two types of
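+
+As a quick illustration of this conditional scheme, the permuted version of a
+feature can be rebuilt from the residuals of an imputation model. The sketch
+below is hypothetical (it is not the hidimstat API) and assumes a numeric
+feature matrix ``X`` of shape (n, p):
+
+.. code-block:: Python
+
+    import numpy as np
+    from sklearn.linear_model import RidgeCV
+
+    def conditional_permutation(X, j, rng):
+        # Predict feature j from the remaining features
+        X_minus_j = np.delete(X, j, axis=1)
+        imputer = RidgeCV(alphas=np.logspace(-3, 3, 10))
+        x_hat_j = imputer.fit(X_minus_j, X[:, j]).predict(X_minus_j)
+        # Shuffling the residuals preserves the dependency of feature j on
+        # the other features while breaking its link with the outcome
+        residuals = X[:, j] - x_hat_j
+        X_perm = X.copy()
+        X_perm[:, j] = x_hat_j + rng.permutation(residuals)
+        return X_perm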
@@ -64,30 +64,43 @@ References
Imports needed for this script
------------------------------
-.. GENERATED FROM PYTHON SOURCE LINES 48-69
+.. GENERATED FROM PYTHON SOURCE LINES 48-62
.. code-block:: Python
-
import matplotlib.pyplot as plt
import numpy as np
+ import pandas as pd
+ from scipy.stats import norm
+ from sklearn.base import clone
from sklearn.datasets import load_diabetes
+ from sklearn.linear_model import RidgeCV
+ from sklearn.metrics import r2_score, root_mean_squared_error
+ from sklearn.model_selection import KFold
+
+ from hidimstat.cpi import CPI
+ from hidimstat.loco import LOCO
+ from hidimstat.permutation_importance import PermutationImportance
+
+
+
+
+
+
+
- from hidimstat.bbi import BlockBasedImportance
- from hidimstat import compute_loco
+.. GENERATED FROM PYTHON SOURCE LINES 63-65
- plt.rcParams.update({"font.size": 14})
+Load the diabetes dataset
+------------------------------
+
+.. GENERATED FROM PYTHON SOURCE LINES 65-68
- # Fixing the random seed
- rng = np.random.RandomState(2024)
+.. code-block:: Python
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
- # Use or not a cross-validation with the provided learner
- k_fold = 2
- # Identifying the categorical (nominal, binary & ordinal) variables
- variables_categories = {}
@@ -95,40 +108,82 @@ Imports needed for this script
+.. GENERATED FROM PYTHON SOURCE LINES 69-73
+
+Fit a baseline model on the diabetes dataset
+--------------------------------------------
+We use a Ridge regression model with 5-fold cross-validation to fit the
+diabetes dataset.
+
+.. GENERATED FROM PYTHON SOURCE LINES 73-89
+
+.. code-block:: Python
+
+
+ n_folds = 5
+ regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
+ regressor_list = [clone(regressor) for _ in range(n_folds)]
+ kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
+ for i, (train_index, test_index) in enumerate(kf.split(X)):
+ regressor_list[i].fit(X[train_index], y[train_index])
+ score = r2_score(
+ y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
+ )
+        rmse = root_mean_squared_error(
+            y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
+        )
+
+        print(f"Fold {i}: {score}")
+        print(f"Fold {i}: {rmse}")
+
+
+
+
+.. rst-class:: sphx-glr-script-out
+
+ .. code-block:: none
+
+ Fold 0: 0.33092885980301545
+ Fold 0: 58.57429457038258
+ Fold 1: 0.46114740610001137
+ Fold 1: 53.69518424561582
+ Fold 2: 0.5333394029342122
+ Fold 2: 54.666537166030764
+ Fold 3: 0.5048253747874585
+ Fold 3: 54.3633702441001
+ Fold 4: 0.5979566135054368
+ Fold 4: 52.287948367546456
+
-.. GENERATED FROM PYTHON SOURCE LINES 70-77
-Standard Variable Importance
-----------------------------
-To apply the standard permutation, we use the implementation introduced by (Mi
-et al., Nature, 2021) where the significance is measured by the mean of
--log10(p_value). For this example, the inference estimator is set to the
-Random Forest learner.
+.. GENERATED FROM PYTHON SOURCE LINES 90-94
-.. GENERATED FROM PYTHON SOURCE LINES 77-97
+Refit the baseline model with 10-fold cross-validation
+-------------------------------------------------------
+We now refit the Ridge regression model with 10-fold cross-validation on the
+diabetes dataset.
+
+.. GENERATED FROM PYTHON SOURCE LINES 94-111
.. code-block:: Python
- bbi_permutation = BlockBasedImportance(
- estimator="RF",
- importance_estimator="residuals_RF",
- do_hypertuning=True,
- dict_hypertuning=None,
- conditional=False,
- group_stacking=False,
- problem_type="regression",
- k_fold=k_fold,
- variables_categories=variables_categories,
- n_jobs=2,
- verbose=0,
- n_permutations=100,
- )
- bbi_permutation.fit(X, y)
- print("Computing the importance scores with standard permutation")
- results_permutation = bbi_permutation.compute_importance()
- pvals_permutation = -np.log10(results_permutation["pval"] + 1e-10)
+ n_folds = 10
+ regressor = RidgeCV(alphas=np.logspace(-3, 3, 10))
+ regressor_list = [clone(regressor) for _ in range(n_folds)]
+ kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
+ for i, (train_index, test_index) in enumerate(kf.split(X)):
+ regressor_list[i].fit(X[train_index], y[train_index])
+ score = r2_score(
+ y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
+ )
+        rmse = root_mean_squared_error(
+            y_true=y[test_index], y_pred=regressor_list[i].predict(X[test_index])
+        )
+
+        print(f"Fold {i}: {score}")
+        print(f"Fold {i}: {rmse}")
@@ -138,45 +193,104 @@ Random Forest learner.
.. code-block:: none
- Processing: 1
- Processing: 2
- Computing the importance scores with standard permutation
+ Fold 0: 0.34873505091371093
+ Fold 0: 56.151600911163946
+ Fold 1: 0.2722061930940729
+ Fold 1: 61.35323376775815
+ Fold 2: 0.5316220308691071
+ Fold 2: 49.33374807702299
+ Fold 3: 0.36967014548640154
+ Fold 3: 59.01492149527525
+ Fold 4: 0.5858181205553151
+ Fold 4: 51.479244269131684
+ Fold 5: 0.46246416851449
+ Fold 5: 58.3119253313517
+ Fold 6: 0.5235794267120801
+ Fold 6: 51.352999936251216
+ Fold 7: 0.48683083150894546
+ Fold 7: 56.75372711094103
+ Fold 8: 0.665318862395647
+ Fold 8: 47.26090648721779
+ Fold 9: 0.5514585816057873
+ Fold 9: 55.7236176985951
+
+
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 112-114
+
+Measure the importance of variables using the CPI method
+---------------------------------------------------------
+
+.. GENERATED FROM PYTHON SOURCE LINES 114-132
+.. code-block:: Python
+ cpi_importance_list = []
+ for i, (train_index, test_index) in enumerate(kf.split(X)):
+ print(f"Fold {i}")
+ X_train, X_test = X[train_index], X[test_index]
+ y_train, y_test = y[train_index], y[test_index]
+ cpi = CPI(
+ estimator=regressor_list[i],
+ imputation_model=RidgeCV(alphas=np.logspace(-3, 3, 10)),
+ # covariate_estimator=HistGradientBoostingRegressor(random_state=0,),
+ n_permutations=50,
+ random_state=0,
+ n_jobs=4,
+ )
+ cpi.fit(X_train, y_train)
+ importance = cpi.score(X_test, y_test)
+ cpi_importance_list.append(importance)
-.. GENERATED FROM PYTHON SOURCE LINES 98-104
-Conditional Variable Importance
--------------------------------
-For the conditional permutation importance based on the two blocks (inference
-+ importance), the estimators are set to the Random Forest learner. The
-significance is measured by the mean of -log10(p_value).
-.. GENERATED FROM PYTHON SOURCE LINES 104-124
+
+.. rst-class:: sphx-glr-script-out
+
+ .. code-block:: none
+
+ Fold 0
+ Fold 1
+ Fold 2
+ Fold 3
+ Fold 4
+ Fold 5
+ Fold 6
+ Fold 7
+ Fold 8
+ Fold 9
+
+
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 133-135
+
+Measure the importance of variables using the LOCO method
+----------------------------------------------------------
+
+.. GENERATED FROM PYTHON SOURCE LINES 135-152
.. code-block:: Python
- bbi_conditional = BlockBasedImportance(
- estimator="RF",
- importance_estimator="residuals_RF",
- do_hypertuning=True,
- dict_hypertuning=None,
- conditional=True,
- group_stacking=False,
- problem_type="regression",
- k_fold=k_fold,
- variables_categories=variables_categories,
- n_jobs=2,
- verbose=0,
- n_permutations=100,
- )
- bbi_conditional.fit(X, y)
- print("Computing the importance scores with conditional permutation")
- results_conditional = bbi_conditional.compute_importance()
- pvals_conditional = -np.log10(results_conditional["pval"] + 1e-5)
+ loco_importance_list = []
+
+ for i, (train_index, test_index) in enumerate(kf.split(X)):
+ print(f"Fold {i}")
+ X_train, X_test = X[train_index], X[test_index]
+ y_train, y_test = y[train_index], y[test_index]
+ loco = LOCO(
+ estimator=regressor_list[i],
+ random_state=0,
+ n_jobs=4,
+ )
+ loco.fit(X_train, y_train)
+ importance = loco.score(X_test, y_test)
+ loco_importance_list.append(importance)
+
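+
+Conceptually, LOCO reduces to refitting the estimator without the feature of
+interest and measuring the increase in test loss. The following sketch is a
+hypothetical simplification (not the hidimstat implementation), assuming
+``estimator`` is already fitted on the full training data and a squared loss:
+
+.. code-block:: Python
+
+    import numpy as np
+    from sklearn.base import clone
+    from sklearn.metrics import mean_squared_error
+
+    def loco_importance(estimator, X_train, y_train, X_test, y_test, j):
+        # Refit a clone of the estimator without column j
+        reduced = clone(estimator).fit(np.delete(X_train, j, axis=1), y_train)
+        loss_full = mean_squared_error(y_test, estimator.predict(X_test))
+        loss_reduced = mean_squared_error(
+            y_test, reduced.predict(np.delete(X_test, j, axis=1))
+        )
+        # A positive value means the model degrades without the feature
+        return loss_reduced - loss_full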
@@ -186,31 +300,46 @@ significance is measured by the mean of -log10(p_value).
.. code-block:: none
- Processing: 1
- Processing: 2
- Computing the importance scores with conditional permutation
+ Fold 0
+ Fold 1
+ Fold 2
+ Fold 3
+ Fold 4
+ Fold 5
+ Fold 6
+ Fold 7
+ Fold 8
+ Fold 9
-.. GENERATED FROM PYTHON SOURCE LINES 125-133
+.. GENERATED FROM PYTHON SOURCE LINES 153-155
-Leave-One-Covariate-Out (LOCO)
+Measure the importance of variables using the permutation method
-------------------------------
+-----------------------------------------------------------------
-We compare the previous permutation-based approaches with a removal-based
-approach LOCO (Williamson et al., Journal of the American Statistical
-Association, 2021) where the variable of interest is removed and the inference
-estimator is retrained using the new features to compare the loss for any drop in the
-performance.
-
-.. GENERATED FROM PYTHON SOURCE LINES 133-137
+.. GENERATED FROM PYTHON SOURCE LINES 155-173
.. code-block:: Python
- results_loco = compute_loco(X, y, use_dnn=False)
- pvals_loco = -np.log10(results_loco["p_value"] + 1e-5)
+ pi_importance_list = []
+
+ for i, (train_index, test_index) in enumerate(kf.split(X)):
+ print(f"Fold {i}")
+ X_train, X_test = X[train_index], X[test_index]
+ y_train, y_test = y[train_index], y[test_index]
+ pi = PermutationImportance(
+ estimator=regressor_list[i],
+ n_permutations=50,
+ random_state=0,
+ n_jobs=4,
+ )
+ pi.fit(X_train, y_train)
+ importance = pi.score(X_test, y_test)
+ pi_importance_list.append(importance)
+
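+
+By contrast with CPI, standard permutation importance shuffles the column of
+interest directly, ignoring its correlation with the other features. A minimal
+hypothetical sketch (not the hidimstat implementation), assuming a fitted
+``estimator`` and a squared loss:
+
+.. code-block:: Python
+
+    import numpy as np
+    from sklearn.metrics import mean_squared_error
+
+    def permutation_importance_j(estimator, X_test, y_test, j, rng, n_permutations=50):
+        loss_ref = mean_squared_error(y_test, estimator.predict(X_test))
+        losses = []
+        for _ in range(n_permutations):
+            X_perm = X_test.copy()
+            # Permuting the column breaks the feature/outcome relationship
+            X_perm[:, j] = rng.permutation(X_perm[:, j])
+            losses.append(mean_squared_error(y_test, estimator.predict(X_perm)))
+        # Average loss increase over the permutations
+        return np.mean(losses) - loss_ref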
@@ -220,83 +349,160 @@ performance.
.. code-block:: none
- Processing col: 1
- Processing col: 2
- Processing col: 3
- Processing col: 4
- Processing col: 5
- Processing col: 6
- Processing col: 7
- Processing col: 8
- Processing col: 9
- Processing col: 10
+ Fold 0
+ Fold 1
+ Fold 2
+ Fold 3
+ Fold 4
+ Fold 5
+ Fold 6
+ Fold 7
+ Fold 8
+ Fold 9
-.. GENERATED FROM PYTHON SOURCE LINES 138-140
+.. GENERATED FROM PYTHON SOURCE LINES 174-176
-Plotting the comparison
------------------------
+Define a function to compute the p-value from importance values
+----------------------------------------------------------------
-.. GENERATED FROM PYTHON SOURCE LINES 140-164
+.. GENERATED FROM PYTHON SOURCE LINES 176-192
.. code-block:: Python
+ def compute_pval(vim):
+ mean_vim = np.mean(vim, axis=0)
+ std_vim = np.std(vim, axis=0)
+ pval = norm.sf(mean_vim / std_vim)
+ return np.clip(pval, 1e-10, 1 - 1e-10)
- list_res = {"Permutation": [], "Conditional": [], "LOCO": []}
- for index, _ in enumerate(diabetes.feature_names):
- list_res["Permutation"].append(pvals_permutation[index][0])
- list_res["Conditional"].append(pvals_conditional[index][0])
- list_res["LOCO"].append(pvals_loco[index])
- x = np.arange(len(diabetes.feature_names))
- width = 0.25 # the width of the bars
- multiplier = 0
- fig, ax = plt.subplots(figsize=(10, 10), layout="constrained")
- for attribute, measurement in list_res.items():
- offset = width * multiplier
- rects = ax.bar(x + offset, measurement, width, label=attribute)
- multiplier += 1
- ax.set_ylabel(r"$-log_{10}p_{val}$")
- ax.set_xticks(x + width / 2, diabetes.feature_names)
- ax.legend(loc="upper left", ncols=3)
- ax.set_ylim(0, 3)
- ax.axhline(y=-np.log10(0.05), color="r", linestyle="-")
- plt.show()
+
+
+
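+A quick usage sketch with synthetic numbers (hypothetical values, not taken
+from this example run), reusing ``compute_pval`` and the imports above: a
+variable whose importance is consistently positive across folds receives a
+small p-value, while a null variable stays near 0.5.
+
+.. code-block:: Python
+
+    rng = np.random.default_rng(0)
+    vim = np.column_stack([
+        rng.normal(1.0, 0.2, size=10),  # consistently positive importance
+        rng.normal(0.0, 0.2, size=10),  # null variable
+    ])
+    print(compute_pval(vim))  # first entry near 0, second near 0.5
+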
+.. GENERATED FROM PYTHON SOURCE LINES 193-195
+
+Analyze the results
+------------------------------
+
+.. GENERATED FROM PYTHON SOURCE LINES 195-267
+
+.. code-block:: Python
+
+
+
+ cpi_vim_arr = np.array([x["importance"] for x in cpi_importance_list]) / 2
+ cpi_pval = compute_pval(cpi_vim_arr)
+
+ vim = [
+ pd.DataFrame(
+ {
+ "var": np.arange(cpi_vim_arr.shape[1]),
+ "importance": x["importance"],
+ "fold": i,
+ "pval": cpi_pval,
+ "method": "CPI",
+ }
+ )
+ for x in cpi_importance_list
+ ]
+
+ loco_vim_arr = np.array([x["importance"] for x in loco_importance_list])
+ loco_pval = compute_pval(loco_vim_arr)
+
+ vim += [
+ pd.DataFrame(
+ {
+ "var": np.arange(loco_vim_arr.shape[1]),
+ "importance": x["importance"],
+ "fold": i,
+ "pval": loco_pval,
+ "method": "LOCO",
+ }
+ )
+ for x in loco_importance_list
+ ]
+
+ pi_vim_arr = np.array([x["importance"] for x in pi_importance_list])
+ pi_pval = compute_pval(pi_vim_arr)
+
+ vim += [
+ pd.DataFrame(
+ {
+ "var": np.arange(pi_vim_arr.shape[1]),
+ "importance": x["importance"],
+ "fold": i,
+ "pval": pi_pval,
+ "method": "PI",
+ }
+ )
+ for x in pi_importance_list
+ ]
+
+ fig, ax = plt.subplots()
+ df_plot = pd.concat(vim)
+ df_plot["pval"] = -np.log10(df_plot["pval"])
+ methods = df_plot["method"].unique()
+    colors = plt.get_cmap("tab10", 10)
+
+ for i, method in enumerate(methods):
+ subset = df_plot[df_plot["method"] == method]
+ ax.bar(
+ subset["var"] + i * 0.2,
+ subset["pval"],
+ width=0.2,
+ label=method,
+ color=colors(i),
+ )
+
+ ax.legend(title="Method")
+ ax.set_ylabel(r"$-\log_{10}(\text{p-value})$")
+ ax.axhline(-np.log10(0.05), color="tab:red", ls="--")
+    ax.set_xlabel("Variable")
+    ax.set_xticks(np.arange(len(diabetes.feature_names)) + 0.2)
+    ax.set_xticklabels(diabetes.feature_names)
+ plt.show()
+
+
+
.. image-sg:: /auto_examples/images/sphx_glr_plot_diabetes_variable_importance_example_001.png
:alt: plot diabetes variable importance example
:srcset: /auto_examples/images/sphx_glr_plot_diabetes_variable_importance_example_001.png
:class: sphx-glr-single-img
-.. GENERATED FROM PYTHON SOURCE LINES 165-174
-Analysis of the results
------------------------
-While the standard permutation flags multiple variables to be significant for
-this prediction, the conditional permutation (the controlled alternative)
-shows an agreement for "bmi", "bp" and "s6" but also highlights the importance
-of "sex" in this prediction, thus reducing the input space to four significant
-variables. LOCO underlines the importance of one variable "bp" for this
-prediction problem.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** (0 minutes 54.114 seconds)
+ **Total running time of the script:** (0 minutes 8.146 seconds)
-**Estimated memory usage:** 621 MB
+**Estimated memory usage:** 609 MB
.. _sphx_glr_download_auto_examples_plot_diabetes_variable_importance_example.py:
diff --git a/docs/_sources/auto_examples/plot_fmri_data_example.rst.txt b/docs/_sources/auto_examples/plot_fmri_data_example.rst.txt
index 435594c..f28a5f4 100644
--- a/docs/_sources/auto_examples/plot_fmri_data_example.rst.txt
+++ b/docs/_sources/auto_examples/plot_fmri_data_example.rst.txt
@@ -184,8 +184,10 @@ You may choose a subject in [1, 2, 3, 4, 5, 6]. By default subject=2.
Downloading data from http://data.pymvpa.org/datasets/haxby2001/MD5SUMS ...
...done. (0 seconds, 0 min)
Downloading data from http://data.pymvpa.org/datasets/haxby2001/subj2-2010.01.14.tar.gz ...
-
Downloaded 20578304 of 291168628 bytes (7.1%, 13.8s remaining)
Downloaded 60334080 of 291168628 bytes (20.7%, 7.9s remaining)
Downloaded 104456192 of 291168628 bytes (35.9%, 5.5s remaining)
Downloaded 149889024 of 291168628 bytes (51.5%, 3.8s remaining)
Downloaded 195772416 of 291168628 bytes (67.2%, 2.5s remaining)
Downloaded 242360320 of 291168628 bytes (83.2%, 1.2s remaining)
Downloaded 287531008 of 291168628 bytes (98.8%, 0.1s remaining) ...done. (7 seconds, 0 min)
+
Downloaded 77365248 of 291168628 bytes (26.6%, 2.8s remaining)
Downloaded 176619520 of 291168628 bytes (60.7%, 1.3s remaining)
Downloaded 274890752 of 291168628 bytes (94.4%, 0.2s remaining) ...done. (3 seconds, 0 min)
Extracting data from /home/runner/nilearn_data/haxby2001/def37a305edfda829916fa14c9ea08f8/subj2-2010.01.14.tar.gz..... done.
+ /opt/hostedtoolcache/Python/3.12.7/x64/lib/python3.12/site-packages/nilearn/image/resampling.py:492: UserWarning: The provided image has no sform in its header. Please check the provided file. Results may not be as expected.
+ warnings.warn(
@@ -282,7 +284,7 @@ Now, we compute p-values thanks to permutation tests applied to
.. code-block:: none
[Parallel(n_jobs=1)]: Done 49 tasks | elapsed: 1.6s
- [Parallel(n_jobs=1)]: Done 199 tasks | elapsed: 6.6s
+ [Parallel(n_jobs=1)]: Done 199 tasks | elapsed: 6.4s
@@ -330,6 +332,8 @@ and high-dimensional inference (c.f. References).
.. code-block:: none
Clustered inference: n_clusters = 500, inference method = desparsified-lasso, seed = 0
+ /opt/hostedtoolcache/Python/3.12.7/x64/lib/python3.12/site-packages/sklearn/linear_model/_coordinate_descent.py:683: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.7422256933791083, tolerance: 0.21600000000000003
+ model = cd_fast.enet_coordinate_descent_gram(
@@ -361,7 +365,7 @@ However you might benefit from clustering randomization taking
.. code-block:: none
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
- [Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 35.8s finished
+ [Parallel(n_jobs=2)]: Done 5 out of 5 | elapsed: 34.8s finished
@@ -563,6 +567,13 @@ called `plot_map` that wraps all these steps.
:class: sphx-glr-multi-img
+.. rst-class:: sphx-glr-script-out
+
+ .. code-block:: none
+
+ /opt/hostedtoolcache/Python/3.12.7/x64/lib/python3.12/site-packages/nilearn/plotting/displays/_slicers.py:308: UserWarning: empty mask
+ ims = self._map_show(img, type="imshow", threshold=threshold, **kwargs)
+
@@ -599,9 +610,9 @@ spurious discoveries.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** (1 minutes 33.832 seconds)
+ **Total running time of the script:** (1 minutes 29.048 seconds)
-**Estimated memory usage:** 3374 MB
+**Estimated memory usage:** 3156 MB
.. _sphx_glr_download_auto_examples_plot_fmri_data_example.py:
diff --git a/docs/_sources/auto_examples/plot_knockoff_aggregation.rst.txt b/docs/_sources/auto_examples/plot_knockoff_aggregation.rst.txt
index b33738a..0ae2b67 100644
--- a/docs/_sources/auto_examples/plot_knockoff_aggregation.rst.txt
+++ b/docs/_sources/auto_examples/plot_knockoff_aggregation.rst.txt
@@ -205,9 +205,9 @@ Imports needed for this script
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** (5 minutes 47.064 seconds)
+ **Total running time of the script:** (5 minutes 27.608 seconds)
-**Estimated memory usage:** 753 MB
+**Estimated memory usage:** 743 MB
.. _sphx_glr_download_auto_examples_plot_knockoff_aggregation.py:
diff --git a/docs/_sources/auto_examples/plot_residuals_sampling.rst.txt b/docs/_sources/auto_examples/plot_residuals_sampling.rst.txt
deleted file mode 100644
index 6ef37b3..0000000
--- a/docs/_sources/auto_examples/plot_residuals_sampling.rst.txt
+++ /dev/null
@@ -1,352 +0,0 @@
-
-.. DO NOT EDIT.
-.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
-.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
-.. "auto_examples/plot_residuals_sampling.py"
-.. LINE NUMBERS ARE GIVEN BELOW.
-
-.. only:: html
-
- .. note::
- :class: sphx-glr-download-link-note
-
- :ref:`Go to the end `
- to download the full example code.
-
-.. rst-class:: sphx-glr-example-title
-
-.. _sphx_glr_auto_examples_plot_residuals_sampling.py:
-
-
-Conditional sampling using residuals vs sampling Random Forest
-==============================================================
-
-To deploy the Conditional Permutation Importance (CPI),
-:footcite:t:`Chamma_NeurIPS2023` described two main approaches for the
-conditional scheme: 1) Instead of directly permuting the variable of interest as
-in the Permutation Feature Importance (PFI), the residuals of the prediction of
-the variable of interest x_j based on the remaining variables is first computed
-along with a predicted version x_hat_j. These residuals are shuffled and added
-to the predicted version to recreate the variable of interest (Preserving the
-dependency between the variable of interest and the remaining variables while
-breaking the relationship with the outcome). 2) Another option is to use the
-sampling Random Forest. Using the remaining variables to predict the variable of
-interest, and instead of predicting the variable of interest as the mean of the
-instances' outcome of the targeted leaf or the class with the most occurences,
-we sample from the same leaf of the instance of interest within its neighbors,
-and we follow the standard path of the Random Forest.
-
-References
-----------
-.. footbibliography::
-
-.. GENERATED FROM PYTHON SOURCE LINES 27-29
-
-Imports needed for this script
-------------------------------
-
-.. GENERATED FROM PYTHON SOURCE LINES 29-50
-
-.. code-block:: Python
-
-
- from hidimstat import BlockBasedImportance
- from joblib import Parallel, delayed
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- from scipy.linalg import cholesky
- from scipy.stats import norm
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.metrics import roc_auc_score
- import time
-
- n, p = (100, 12)
- inter_cor, intra_cor = (0, 0.85)
- n_blocks = 1
- n_signal = 2
- problem_type = "regression"
- snr = 4
- rf = RandomForestRegressor(random_state=2023)
- dict_hyper = {"max_depth": [2, 5, 10, 20]}
-
-
-
-
-
-
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 51-64
-
-Generate the synthetic data
----------------------------
-The function below generates the correlation matrix between the variables
-according to the provided degrees of correlation (intra + inter). `inter_cor`
-indicates the degree of correlation between the variables/groups whereas
-`intra_cor` specifies the corresponding degree between the variables within
-each group. For the single-level case, `n_blocks` is set to 1 and the
-`intra_cor` is the unique correlation between variables.
-
-Next, we generate the synthetic data by randomly drawing n_signal predictors
-from the corresponding p variables and reordering the set of variables to put the
-n_signal predictors at the beginning. Following, the response is generated
-under a simple linear model with Gaussian noise.
-
-.. GENERATED FROM PYTHON SOURCE LINES 64-116
-
-.. code-block:: Python
-
-
-
- def generate_cor_blocks(p, inter_cor, intra_cor, n_blocks):
- vars_per_grp = int(p / n_blocks)
- cor_mat = np.zeros((p, p))
- cor_mat.fill(inter_cor)
- for i in range(n_blocks):
- cor_mat[
- (i * vars_per_grp) : ((i + 1) * vars_per_grp),
- (i * vars_per_grp) : ((i + 1) * vars_per_grp),
- ] = intra_cor
- np.fill_diagonal(cor_mat, 1)
- return cor_mat
-
-
- def _generate_data(seed):
- rng = np.random.RandomState(seed)
-
- cor_mat = generate_cor_blocks(p, inter_cor, intra_cor, n_blocks)
- x = norm.rvs(size=(p, n), random_state=seed)
- c = cholesky(cor_mat, lower=True)
- X = pd.DataFrame(np.dot(c, x).T, columns=[str(i) for i in np.arange(p)])
-
- data = X.copy()
-
- # Randomly draw n_signal predictors which are defined as signal predictors
- indices_var = list(rng.choice(range(data.shape[1]), size=n_signal, replace=False))
-
- # Reorder data matrix so that first n_signal predictors are the signal predictors
- # List of remaining indices
- indices_rem = [ind for ind in range(data.shape[1]) if ind not in indices_var]
- total_indices = indices_var + indices_rem
- # Before including the non-linear effects
- data = data.iloc[:, total_indices]
- data_signal = data.iloc[:, np.arange(n_signal)]
-
- # Determine beta coefficients
- effectset = [-0.5, -1, -2, -3, 0.5, 1, 2, 3]
- beta = rng.choice(effectset, size=data_signal.shape[1], replace=True)
-
- # Generate response
- # The product of the signal predictors with the beta coefficients
- prod_signal = np.dot(data_signal, beta)
-
- sigma_noise = np.linalg.norm(prod_signal, ord=2) / (
- snr * np.sqrt(data_signal.shape[0])
- )
- y = prod_signal + sigma_noise * rng.normal(size=prod_signal.shape[0])
-
- return data, y
-
-
-
-
-
-
-
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 117-122
-
-Processing across multiple permutations
----------------------------------------
-In order to get statistical significance with p-values, we run the experiments
-across 10 repetitions.
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 122-188
-
-.. code-block:: Python
-
-
-
- def compute_simulations(seed):
- X, y = _generate_data(seed)
- # Using the residuals
- start_residuals = time.time()
- bbi_residual = BlockBasedImportance(
- estimator="RF",
- importance_estimator="residuals_RF",
- do_hypertuning=True,
- dict_hypertuning=None,
- conditional=True,
- n_permutations=10,
- n_jobs=2,
- problem_type="regression",
- k_fold=2,
- variables_categories={},
- )
- bbi_residual.fit(X, y)
- results_bbi_residual = bbi_residual.compute_importance()
-
- df_residuals = {}
- df_residuals["method"] = ["residuals"] * X.shape[1]
- df_residuals["score"] = [results_bbi_residual["score_R2"]] * X.shape[1]
- df_residuals["elapsed"] = [time.time() - start_residuals] * X.shape[1]
- df_residuals["importance"] = np.ravel(results_bbi_residual["importance"])
- df_residuals["p-value"] = np.ravel(results_bbi_residual["pval"])
- df_residuals["iteration"] = [seed] * X.shape[1]
- df_residuals = pd.DataFrame(df_residuals)
-
- # Using the sampling RF
- start_sampling = time.time()
- bbi_sampling = BlockBasedImportance(
- estimator="RF",
- importance_estimator="sampling_RF",
- do_hypertuning=True,
- dict_hypertuning=None,
- conditional=True,
- n_permutations=10,
- n_jobs=2,
- problem_type="regression",
- k_fold=2,
- variables_categories={},
- )
- bbi_sampling.fit(X, y)
- results_bbi_sampling = bbi_sampling.compute_importance()
-
- df_sampling = {}
- df_sampling["method"] = ["sampling"] * X.shape[1]
- df_sampling["score"] = [results_bbi_sampling["score_R2"]] * X.shape[1]
- df_sampling["elapsed"] = [time.time() - start_sampling] * X.shape[1]
- df_sampling["importance"] = np.ravel(results_bbi_sampling["importance"])
- df_sampling["p-value"] = np.ravel(results_bbi_sampling["pval"])
- df_sampling["iteration"] = [seed] * X.shape[1]
- df_sampling = pd.DataFrame(df_sampling)
-
- df_final = pd.concat([df_residuals, df_sampling], axis=0)
- return df_final
-
-
- # Running across 10 repetitions
- parallel = Parallel(n_jobs=2, verbose=1)
- final_result = parallel(
- delayed(compute_simulations)(seed=seed) for seed in np.arange(1, 11)
- )
-
-
-
-
-
-.. rst-class:: sphx-glr-script-out
-
- .. code-block:: none
-
- [Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
- [Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 4.4min finished
-
-
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 189-197
-
-Plotting AUC score and Type-I error
------------------------------------
-With the prediction problems turns to be a binary classification problem for
-the variables being relevant or non-relevant vs the ground-truth, we measure
-the performance in terms of type-I error i.e. the rate of true non-relevant
-variables detected as relevant and AUC score related to correct significant
-variables ordering.
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 197-232
-
-.. code-block:: Python
-
-
- df_final_result = pd.concat(final_result, axis=0).reset_index(drop=True)
- df_auc = df_final_result.groupby(by=["method", "iteration"]).apply(
- lambda x: roc_auc_score([1] * n_signal + [0] * (p - n_signal), -x["p-value"])
- )
- df_auc = df_auc.reset_index(name="auc")
- df_type_I = df_final_result.groupby(by=["method", "iteration"]).apply(
- lambda x: sum(x.iloc[n_signal:, :]["p-value"] <= 0.05) / x.iloc[2:, :].shape[0]
- )
- df_type_I = df_type_I.reset_index(name="type-I")
-
- auc = [
- np.array(df_auc["auc"])[: int(df_auc.shape[0] / 2)],
- np.array(df_auc["auc"])[int(df_auc.shape[0] / 2) :],
- ]
- typeI_error = [
- np.array(df_type_I["type-I"])[: int(df_type_I.shape[0] / 2)],
- np.array(df_type_I["type-I"])[int(df_type_I.shape[0] / 2) :],
- ]
-
- fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), sharey=True)
-
- # AUC score
- axs[0].violinplot(auc, showmeans=False, showmedians=True, vert=False)
- axs[0].set_title("AUC score")
- axs[0].xaxis.grid(True)
- axs[0].set_yticks([x + 1 for x in range(len(auc))], labels=["Residuals", "Sampling"])
- axs[0].set_ylabel("Method")
-
- # Type-I Error
- axs[1].violinplot(typeI_error, showmeans=False, showmedians=True, vert=False)
- axs[1].set_title("Type-I Error")
- axs[1].axvline(x=0.05, color="r", label="Nominal Rate")
- plt.show()
-
-
-
-
-.. image-sg:: /auto_examples/images/sphx_glr_plot_residuals_sampling_001.png
- :alt: AUC score, Type-I Error
- :srcset: /auto_examples/images/sphx_glr_plot_residuals_sampling_001.png
- :class: sphx-glr-single-img
-
-
-
-
-
-.. GENERATED FROM PYTHON SOURCE LINES 233-238
-
-Analysis of the results
------------------------
-We can observe that the sampling approaches'performance is almost similar to
-that of the residuals. Sampling accelerates the conditional importance
-computation by simplifying the residuals steps.
-
-
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** (4 minutes 25.722 seconds)
-
-**Estimated memory usage:** 608 MB
-
-
-.. _sphx_glr_download_auto_examples_plot_residuals_sampling.py:
-
-.. only:: html
-
- .. container:: sphx-glr-footer sphx-glr-footer-example
-
- .. container:: sphx-glr-download sphx-glr-download-jupyter
-
- :download:`Download Jupyter notebook: plot_residuals_sampling.ipynb `
-
- .. container:: sphx-glr-download sphx-glr-download-python
-
- :download:`Download Python source code: plot_residuals_sampling.py `
-
- .. container:: sphx-glr-download sphx-glr-download-zip
-
- :download:`Download zipped: plot_residuals_sampling.zip `
-
-
-.. only:: html
-
- .. rst-class:: sphx-glr-signature
-
- `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/_sources/auto_examples/sg_execution_times.rst.txt b/docs/_sources/auto_examples/sg_execution_times.rst.txt
index 2494368..3383064 100644
--- a/docs/_sources/auto_examples/sg_execution_times.rst.txt
+++ b/docs/_sources/auto_examples/sg_execution_times.rst.txt
@@ -6,7 +6,7 @@
Computation times
=================
-**14:51.226** total execution time for 6 files **from auto_examples**:
+**09:11.064** total execution time for 5 files **from auto_examples**:
.. container::
@@ -33,20 +33,17 @@ Computation times
- Time
- Mem (MB)
* - :ref:`sphx_glr_auto_examples_plot_knockoff_aggregation.py` (``plot_knockoff_aggregation.py``)
- - 05:47.064
- - 753.1
- * - :ref:`sphx_glr_auto_examples_plot_residuals_sampling.py` (``plot_residuals_sampling.py``)
- - 04:25.722
- - 608.4
+ - 05:27.608
+ - 743.1
* - :ref:`sphx_glr_auto_examples_plot_fmri_data_example.py` (``plot_fmri_data_example.py``)
- - 01:33.832
- - 3373.7
+ - 01:29.048
+ - 3156.3
* - :ref:`sphx_glr_auto_examples_plot_2D_simulation_example.py` (``plot_2D_simulation_example.py``)
- - 01:08.156
- - 694.2
+ - 01:04.674
+ - 717.8
* - :ref:`sphx_glr_auto_examples_plot_dcrt_example.py` (``plot_dcrt_example.py``)
- - 01:02.337
- - 620.5
+ - 01:01.588
+ - 623.7
* - :ref:`sphx_glr_auto_examples_plot_diabetes_variable_importance_example.py` (``plot_diabetes_variable_importance_example.py``)
- - 00:54.114
- - 621.1
+ - 00:08.146
+ - 609.0
diff --git a/docs/_sources/generated/hidimstat.BlockBasedImportance.rst.txt b/docs/_sources/generated/hidimstat.BlockBasedImportance.rst.txt
deleted file mode 100644
index 66711cd..0000000
--- a/docs/_sources/generated/hidimstat.BlockBasedImportance.rst.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-hidimstat.BlockBasedImportance
-==============================
-
-.. currentmodule:: hidimstat
-
-.. autoclass:: BlockBasedImportance
-
-
- .. automethod:: __init__
-
-
- .. rubric:: Methods
-
- .. autosummary::
-
- ~BlockBasedImportance.__init__
- ~BlockBasedImportance.compute_importance
- ~BlockBasedImportance.fit
- ~BlockBasedImportance.fit_transform
- ~BlockBasedImportance.get_metadata_routing
- ~BlockBasedImportance.get_params
- ~BlockBasedImportance.predict
- ~BlockBasedImportance.predict_proba
- ~BlockBasedImportance.set_output
- ~BlockBasedImportance.set_params
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/_sources/sg_execution_times.rst.txt b/docs/_sources/sg_execution_times.rst.txt
index c436072..16a1911 100644
--- a/docs/_sources/sg_execution_times.rst.txt
+++ b/docs/_sources/sg_execution_times.rst.txt
@@ -6,7 +6,7 @@
Computation times
=================
-**14:51.226** total execution time for 6 files **from all galleries**:
+**09:11.064** total execution time for 5 files **from all galleries**:
.. container::
@@ -33,20 +33,17 @@ Computation times
- Time
- Mem (MB)
* - :ref:`sphx_glr_auto_examples_plot_knockoff_aggregation.py` (``../examples/plot_knockoff_aggregation.py``)
- - 05:47.064
- - 753.1
- * - :ref:`sphx_glr_auto_examples_plot_residuals_sampling.py` (``../examples/plot_residuals_sampling.py``)
- - 04:25.722
- - 608.4
+ - 05:27.608
+ - 743.1
* - :ref:`sphx_glr_auto_examples_plot_fmri_data_example.py` (``../examples/plot_fmri_data_example.py``)
- - 01:33.832
- - 3373.7
+ - 01:29.048
+ - 3156.3
* - :ref:`sphx_glr_auto_examples_plot_2D_simulation_example.py` (``../examples/plot_2D_simulation_example.py``)
- - 01:08.156
- - 694.2
+ - 01:04.674
+ - 717.8
* - :ref:`sphx_glr_auto_examples_plot_dcrt_example.py` (``../examples/plot_dcrt_example.py``)
- - 01:02.337
- - 620.5
+ - 01:01.588
+ - 623.7
* - :ref:`sphx_glr_auto_examples_plot_diabetes_variable_importance_example.py` (``../examples/plot_diabetes_variable_importance_example.py``)
- - 00:54.114
- - 621.1
+ - 00:08.146
+ - 609.0
diff --git a/docs/_static/basic.css b/docs/_static/basic.css
index f316efc..7ebbd6d 100644
--- a/docs/_static/basic.css
+++ b/docs/_static/basic.css
@@ -1,12 +1,5 @@
/*
- * basic.css
- * ~~~~~~~~~
- *
* Sphinx stylesheet -- basic theme.
- *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
- * :license: BSD, see LICENSE for details.
- *
*/
/* -- main layout ----------------------------------------------------------- */
@@ -115,15 +108,11 @@ img {
/* -- search page ----------------------------------------------------------- */
ul.search {
- margin: 10px 0 0 20px;
- padding: 0;
+ margin-top: 10px;
}
ul.search li {
- padding: 5px 0 5px 20px;
- background-image: url(file.png);
- background-repeat: no-repeat;
- background-position: 0 7px;
+ padding: 5px 0;
}
ul.search li a {
diff --git a/docs/_static/doctools.js b/docs/_static/doctools.js
index 4d67807..0398ebb 100644
--- a/docs/_static/doctools.js
+++ b/docs/_static/doctools.js
@@ -1,12 +1,5 @@
/*
- * doctools.js
- * ~~~~~~~~~~~
- *
* Base JavaScript utilities for all Sphinx HTML documentation.
- *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
- * :license: BSD, see LICENSE for details.
- *
*/
"use strict";
diff --git a/docs/_static/language_data.js b/docs/_static/language_data.js
index 367b8ed..c7fe6c6 100644
--- a/docs/_static/language_data.js
+++ b/docs/_static/language_data.js
@@ -1,13 +1,6 @@
/*
- * language_data.js
- * ~~~~~~~~~~~~~~~~
- *
* This script contains the language-specific data used by searchtools.js,
* namely the list of stopwords, stemmer, scorer and splitter.
- *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
- * :license: BSD, see LICENSE for details.
- *
*/
var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"];
diff --git a/docs/_static/searchtools.js b/docs/_static/searchtools.js
index b08d58c..2c774d1 100644
--- a/docs/_static/searchtools.js
+++ b/docs/_static/searchtools.js
@@ -1,12 +1,5 @@
/*
- * searchtools.js
- * ~~~~~~~~~~~~~~~~
- *
* Sphinx JavaScript utilities for the full-text search.
- *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
- * :license: BSD, see LICENSE for details.
- *
*/
"use strict";
@@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") {
// and returns the new score.
/*
score: result => {
- const [docname, title, anchor, descr, score, filename] = result
+ const [docname, title, anchor, descr, score, filename, kind] = result
return score
},
*/
@@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") {
};
}
+// Global search result kind enum, used by themes to style search results.
+class SearchResultKind {
+ static get index() { return "index"; }
+ static get object() { return "object"; }
+ static get text() { return "text"; }
+ static get title() { return "title"; }
+}
+
const _removeChildren = (element) => {
while (element && element.lastChild) element.removeChild(element.lastChild);
};
@@ -64,9 +65,13 @@ const _displayItem = (item, searchTerms, highlightTerms) => {
const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY;
const contentRoot = document.documentElement.dataset.content_root;
- const [docName, title, anchor, descr, score, _filename] = item;
+ const [docName, title, anchor, descr, score, _filename, kind] = item;
let listItem = document.createElement("li");
+ // Add a class representing the item's type:
+ // can be used by a theme's CSS selector for styling
+ // See SearchResultKind for the class names.
+ listItem.classList.add(`kind-${kind}`);
let requestUrl;
let linkUrl;
if (docBuilder === "dirhtml") {
@@ -115,8 +120,10 @@ const _finishSearch = (resultCount) => {
"Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories."
);
else
- Search.status.innerText = _(
- "Search finished, found ${resultCount} page(s) matching the search query."
+ Search.status.innerText = Documentation.ngettext(
+ "Search finished, found one page matching the search query.",
+ "Search finished, found ${resultCount} pages matching the search query.",
+ resultCount,
).replace('${resultCount}', resultCount);
};
const _displayNextItem = (
@@ -138,7 +145,7 @@ const _displayNextItem = (
else _finishSearch(resultCount);
};
// Helper function used by query() to order search results.
-// Each input is an array of [docname, title, anchor, descr, score, filename].
+// Each input is an array of [docname, title, anchor, descr, score, filename, kind].
// Order the results by score (in opposite order of appearance, since the
// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically.
const _orderResultsByScoreThenName = (a, b) => {
@@ -248,6 +255,7 @@ const Search = {
searchSummary.classList.add("search-summary");
searchSummary.innerText = "";
const searchList = document.createElement("ul");
+ searchList.setAttribute("role", "list");
searchList.classList.add("search");
const out = document.getElementById("search-results");
@@ -318,7 +326,7 @@ const Search = {
const indexEntries = Search._index.indexentries;
// Collect multiple result groups to be sorted separately and then ordered.
- // Each is an array of [docname, title, anchor, descr, score, filename].
+ // Each is an array of [docname, title, anchor, descr, score, filename, kind].
const normalResults = [];
const nonMainIndexResults = [];
@@ -337,6 +345,7 @@ const Search = {
null,
score + boost,
filenames[file],
+ SearchResultKind.title,
]);
}
}
@@ -354,6 +363,7 @@ const Search = {
null,
score,
filenames[file],
+ SearchResultKind.index,
];
if (isMain) {
normalResults.push(result);
@@ -475,6 +485,7 @@ const Search = {
descr,
score,
filenames[match[0]],
+ SearchResultKind.object,
]);
};
Object.keys(objects).forEach((prefix) =>
@@ -585,6 +596,7 @@ const Search = {
null,
score,
filenames[file],
+ SearchResultKind.text,
]);
}
return results;
diff --git a/docs/api.html b/docs/api.html
index a7629f0..f6f4fd5 100644
--- a/docs/api.html
+++ b/docs/api.html
@@ -14,7 +14,7 @@
-
+
@@ -137,46 +137,43 @@