diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index f4489de..e0d891a 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -22,10 +22,12 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 + with: + python-version: '3.12' - name: Install dependencies run: | - pip install -r doc_conf/doc-requirements.txt - pip install -e . + python -m pip install -e . + python -m pip install .[doc,plotting] - name: Sphinx build id: build-docs run: | diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml index c869baf..254de7e 100644 --- a/.github/workflows/build_package.yml +++ b/.github/workflows/build_package.yml @@ -12,11 +12,12 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: python-version: ["3.12"] + os: [ubuntu-latest, macos-latest, windows-latest] steps: - uses: actions/checkout@v4 @@ -24,16 +25,9 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - # - name: Lint with flake8 - # uses: py-actions/flake8@v2.3.0 - name: Generate coverage report run: | - pip install pytest pytest-cov + python -m pip install .[test] pytest --cov=./ --cov-report=xml - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4.5.0 diff --git a/doc_conf/conf.py b/doc_conf/conf.py index a0ff7c9..b93dff3 100644 --- a/doc_conf/conf.py +++ b/doc_conf/conf.py @@ -13,12 +13,12 @@ # serve to show the default. 
import os -import sys import warnings -import sphinx_gallery -import sphinx_bootstrap_theme from distutils.version import LooseVersion + import matplotlib +import sphinx_bootstrap_theme +import sphinx_gallery # Disable agg warnings in doc warnings.filterwarnings( diff --git a/doc_conf/doc-requirements.txt b/doc_conf/doc-requirements.txt deleted file mode 100644 index 4192564..0000000 --- a/doc_conf/doc-requirements.txt +++ /dev/null @@ -1,20 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cu118 -joblib -matplotlib -memory_profiler -mne -nilearn -numpy -numpydoc -pandas -pillow -PyQt5 -pyvista -pyvistaqt -scikit-learn -scipy -sphinx-bootstrap-theme -sphinxcontrib-bibtex -sphinx-gallery -torch -torchmetrics diff --git a/docs/_downloads/0f0a1edc554c21f6f7fda8f53284928a/plot_2D_simulation_example.py b/docs/_downloads/0f0a1edc554c21f6f7fda8f53284928a/plot_2D_simulation_example.py index 57bee37..c8f5896 100644 --- a/docs/_downloads/0f0a1edc554c21f6f7fda8f53284928a/plot_2D_simulation_example.py +++ b/docs/_downloads/0f0a1edc554c21f6f7fda8f53284928a/plot_2D_simulation_example.py @@ -51,20 +51,20 @@ arXiv preprint arXiv:2106.02590. 
""" +import matplotlib.pyplot as plt + ############################################################################# # Imports needed for this script # ------------------------------ import numpy as np -import matplotlib.pyplot as plt -from sklearn.feature_extraction import image from sklearn.cluster import FeatureAgglomeration +from sklearn.feature_extraction import image -from hidimstat.scenario import multivariate_simulation -from hidimstat.stat_tools import zscore_from_pval, pval_from_cb -from hidimstat.desparsified_lasso import desparsified_lasso from hidimstat.clustered_inference import clustered_inference +from hidimstat.desparsified_lasso import desparsified_lasso from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference - +from hidimstat.scenario import multivariate_simulation +from hidimstat.stat_tools import pval_from_cb, zscore_from_pval ############################################################################# # Specific plotting functions diff --git a/docs/_downloads/6f624092537330c9f373c01828b2b9ae/plot_diabetes_variable_importance_example.py b/docs/_downloads/6f624092537330c9f373c01828b2b9ae/plot_diabetes_variable_importance_example.py index 29cf767..f6ccf40 100644 --- a/docs/_downloads/6f624092537330c9f373c01828b2b9ae/plot_diabetes_variable_importance_example.py +++ b/docs/_downloads/6f624092537330c9f373c01828b2b9ae/plot_diabetes_variable_importance_example.py @@ -10,10 +10,11 @@ # Imports needed for this script # ------------------------------ +import matplotlib.pyplot as plt import numpy as np -from hidimstat.BBI import BlockBasedImportance from sklearn.datasets import load_diabetes -import matplotlib.pyplot as plt + +from hidimstat.BBI import BlockBasedImportance plt.rcParams.update({"font.size": 14}) diff --git a/docs/_downloads/76c0979bf6618aa210fd11bb28dcf896/plot_fmri_data_example.py b/docs/_downloads/76c0979bf6618aa210fd11bb28dcf896/plot_fmri_data_example.py index ef0625f..25b89b8 100644 --- 
a/docs/_downloads/76c0979bf6618aa210fd11bb28dcf896/plot_fmri_data_example.py +++ b/docs/_downloads/76c0979bf6618aa210fd11bb28dcf896/plot_fmri_data_example.py @@ -45,21 +45,21 @@ # ------------------------------ import numpy as np import pandas as pd -from sklearn.utils import Bunch -from sklearn.cluster import FeatureAgglomeration -from sklearn.feature_extraction import image -from sklearn.linear_model import Ridge from nilearn import datasets -from nilearn.input_data import NiftiMasker from nilearn.image import mean_img +from nilearn.input_data import NiftiMasker from nilearn.plotting import plot_stat_map, show +from sklearn.cluster import FeatureAgglomeration +from sklearn.feature_extraction import image +from sklearn.linear_model import Ridge +from sklearn.utils import Bunch -from hidimstat.stat_tools import zscore_from_pval, pval_from_scale -from hidimstat.standardized_svr import standardized_svr -from hidimstat.permutation_test import permutation_test, permutation_test_cv from hidimstat.adaptive_permutation_threshold import ada_svr from hidimstat.clustered_inference import clustered_inference from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference +from hidimstat.permutation_test import permutation_test, permutation_test_cv +from hidimstat.standardized_svr import standardized_svr +from hidimstat.stat_tools import pval_from_scale, zscore_from_pval ############################################################################# diff --git a/examples/plot_2D_simulation_example.py b/examples/plot_2D_simulation_example.py index 88e329a..b19168d 100644 --- a/examples/plot_2D_simulation_example.py +++ b/examples/plot_2D_simulation_example.py @@ -51,20 +51,20 @@ arXiv preprint arXiv:2106.02590. 
""" +import matplotlib.pyplot as plt + ############################################################################# # Imports needed for this script # ------------------------------ import numpy as np -import matplotlib.pyplot as plt -from sklearn.feature_extraction import image from sklearn.cluster import FeatureAgglomeration +from sklearn.feature_extraction import image -from hidimstat.scenario import multivariate_simulation -from hidimstat.stat_tools import zscore_from_pval, pval_from_cb -from hidimstat.desparsified_lasso import desparsified_lasso from hidimstat.clustered_inference import clustered_inference +from hidimstat.desparsified_lasso import desparsified_lasso from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference - +from hidimstat.scenario import multivariate_simulation +from hidimstat.stat_tools import pval_from_cb, zscore_from_pval ############################################################################# # Specific plotting functions diff --git a/examples/plot_diabetes_variable_importance_example.py b/examples/plot_diabetes_variable_importance_example.py index f9fac47..3d28334 100644 --- a/examples/plot_diabetes_variable_importance_example.py +++ b/examples/plot_diabetes_variable_importance_example.py @@ -46,10 +46,11 @@ # Imports needed for this script # ------------------------------ +import matplotlib.pyplot as plt import numpy as np -from hidimstat.BBI import BlockBasedImportance from sklearn.datasets import load_diabetes -import matplotlib.pyplot as plt + +from hidimstat.BBI import BlockBasedImportance plt.rcParams.update({"font.size": 14}) diff --git a/examples/plot_fmri_data_example.py b/examples/plot_fmri_data_example.py index ef0625f..25b89b8 100644 --- a/examples/plot_fmri_data_example.py +++ b/examples/plot_fmri_data_example.py @@ -45,21 +45,21 @@ # ------------------------------ import numpy as np import pandas as pd -from sklearn.utils import Bunch -from sklearn.cluster import FeatureAgglomeration -from 
sklearn.feature_extraction import image -from sklearn.linear_model import Ridge from nilearn import datasets -from nilearn.input_data import NiftiMasker from nilearn.image import mean_img +from nilearn.input_data import NiftiMasker from nilearn.plotting import plot_stat_map, show +from sklearn.cluster import FeatureAgglomeration +from sklearn.feature_extraction import image +from sklearn.linear_model import Ridge +from sklearn.utils import Bunch -from hidimstat.stat_tools import zscore_from_pval, pval_from_scale -from hidimstat.standardized_svr import standardized_svr -from hidimstat.permutation_test import permutation_test, permutation_test_cv from hidimstat.adaptive_permutation_threshold import ada_svr from hidimstat.clustered_inference import clustered_inference from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference +from hidimstat.permutation_test import permutation_test, permutation_test_cv +from hidimstat.standardized_svr import standardized_svr +from hidimstat.stat_tools import pval_from_scale, zscore_from_pval ############################################################################# diff --git a/examples_not_exhibited/plot_fig_1_nguyen_et_al.py b/examples_not_exhibited/plot_fig_1_nguyen_et_al.py index 91efffb..17e8815 100644 --- a/examples_not_exhibited/plot_fig_1_nguyen_et_al.py +++ b/examples_not_exhibited/plot_fig_1_nguyen_et_al.py @@ -12,11 +12,12 @@ """ import matplotlib.pyplot as plt import numpy as np +from joblib import Parallel, delayed +from sklearn.preprocessing import StandardScaler + from hidimstat.knockoffs import knockoff_aggregation, model_x_knockoff from hidimstat.knockoffs.data_simulation import simu_data from hidimstat.knockoffs.utils import cal_fdp_power -from joblib import Parallel, delayed -from sklearn.preprocessing import StandardScaler color_blue = "#1f77b4" color_teal = "#1fbecf" diff --git a/examples_not_exhibited/plot_meg_data_example.py b/examples_not_exhibited/plot_meg_data_example.py index 
bc529ea..b3c19b2 100644 --- a/examples_not_exhibited/plot_meg_data_example.py +++ b/examples_not_exhibited/plot_meg_data_example.py @@ -23,14 +23,15 @@ """ import os -import numpy as np + import matplotlib.image as mpimg import matplotlib.pyplot as plt import mne -from scipy.sparse.csgraph import connected_components +import numpy as np from mne.datasets import sample, somato -from mne.inverse_sparse.mxne_inverse import _prepare_gain, _make_sparse_stc -from mne.minimum_norm import make_inverse_operator, apply_inverse +from mne.inverse_sparse.mxne_inverse import _make_sparse_stc, _prepare_gain +from mne.minimum_norm import apply_inverse, make_inverse_operator +from scipy.sparse.csgraph import connected_components from sklearn.cluster import FeatureAgglomeration from sklearn.metrics.pairwise import pairwise_distances diff --git a/hidimstat/BBI.py b/hidimstat/BBI.py index ccf15f7..521b855 100644 --- a/hidimstat/BBI.py +++ b/hidimstat/BBI.py @@ -13,20 +13,17 @@ log_loss, mean_absolute_error, mean_squared_error, - roc_auc_score, r2_score, + roc_auc_score, ) -from sklearn.model_selection import KFold, GroupKFold +from sklearn.model_selection import GroupKFold, KFold from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.utils.validation import check_is_fitted -from .compute_importance import ( - joblib_compute_conditional, - joblib_compute_permutation, -) -from .Dnn_learner import Dnn_learner -from .utils import convert_predict_proba, create_X_y, compute_imp_std +from .compute_importance import joblib_compute_conditional, joblib_compute_permutation +from .Dnn_learner import DnnLearner +from .utils import compute_imp_std, convert_predict_proba, create_X_y class BlockBasedImportance(BaseEstimator, TransformerMixin): @@ -75,8 +72,10 @@ class BlockBasedImportance(BaseEstimator, TransformerMixin): inference estimator. problem_type : str, default='regression' A classification or a regression problem. 
- sampling_with_repitition : bool, default=True - Sampling with repitition the train part of the train/valid scheme under + encoding_input : bool, default=True + To one-hot or ordinal encode the nominal and ordinal input variables. + sampling_with_repetition : bool, default=True + Sampling with repetition the train part of the train/valid scheme under the training set. The number of training samples in train is equal to the number of instances in the training set. split_percentage : float, default=0.8 @@ -111,7 +110,7 @@ class BlockBasedImportance(BaseEstimator, TransformerMixin): random_state : int, default=2023 Fixing the seeds of the random generator. do_compute_importance : boolean, default=True - Whether to Compute the Importance Scores. + Whether to compute the Importance Scores. group_fold : list, default=None The list of group labels to perform GroupKFold to keep subjects within the same training or test set. @@ -129,7 +128,8 @@ def __init__( do_hypertuning=True, dict_hypertuning=None, problem_type="regression", - sampling_with_repitition=True, + encoding_input=True, + sampling_with_repetition=True, split_percentage=0.8, conditional=True, variables_categories=None, @@ -153,7 +153,8 @@ def __init__( self.do_hypertuning = do_hypertuning self.dict_hypertuning = dict_hypertuning self.problem_type = problem_type - self.sampling_with_repitition = sampling_with_repitition + self.encoding_input = encoding_input + self.sampling_with_repetition = sampling_with_repetition self.split_percentage = split_percentage self.conditional = conditional self.variables_categories = variables_categories @@ -405,21 +406,23 @@ def fit(self, X, y=None): X_prev = X.copy() X = np.zeros((y.shape[0], len(self.list_grps))) - for ind_fold, (train, test) in enumerate(cv.split(X_prev)): + for index_fold, (train, test) in enumerate(cv.split(X_prev)): X_train, X_test = X_prev[train], X_prev[test] y_train, _ = y[train], y[test] if len(self.coffeine_transformers) > 1: - X_train = 
self.coffeine_transformers[ind_fold].fit_transform( + X_train = self.coffeine_transformers[index_fold].fit_transform( pd.DataFrame(X_train, columns=self.X_cols), np.ravel(y_train), ) - X_test = self.coffeine_transformers[ind_fold].transform( + X_test = self.coffeine_transformers[index_fold].transform( pd.DataFrame(X_test, columns=self.X_cols) ) for grp_ind, grp in enumerate(self.list_grps): - self.ridge_mods[ind_fold][grp_ind].fit(X_train[:, grp], y_train) + self.ridge_mods[index_fold][grp_ind].fit( + X_train[:, grp], y_train + ) X[test, grp_ind] = ( - self.ridge_mods[ind_fold][grp_ind] + self.ridge_mods[index_fold][grp_ind] .predict(X_test[:, grp]) .ravel() ) @@ -443,9 +446,9 @@ def fit(self, X, y=None): # Initialize the first estimator (block learner) if self.estimator == "DNN": - self.estimator = Dnn_learner( + self.estimator = DnnLearner( + encoding_outcome=True, problem_type=self.problem_type, - encode=True, do_hypertuning=False, list_continuous=self.list_continuous, list_grps=self.list_grps, @@ -485,12 +488,12 @@ def fit(self, X, y=None): ) list_splits = kf.split(X) - for ind_fold, (train_index, test_index) in enumerate(list_splits): - print(f"Processing: {ind_fold+1}") + for index_fold, (train_index, test_index) in enumerate(list_splits): + print(f"Processing: {index_fold+1}") X_fold = X.copy() y_fold = y.copy() - self.X_nominal[ind_fold] = X_nominal_org.iloc[test_index, :] + self.X_nominal[index_fold] = X_nominal_org.iloc[test_index, :] X_train, X_test = ( X_fold[train_index, :], @@ -501,26 +504,26 @@ def fit(self, X, y=None): if not self.apply_ridge: if self.coffeine_transformer is not None: - X_train = self.coffeine_transformers[ind_fold].fit_transform( + X_train = self.coffeine_transformers[index_fold].fit_transform( pd.DataFrame(X_train, columns=self.X_cols), np.ravel(y_train), ) - X_test = self.coffeine_transformers[ind_fold].transform( + X_test = self.coffeine_transformers[index_fold].transform( pd.DataFrame(X_test, columns=self.X_cols) ) - 
self.X_test[ind_fold] = X_test.copy() - self.y_test[ind_fold] = y_test.copy() - self.y_train[ind_fold] = y_train.copy() + self.X_test[index_fold] = X_test.copy() + self.y_test[index_fold] = y_test.copy() + self.y_train[index_fold] = y_train.copy() # Find the list of optimal sub-models to be used in the # following steps (Default estimator) if self.do_hypertuning: - self.__tuning_hyper(X_train, y_train, ind_fold) + self.__tuning_hyper(X_train, y_train, index_fold) if self.type == "DNN": self.estimator.fit(X_train, y_train) - self.list_estimators[ind_fold] = copy(self.estimator) + self.list_estimators[index_fold] = copy(self.estimator) else: self.y_train = y.copy() @@ -549,7 +552,7 @@ def fit(self, X, y=None): self.is_fitted = True return self - def __tuning_hyper(self, X, y, ind_fold=None): + def __tuning_hyper(self, X, y, index_fold=None): """ Tune the hyperparameters of the provided inference estimator. @@ -560,16 +563,16 @@ def __tuning_hyper(self, X, y, ind_fold=None): y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) The target values (class labels in classification, real numbers in regression). - ind_fold : int, default=None - The indice of the corresponding fold. + index_fold : int, default=None + The index of the corresponding fold. 
""" if not ((self.apply_ridge) and (self.group_stacking)): ( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, X_scaled, __, scaler_x, @@ -578,14 +581,14 @@ def __tuning_hyper(self, X, y, ind_fold=None): ) = create_X_y( X, y, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, problem_type=self.problem_type, list_continuous=self.list_continuous, random_state=self.random_state, ) if self.dict_hypertuning is not None: - list_hyper = list( + list_hypertuning = list( itertools.product(*list(self.dict_hypertuning.values())) ) list_loss = [] @@ -593,32 +596,32 @@ def __tuning_hyper(self, X, y, ind_fold=None): list_loss = self.estimator.hyper_tuning( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, - list_hyper, + X_validation_scaled, + y_validation_scaled, + list_hypertuning, random_state=self.random_state, ) else: if self.dict_hypertuning is None: self.estimator.fit(X_scaled, y) # If not a DNN learner case, need to save the scalers - self.scaler_x[ind_fold] = scaler_x - self.scaler_y[ind_fold] = scaler_y + self.scaler_x[index_fold] = scaler_x + self.scaler_y[index_fold] = scaler_y return else: - for ind_el, el in enumerate(list_hyper): + for ind_el, el in enumerate(list_hypertuning): curr_params = dict( (k, v) for v, k in zip(el, list(self.dict_hypertuning.keys())) ) - list_hyper[ind_el] = curr_params + list_hypertuning[ind_el] = curr_params self.estimator.set_params(**curr_params) if self.problem_type == "regression": y_train_curr = ( y_train_scaled * scaler_y.scale_ + scaler_y.mean_ ) - y_valid_curr = ( - y_valid_scaled * scaler_y.scale_ + scaler_y.mean_ + y_validation_curr = ( + y_validation_scaled * scaler_y.scale_ + scaler_y.mean_ ) def func(x): @@ -626,7 +629,7 @@ def func(x): else: y_train_curr = y_train_scaled.copy() - y_valid_curr = y_valid_scaled.copy() + y_validation_curr = 
y_validation_scaled.copy() def func(x): return self.estimator.predict_proba(x) @@ -636,17 +639,19 @@ def func(x): if self.problem_type == "classification": list_loss.append( self.loss( - y_valid_curr, - func(X_valid_scaled)[:, np.unique(y_valid_curr)], + y_validation_curr, + func(X_validation_scaled)[ + :, np.unique(y_validation_curr) + ], ) ) else: list_loss.append( - self.loss(y_valid_curr, func(X_valid_scaled)) + self.loss(y_validation_curr, func(X_validation_scaled)) ) ind_min = np.argmin(list_loss) - best_hyper = list_hyper[ind_min] + best_hyper = list_hypertuning[ind_min] if not isinstance(best_hyper, dict): best_hyper = dict(zip(self.dict_hypertuning.keys(), best_hyper)) @@ -654,13 +659,13 @@ def func(x): self.estimator.fit(X_scaled, y) # If not a DNN learner case, need to save the scalers - self.scaler_x[ind_fold] = scaler_x - self.scaler_y[ind_fold] = scaler_y + self.scaler_x[index_fold] = scaler_x + self.scaler_y[index_fold] = scaler_y else: self.estimator.fit(X, y) - def predict(self, X=None, encoding=True): + def predict(self, X=None): """ This function predicts the regression target for the input samples X. @@ -669,12 +674,11 @@ def predict(self, X=None, encoding=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), defaut=None The input samples. - encoding : bool, default=True - Whether to encode the non-continuous input variables. Returns ------- - Average predictions across all samples. + predictions: array-like of shape (n_test_samples,) + The average predictions across all folds. 
""" if not isinstance(X, list): list_X = [X.copy() for el in range(max(self.k_fold, 1))] @@ -683,37 +687,37 @@ def predict(self, X=None, encoding=True): list_X = X.copy() mean_pred = False - for ind_fold, curr_X in enumerate(list_X): + for index_fold, curr_X in enumerate(list_X): # Prepare the test set for the prediction - if encoding: - X_tmp = self.__encode_input(curr_X) + if self.encoding_input: + X_tmp = self._encode_input(curr_X) else: X_tmp = curr_X.copy() if self.type != "DNN": if not isinstance(curr_X, np.ndarray): X_tmp = np.array(X_tmp) - if self.scaler_x[ind_fold] is not None: - X_tmp[:, self.list_continuous] = self.scaler_x[ind_fold].transform( - X_tmp[:, self.list_continuous] - ) - self.X_proc[ind_fold] = [X_tmp.copy()] + if self.scaler_x[index_fold] is not None: + X_tmp[:, self.list_continuous] = self.scaler_x[ + index_fold + ].transform(X_tmp[:, self.list_continuous]) + self.X_proc[index_fold] = [X_tmp.copy()] - self.org_pred[ind_fold] = self.list_estimators[ind_fold].predict(X_tmp) + self.org_pred[index_fold] = self.list_estimators[index_fold].predict(X_tmp) # Convert to the (n_samples x n_outputs) format - if len(self.org_pred[ind_fold].shape) != 2: - self.org_pred[ind_fold] = self.org_pred[ind_fold].reshape(-1, 1) + if len(self.org_pred[index_fold].shape) != 2: + self.org_pred[index_fold] = self.org_pred[index_fold].reshape(-1, 1) if self.type == "DNN": - self.X_proc[ind_fold] = np.array( - self.list_estimators[ind_fold].X_test.copy() + self.X_proc[index_fold] = np.array( + self.list_estimators[index_fold].X_test.copy() ).swapaxes(0, 1) if mean_pred: return np.mean(np.array(self.org_pred), axis=0) - def predict_proba(self, X=None, encoding=True): + def predict_proba(self, X=None): """ This function predicts the class probabilities for the input samples X. @@ -722,12 +726,11 @@ def predict_proba(self, X=None, encoding=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. 
- encoding : bool, default=True - Whether to encode the non-continuous input variables. Returns ------- - Average predictions across all samples. + predictions: array-like of shape (n_test_samples,) + The average predictions across all folds. """ if not isinstance(X, list): list_X = [X.copy() for el in range(max(self.k_fold, 1))] @@ -736,37 +739,39 @@ def predict_proba(self, X=None, encoding=True): list_X = X.copy() mean_pred = False - for ind_fold, curr_X in enumerate(list_X): + for index_fold, curr_X in enumerate(list_X): # Prepare the test set for the prediction - if encoding: - X_tmp = self.__encode_input(curr_X) + if self.encoding_input: + X_tmp = self._encode_input(curr_X) else: X_tmp = curr_X.copy() if self.type != "DNN": if not isinstance(curr_X, np.ndarray): X_tmp = np.array(X_tmp) - if self.scaler_x[ind_fold] is not None: - X_tmp[:, self.list_continuous] = self.scaler_x[ind_fold].transform( - X_tmp[:, self.list_continuous] - ) - self.X_proc[ind_fold] = [X_tmp.copy()] + if self.scaler_x[index_fold] is not None: + X_tmp[:, self.list_continuous] = self.scaler_x[ + index_fold + ].transform(X_tmp[:, self.list_continuous]) + self.X_proc[index_fold] = [X_tmp.copy()] - self.org_pred[ind_fold] = self.list_estimators[ind_fold].predict_proba( + self.org_pred[index_fold] = self.list_estimators[index_fold].predict_proba( X_tmp ) if self.type == "DNN": - self.X_proc[ind_fold] = np.array( - self.list_estimators[ind_fold].X_test.copy() + self.X_proc[index_fold] = np.array( + self.list_estimators[index_fold].X_test.copy() ).swapaxes(0, 1) else: - self.org_pred[ind_fold] = convert_predict_proba(self.org_pred[ind_fold]) + self.org_pred[index_fold] = convert_predict_proba( + self.org_pred[index_fold] + ) if mean_pred: return np.mean(np.array(self.org_pred), axis=0) - def __encode_input(self, X): + def _encode_input(self, X): """ This function encodes the non-continuous variables in the design matrix X. 
@@ -824,12 +829,12 @@ def compute_importance(self, X=None, y=None): """ # Check is fit had been called check_is_fitted(self, ["is_fitted"]) - encoding = True + self.encoding_input = True if self.k_fold != 0: X = self.X_test.copy() y = self.y_test.copy() - encoding = False + self.encoding_input = False else: if self.coffeine_transformer is not None: X = self.coffeine_transformers[0].transform( @@ -873,36 +878,35 @@ def compute_importance(self, X=None, y=None): # Compute original predictions if self.problem_type == "regression": output_dimension = y[0].shape[1] - self.predict(X, encoding=encoding) + self.predict(X) else: output_dimension = 1 - self.predict_proba(X, encoding=encoding) - + self.predict_proba(X) list_seeds_imp = self.rng.randint(1e5, size=self.n_permutations) parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose) score_imp_l = [] results = {} # n_features x n_permutations x n_samples - for ind_fold, estimator in enumerate(self.list_estimators): + for index_fold, estimator in enumerate(self.list_estimators): if self.type == "DNN": - for y_col in range(y[ind_fold].shape[-1]): + for y_col in range(y[index_fold].shape[-1]): _ = self.estimator.encode_outcome( - self.y_train[ind_fold], train=True + self.y_train[index_fold], train=True )[y_col] - y[ind_fold] = self.estimator.encode_outcome( - y[ind_fold], train=False + y[index_fold] = self.estimator.encode_outcome( + y[index_fold], train=False )[y_col] else: if self.problem_type in ("classification", "binary"): one_hot = OneHotEncoder(handle_unknown="ignore").fit( - self.y_train[ind_fold].reshape(-1, 1) + self.y_train[index_fold].reshape(-1, 1) ) - y[ind_fold] = one_hot.transform( - y[ind_fold].reshape(-1, 1) + y[index_fold] = one_hot.transform( + y[index_fold].reshape(-1, 1) ).toarray() if self.do_compute_importance: if not self.conditional: - self.pred_scores[ind_fold], score_cur = list( + self.pred_scores[index_fold], score_cur = list( zip( *parallel( delayed(joblib_compute_permutation)( @@ -910,14 
+914,14 @@ def compute_importance(self, X=None, y=None): permutation, estimator, self.type, - self.X_proc[ind_fold], - y[ind_fold], + self.X_proc[index_fold], + y[index_fold], self.problem_type, - self.org_pred[ind_fold], + self.org_pred[index_fold], dict_continuous=self.dict_continuous, dict_nominal=self.dict_nominal, processed_column=variables_interest, - iteration_index=ind_fold + 1, + iteration_index=index_fold + 1, group_stacking=self.group_stacking, random_state=list_seeds_imp[permutation], verbose=self.verbose, @@ -927,18 +931,18 @@ def compute_importance(self, X=None, y=None): ) ) ) - self.pred_scores[ind_fold] = np.array( - self.pred_scores[ind_fold] + self.pred_scores[index_fold] = np.array( + self.pred_scores[index_fold] ).reshape( ( len(self.list_columns), self.n_permutations, - y[ind_fold].shape[0], + y[index_fold].shape[0], output_dimension, ) ) else: - self.pred_scores[ind_fold], score_cur = list( + self.pred_scores[index_fold], score_cur = list( zip( *parallel( delayed(joblib_compute_conditional)( @@ -947,18 +951,18 @@ def compute_importance(self, X=None, y=None): estimator, self.type, self.importance_estimator, - self.X_proc[ind_fold], - y[ind_fold], + self.X_proc[index_fold], + y[index_fold], self.problem_type, - self.org_pred[ind_fold], + self.org_pred[index_fold], seed=self.random_state, dict_continuous=self.dict_continuous, dict_nominal=self.dict_nominal, - X_nominal=self.X_nominal[ind_fold], + X_nominal=self.X_nominal[index_fold], variables_categories=self.variables_categories, encoder=self.dict_enc, processed_column=variables_interest, - iteration_index=ind_fold + 1, + iteration_index=index_fold + 1, group_stacking=self.group_stacking, sub_groups=[self.list_columns, self.sub_groups], list_seeds=list_seeds_imp, @@ -970,19 +974,21 @@ def compute_importance(self, X=None, y=None): ) ) ) - self.pred_scores[ind_fold] = np.array(self.pred_scores[ind_fold]) + self.pred_scores[index_fold] = np.array( + self.pred_scores[index_fold] + ) 
score_imp_l.append(score_cur[0]) else: if self.problem_type in ("classification", "binary"): - nonzero_cols = np.where(y[ind_fold].any(axis=0))[0] + nonzero_cols = np.where(y[index_fold].any(axis=0))[0] score = roc_auc_score( - y[ind_fold][:, nonzero_cols], - self.org_pred[ind_fold][:, nonzero_cols], + y[index_fold][:, nonzero_cols], + self.org_pred[index_fold][:, nonzero_cols], ) else: score = ( - mean_absolute_error(y[ind_fold], self.org_pred[ind_fold]), - r2_score(y[ind_fold], self.org_pred[ind_fold]), + mean_absolute_error(y[index_fold], self.org_pred[index_fold]), + r2_score(y[index_fold], self.org_pred[index_fold]), ) score_imp_l.append(score) @@ -998,8 +1004,8 @@ def compute_importance(self, X=None, y=None): # Compute Importance and P-values pred_scores_full = [ - np.mean(self.pred_scores[ind_fold], axis=1) - for ind_fold in range(max(self.k_fold, 1)) + np.mean(self.pred_scores[index_fold], axis=1) + for index_fold in range(max(self.k_fold, 1)) ] results["importance"] = compute_imp_std(pred_scores_full)[0] results["std"] = compute_imp_std(pred_scores_full)[1] diff --git a/hidimstat/Dnn_learner.py b/hidimstat/Dnn_learner.py index 366e9be..48a476d 100644 --- a/hidimstat/Dnn_learner.py +++ b/hidimstat/Dnn_learner.py @@ -1,16 +1,19 @@ import numpy as np from sklearn.base import BaseEstimator -from .Dnn_learner_single import Dnn_learner_single +from .Dnn_learner_single import DnnLearnerSingle -class Dnn_learner(BaseEstimator): + +class DnnLearner(BaseEstimator): """ This class implements the high-level of the Multi-Layer Perceptron (MLP) - learner. + learner across multi-outputs. Parameters ---------- - encode : bool, default=False + preparing_test : bool, default=True + Whether to prepare the test set especially after stacking. + encoding_outcome : bool, default=False Whether to encode the categorical outcome. do_hypertuning : bool, default=True Tuning the hyperparameters of the provided estimator. 
@@ -22,14 +25,12 @@ class Dnn_learner(BaseEstimator): The minimal number of sub-DNNs to keep if > 10. batch_size : int, default=32 The number of samples per batch for training. - batch_size_val : int, default=128 + batch_size_validation : int, default=128 The number of samples per batch for validation. n_epoch : int, default=200 The number of epochs for the DNN learner(s). - verbose : int, default=0 - If verbose > 0, the fitted iterations will be printed. - sampling_with_repitition : bool, default=True - Application of sampling_with_repitition sampling for the training set. + sampling_with_repetition : bool, default=True + Application of sampling_with_repetition sampling for the training set. split_percentage : float, default=0.8 The training/validation cut for the provided data. problem_type : str, default='regression' @@ -59,20 +60,22 @@ class Dnn_learner(BaseEstimator): The cumsum of inputs after the linear sub-layers. random_state : int, default=2023 Fixing the seeds of the random generator. + verbose : int, default=0 + If verbose > 0, the fitted iterations will be printed. 
""" def __init__( self, - encode=False, + preparing_test=True, + encoding_outcome=False, do_hypertuning=False, dict_hypertuning=None, n_ensemble=10, min_keep=10, batch_size=32, - batch_size_val=128, + batch_size_validation=128, n_epoch=200, - verbose=0, - sampling_with_repitition=True, + sampling_with_repetition=True, split_percentage=0.8, problem_type="regression", list_continuous=None, @@ -87,18 +90,19 @@ def __init__( group_stacking=False, input_dimensions=None, random_state=2023, + verbose=0, ): self.list_estimators = [] - self.encode = encode + self.preparing_test = preparing_test + self.encoding_outcome = encoding_outcome self.do_hypertuning = do_hypertuning self.dict_hypertuning = dict_hypertuning self.n_ensemble = n_ensemble self.min_keep = min_keep self.batch_size = batch_size - self.batch_size_val = batch_size_val + self.batch_size_validation = batch_size_validation self.n_epoch = n_epoch - self.verbose = verbose - self.sampling_with_repitition = sampling_with_repitition + self.sampling_with_repetition = sampling_with_repetition self.split_percentage = split_percentage self.problem_type = problem_type self.list_grps = list_grps @@ -113,10 +117,9 @@ def __init__( self.group_stacking = group_stacking self.input_dimensions = input_dimensions self.random_state = random_state + self.verbose = verbose self.pred = [None] * n_ensemble - self.enc_y = [] - self.is_encoded = False - self.dim_repeat = 1 + self.dimension_repeat = 1 def fit(self, X, y=None): """ @@ -140,23 +143,24 @@ def fit(self, X, y=None): if (len(X.shape) != 3) or (X.shape[0] != y.shape[-1]): X = np.squeeze(X) X = np.array([X for i in range(y.shape[-1])]) - self.dim_repeat = y.shape[-1] + self.dimension_repeat = y.shape[-1] self.list_estimators = [None] * y.shape[-1] self.X_test = [None] * y.shape[-1] for y_col in range(y.shape[-1]): - self.list_estimators[y_col] = Dnn_learner_single( - encode=self.encode, + self.list_estimators[y_col] = DnnLearnerSingle( + preparing_test=self.preparing_test, + 
encoding_outcome=self.encoding_outcome, do_hypertuning=self.do_hypertuning, dict_hypertuning=self.dict_hypertuning, n_ensemble=self.n_ensemble, min_keep=self.min_keep, batch_size=self.batch_size, - batch_size_val=self.batch_size_val, + batch_size_validation=self.batch_size_validation, n_epoch=self.n_epoch, verbose=self.verbose, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, problem_type=self.problem_type, list_continuous=self.list_continuous, @@ -181,9 +185,9 @@ def hyper_tuning( self, X_train, y_train, - X_valid, - y_valid, - list_hyper=None, + X_validation, + y_validation, + list_hypertuning=None, random_state=None, ): """ @@ -196,27 +200,28 @@ def hyper_tuning( y_train : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) The target values (class labels in classification, real numbers in regression) for the training samples. - X_train : {array-like, sparse matrix} of shape (n_valid_samples, n_features) + X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) The validation input samples. - y_train : array-like of shape (n_valid_samples,) or (n_valid_samples, n_outputs) + y_validation : array-like of shape (n_validation_samples,) or (n_validation_samples, n_outputs) The target values (class labels in classification, real numbers in regression) for the validation samples. - list_hyper : list of tuples, default=None + list_hypertuning : list of tuples, default=None The list of tuples for the hyperparameters values. random_state : int, default=None Fixing the seeds of the random generator. 
""" - estimator = Dnn_learner_single( - encode=self.encode, + estimator = DnnLearnerSingle( + preparing_test=self.preparing_test, + encoding_outcome=self.encoding_outcome, do_hypertuning=self.do_hypertuning, dict_hypertuning=self.dict_hypertuning, n_ensemble=self.n_ensemble, min_keep=self.min_keep, batch_size=self.batch_size, - batch_size_val=self.batch_size_val, + batch_size_validation=self.batch_size_validation, n_epoch=self.n_epoch, verbose=self.verbose, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, problem_type=self.problem_type, list_continuous=self.list_continuous, @@ -233,10 +238,10 @@ def hyper_tuning( random_state=self.random_state, ) return estimator.hyper_tuning( - X_train, y_train, X_valid, y_valid, list_hyper, random_state + X_train, y_train, X_validation, y_validation, list_hypertuning, random_state ) - def predict(self, X, scale=True): + def predict(self, X): """ This function predicts the regression target for the input samples X. @@ -245,29 +250,28 @@ def predict(self, X, scale=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - scale : bool, default=True - Whether to scale the continuous input variables. Returns ------- - predictions : {array-like, sparse matrix) - The average predictions across the sub-DNN models. + predictions : {array-like, sparse matrix) of shape (n_test_samples, n_outputs) + The predictions across multi-outputs. """ if isinstance(X, list): - X = [self.check_X_dim(el) for el in X] + X = [self.check_X_dimension(el) for el in X] else: - X = self.check_X_dim(X) + X = self.check_X_dimension(X) list_res = [] for estimator_ind, estimator in enumerate(self.list_estimators): + estimator.preparing_test = self.preparing_test if isinstance(X, list): curr_X = [el[estimator_ind, ...] 
for el in X] - list_res.append(estimator.predict(curr_X, scale)) + list_res.append(estimator.predict(curr_X)) else: - list_res.append(estimator.predict(X[estimator_ind, ...], scale)) + list_res.append(estimator.predict(X[estimator_ind, ...])) self.X_test[estimator_ind] = estimator.X_test.copy() return np.array(list_res) - def predict_proba(self, X, scale=True): + def predict_proba(self, X): """ This function predicts the class probabilities for the input samples X. @@ -276,26 +280,25 @@ def predict_proba(self, X, scale=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - scale : bool, default=True - Whether to scale the continuous input variables. Returns ------- - predictions : {array-like, sparse matrix) - The average predictions across the sub-DNN models. + predictions : {array-like, sparse matrix) of shape (n_test_samples, n_outputs) + The predictions across multi-outputs. """ if isinstance(X, list): - X = [self.check_X_dim(el) for el in X] + X = [self.check_X_dimension(el) for el in X] else: - X = self.check_X_dim(X) + X = self.check_X_dimension(X) list_res = [] for estimator_ind, estimator in enumerate(self.list_estimators): + estimator.preparing_test = self.preparing_test if isinstance(X, list): curr_X = [el[estimator_ind, ...] 
for el in X] - list_res.append(estimator.predict_proba(curr_X, scale)) + list_res.append(estimator.predict_proba(curr_X)) else: - list_res.append(estimator.predict_proba(X[estimator_ind, ...], scale)) + list_res.append(estimator.predict_proba(X[estimator_ind, ...])) self.X_test[estimator_ind] = estimator.X_test.copy() return np.squeeze(np.array(list_res)) @@ -308,13 +311,13 @@ def set_params(self, **kwargs): for estimator in self.list_estimators: setattr(estimator, key, value) - def check_X_dim(self, X): + def check_X_dimension(self, X): """ This function checks for the compatibility of the dimensions of X """ - if (len(X.shape) != 3) or (X.shape[0] != self.dim_repeat): + if (len(X.shape) != 3) or (X.shape[0] != self.dimension_repeat): X = np.squeeze(X) - X = np.array([X for i in range(self.dim_repeat)]) + X = np.array([X for _ in range(self.dimension_repeat)]) return X diff --git a/hidimstat/Dnn_learner_single.py b/hidimstat/Dnn_learner_single.py index 64b7a7b..ddffe5b 100644 --- a/hidimstat/Dnn_learner_single.py +++ b/hidimstat/Dnn_learner_single.py @@ -19,14 +19,16 @@ ) -class Dnn_learner_single(BaseEstimator): +class DnnLearnerSingle(BaseEstimator): """ This class implements the Multi-Layer Perceptron (MLP) default inference learner for Block-Based Importance (BBI) framework. Parameters ---------- - encode : bool, default=False + preparing_test : bool, default=True + Whether to scale the continuous variables in the test set. + encoding_outcome : bool, default=False Whether to encode the categorical outcome. do_hypertuning : bool, default=True Tuning the hyperparameters of the provided estimator. @@ -38,14 +40,12 @@ class Dnn_learner_single(BaseEstimator): The minimal number of sub-DNNs to keep if > 10. batch_size : int, default=32 The number of samples per batch for training. - batch_size_val : int, default=128 + batch_size_validation : int, default=128 The number of samples per batch for validation. 
n_epoch : int, default=200 The number of epochs for the DNN learner(s). - verbose : int, default=0 - If verbose > 0, the fitted iterations will be printed. - sampling_with_repitition : bool, default=True - Application of sampling_with_repitition sampling for the training set + sampling_with_repetition : bool, default=True + Application of sampling_with_repetition sampling for the training set split_percentage : float, default=0.8 The training/validation cut for the provided data. problem_type : str, default='regression' @@ -75,20 +75,22 @@ class Dnn_learner_single(BaseEstimator): The cumsum of inputs after the linear sub-layers. random_state : int, default=2023 Fixing the seeds of the random generator. + verbose : int, default=0 + If verbose > 0, the fitted iterations will be printed. """ def __init__( self, - encode=False, + preparing_test=True, + encoding_outcome=False, do_hypertuning=False, dict_hypertuning=None, n_ensemble=10, min_keep=10, batch_size=32, - batch_size_val=128, + batch_size_validation=128, n_epoch=200, - verbose=0, - sampling_with_repitition=True, + sampling_with_repetition=True, split_percentage=0.8, problem_type="regression", list_continuous=None, @@ -103,17 +105,18 @@ def __init__( group_stacking=False, input_dimensions=None, random_state=2023, + verbose=0, ): - self.encode = encode + self.preparing_test = preparing_test + self.encoding_outcome = encoding_outcome self.do_hypertuning = do_hypertuning self.dict_hypertuning = dict_hypertuning self.n_ensemble = n_ensemble self.min_keep = min_keep self.batch_size = batch_size - self.batch_size_val = batch_size_val + self.batch_size_validation = batch_size_validation self.n_epoch = n_epoch - self.verbose = verbose - self.sampling_with_repitition = sampling_with_repitition + self.sampling_with_repetition = sampling_with_repetition self.split_percentage = split_percentage self.problem_type = problem_type self.list_grps = list_grps @@ -128,6 +131,7 @@ def __init__( self.group_stacking = group_stacking 
self.input_dimensions = input_dimensions self.random_state = random_state + self.verbose = verbose self.enc_y = [] self.activation_outcome = { "classification": softmax, @@ -159,9 +163,9 @@ def fit(self, X, y=None): y = y.reshape(-1, 1) # Disabling the encoding parameter with the regression case if self.problem_type == "regression": - self.encode = False + self.encoding_outcome = False - if self.encode: + if self.encoding_outcome: y_encoded = self.encode_outcome(y) self.is_encoded = True y_encoded = np.squeeze(y_encoded, axis=0) @@ -215,7 +219,7 @@ def fit(self, X, y=None): activation_outcome=self.activation_outcome, list_continuous=self.list_continuous, list_grps=self.list_grps, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, group_stacking=self.group_stacking, input_dimensions=self.input_dimensions, @@ -305,8 +309,8 @@ def hyper_tuning( self, X_train, y_train, - X_valid, - y_valid, + X_validation, + y_validation, list_hyper=None, random_state=None, ): @@ -320,9 +324,9 @@ def hyper_tuning( y_train : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) The target values (class labels in classification, real numbers in regression) for the training samples. - X_train : {array-like, sparse matrix} of shape (n_valid_samples, n_features) + X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) The validation input samples. - y_train : array-like of shape (n_valid_samples,) or (n_valid_samples, n_outputs) + y_validation : array-like of shape (n_validation_samples,) or (n_validation_samples, n_outputs) The target values (class labels in classification, real numbers in regression) for the validation samples. 
list_hyper : list of tuples, default=None @@ -334,7 +338,7 @@ def hyper_tuning( n_jobs=min(self.n_jobs, self.n_ensemble), verbose=self.verbose ) y_train = self.encode_outcome(y_train) - y_valid = self.encode_outcome(y_valid, train=False) + y_validation = self.encode_outcome(y_validation, train=False) return [ list( zip( @@ -342,8 +346,8 @@ def hyper_tuning( delayed(dnn_net)( X_train, y_train[i, ...], - X_valid, - y_valid[i, ...], + X_validation, + y_validation[i, ...], problem_type=self.problem_type, n_epoch=self.n_epoch, batch_size=self.batch_size, @@ -380,8 +384,8 @@ def __tuning_hyper(self, X, y): ( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, X_scaled, __, scaler_x, @@ -390,7 +394,7 @@ def __tuning_hyper(self, X, y): ) = create_X_y( X, y, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, problem_type=self.problem_type, list_continuous=self.list_continuous, @@ -400,8 +404,8 @@ def __tuning_hyper(self, X, y): list_loss = self.hyper_tuning( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, list_hyper, random_state=self.random_state, ) @@ -411,7 +415,7 @@ def __tuning_hyper(self, X, y): best_hyper = dict(zip(self.dict_hypertuning.keys(), best_hyper)) self.set_params(**best_hyper) - def predict(self, X, scale=True): + def predict(self, X): """ This function predicts the regression target for the input samples X. @@ -420,37 +424,35 @@ def predict(self, X, scale=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - scale : bool, default=True - Whether to scale the continuous input variables. Returns ------- - res_pred : {array-like, sparse matrix) + predictions : {array-like, sparse matrix) of shape (n_test_samples,) The average predictions across the sub-DNN models. 
""" if self.problem_type != "regression": raise Exception("Use the predict_proba function for classification") # Prepare the test set for the prediction - if scale: - X = self.__scale_test(X) + if self.preparing_test: + X = self._prepare_test(X) # Process the common prediction part - self.__pred_common(X) + self._pred_common(X) - res_pred = np.zeros((self.pred[0].shape)) + predictions = np.zeros((self.pred[0].shape)) total_n_elements = 0 for ind_mod, pred in enumerate(self.pred): - res_pred += ( + predictions += ( pred * self.optimal_list[ind_mod][1][1].scale_ + self.optimal_list[ind_mod][1][1].mean_ ) total_n_elements += 1 - res_pred = res_pred.copy() / total_n_elements + predictions = predictions.copy() / total_n_elements - return res_pred + return predictions - def predict_proba(self, X, scale=True): + def predict_proba(self, X): """ This function predicts the class probabilities for the input samples X. @@ -459,38 +461,39 @@ def predict_proba(self, X, scale=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - scale : bool, default=True - Whether to scale the continuous input variables. Returns ------- - res_pred : {array-like, sparse matrix) + predictions : {array-like, sparse matrix) of shape (n_test_samples,) The average predictions across the sub-DNN models. 
""" if self.problem_type == "regression": raise Exception("Use the predict function for classification") # Prepare the test set for the prediction - if scale: - X = self.__scale_test(X) + if self.preparing_test: + X = self._prepare_test(X) # Process the common prediction part - self.__pred_common(X) + self._pred_common(X) - res_pred = np.zeros((self.pred[0].shape)) + predictions = np.zeros((self.pred[0].shape)) total_n_elements = 0 for pred in self.pred: - res_pred += self.activation_outcome[self.problem_type](pred) + predictions += self.activation_outcome[self.problem_type](pred) total_n_elements += 1 - res_pred = res_pred.copy() / total_n_elements + predictions = predictions.copy() / total_n_elements if self.problem_type == "binary": - res_pred = np.array( - [[1 - res_pred[i][0], res_pred[i][0]] for i in range(res_pred.shape[0])] + predictions = np.array( + [ + [1 - predictions[i][0], predictions[i][0]] + for i in range(predictions.shape[0]) + ] ) - return res_pred + return predictions - def __scale_test(self, X): + def _prepare_test(self, X): """ This function prepares the input of the DNN estimator either in the default case or after applying the stacking method @@ -584,7 +587,7 @@ def __scale_test(self, X): self.X_test = X_test_n.copy() return X_test_n - def __pred_common(self, X): + def _pred_common(self, X): """ This function performs the prediction for the DNN learner @@ -594,7 +597,7 @@ def __pred_common(self, X): The input samples. 
""" if not self.group_stacking: - X = [X[0].copy() for i in range(self.n_ensemble)] + X = [X[0].copy() for _ in range(self.n_ensemble)] n_layer = len(self.optimal_list[0][0][0]) - 1 for ind_mod, mod in enumerate(self.optimal_list): diff --git a/hidimstat/__init__.py b/hidimstat/__init__.py index 47c955c..9eb5f9d 100644 --- a/hidimstat/__init__.py +++ b/hidimstat/__init__.py @@ -2,11 +2,11 @@ from .BBI import BlockBasedImportance from .clustered_inference import clustered_inference, hd_inference from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso -from .Dnn_learner_single import Dnn_learner_single +from .Dnn_learner_single import DnnLearnerSingle from .ensemble_clustered_inference import ensemble_clustered_inference from .importance_functions import compute_loco -from .knockoffs import model_x_knockoff from .knockoff_aggregation import knockoff_aggregation +from .knockoffs import model_x_knockoff from .multi_sample_split import aggregate_quantiles from .noise_std import group_reid, reid from .permutation_test import permutation_test_cv @@ -23,7 +23,7 @@ "compute_loco", "desparsified_lasso", "desparsified_group_lasso", - "Dnn_learner_single", + "DnnLearnerSingle", "ensemble_clustered_inference", "group_reid", "hd_inference", diff --git a/hidimstat/clustered_inference.py b/hidimstat/clustered_inference.py index 1273df9..5202f0a 100644 --- a/hidimstat/clustered_inference.py +++ b/hidimstat/clustered_inference.py @@ -1,10 +1,10 @@ import numpy as np -from sklearn.utils import resample from sklearn.preprocessing import StandardScaler +from sklearn.utils import resample from sklearn.utils.validation import check_memory +from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso from .stat_tools import pval_from_cb -from .desparsified_lasso import desparsified_lasso, desparsified_group_lasso def _subsampling(n_samples, train_size, groups=None, seed=0): diff --git a/hidimstat/compute_importance.py b/hidimstat/compute_importance.py 
index ec99a75..457f52b 100644 --- a/hidimstat/compute_importance.py +++ b/hidimstat/compute_importance.py @@ -1,6 +1,7 @@ +import itertools import warnings from collections import Counter -import itertools + import numpy as np from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.metrics import mean_absolute_error, r2_score, roc_auc_score @@ -617,7 +618,10 @@ def joblib_compute_conditional( if problem_type == "regression": if type_predictor == "DNN": - pred_i = estimator.predict(current_X_test_list, scale=False) + tmp_prepare_test_state = estimator.preparing_test + estimator.preparing_test = False + pred_i = estimator.predict(current_X_test_list) + estimator.preparing_test = tmp_prepare_test_state else: pred_i = estimator.predict(current_X_test_list[0].squeeze()) @@ -630,7 +634,10 @@ def joblib_compute_conditional( ) ** 2 else: if type_predictor == "DNN": - pred_i = estimator.predict_proba(current_X_test_list, scale=False) + tmp_prepare_test_state = estimator.preparing_test + estimator.preparing_test = False + pred_i = estimator.predict_proba(current_X_test_list) + estimator.preparing_test = tmp_prepare_test_state else: pred_i = convert_predict_proba( estimator.predict_proba(current_X_test_list[0].squeeze()) @@ -739,7 +746,10 @@ def joblib_compute_permutation( ) if type_predictor == "DNN": - pred_i = estimator.predict(current_X_test_list, scale=False) + tmp_prepare_test_state = estimator.preparing_test + estimator.preparing_test = False + pred_i = estimator.predict(current_X_test_list) + estimator.preparing_test = tmp_prepare_test_state else: pred_i = estimator.predict(current_X_test_list[0]) @@ -754,7 +764,10 @@ def joblib_compute_permutation( y_test[:, nonzero_cols], original_predictions[:, nonzero_cols] ) if type_predictor == "DNN": - pred_i = estimator.predict_proba(current_X_test_list, scale=False) + tmp_prepare_test_state = estimator.preparing_test + estimator.preparing_test = False + pred_i = 
estimator.predict_proba(current_X_test_list) + estimator.preparing_test = tmp_prepare_test_state else: pred_i = convert_predict_proba( estimator.predict_proba(current_X_test_list[0]) diff --git a/hidimstat/desparsified_lasso.py b/hidimstat/desparsified_lasso.py index a6169ce..aaa5e03 100644 --- a/hidimstat/desparsified_lasso.py +++ b/hidimstat/desparsified_lasso.py @@ -1,12 +1,12 @@ import numpy as np +from joblib import Parallel, delayed from numpy.linalg import multi_dot from scipy import stats from scipy.linalg import inv -from joblib import Parallel, delayed -from sklearn.utils.validation import check_memory from sklearn.linear_model import Lasso +from sklearn.utils.validation import check_memory -from .noise_std import reid, group_reid +from .noise_std import group_reid, reid from .stat_tools import pval_from_two_sided_pval_and_sign diff --git a/hidimstat/ensemble_clustered_inference.py b/hidimstat/ensemble_clustered_inference.py index d44a277..2c4233e 100644 --- a/hidimstat/ensemble_clustered_inference.py +++ b/hidimstat/ensemble_clustered_inference.py @@ -1,8 +1,8 @@ import numpy as np from joblib import Parallel, delayed -from .multi_sample_split import aggregate_medians, aggregate_quantiles from .clustered_inference import clustered_inference +from .multi_sample_split import aggregate_medians, aggregate_quantiles def _ensembling( @@ -59,7 +59,7 @@ def ensemble_clustered_inference( n_jobs=1, memory=None, verbose=1, - **kwargs + **kwargs, ): """Ensemble clustered inference algorithm @@ -171,7 +171,7 @@ def ensemble_clustered_inference( n_jobs=1, memory=memory, verbose=verbose, - **kwargs + **kwargs, ) for i in np.arange(seed, seed + n_bootstraps) ) diff --git a/hidimstat/importance_functions.py b/hidimstat/importance_functions.py index c6c61c8..a77e79c 100644 --- a/hidimstat/importance_functions.py +++ b/hidimstat/importance_functions.py @@ -1,10 +1,11 @@ -from hidimstat.Dnn_learner_single import Dnn_learner_single import numpy as np from scipy.stats import 
ttest_1samp from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import OneHotEncoder +from hidimstat.Dnn_learner_single import DnnLearnerSingle + def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed=2024): """ @@ -38,8 +39,8 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed= dict_encode_outcome = {"regression": False, "classification": True} if use_dnn: - clf_rf_full = Dnn_learner_single( - encode=dict_encode_outcome[problem_type], + clf_rf_full = DnnLearnerSingle( + encoding_outcome=dict_encode_outcome[problem_type], problem_type=problem_type, do_hypertuning=True, random_state=seed, @@ -85,8 +86,8 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed= # Retrain model for col in range(X.shape[1]): if use_dnn: - clf_rf_retrain = Dnn_learner_single( - encode=dict_encode_outcome[problem_type], + clf_rf_retrain = DnnLearnerSingle( + encoding_outcome=dict_encode_outcome[problem_type], problem_type=problem_type, do_hypertuning=True, random_state=seed, diff --git a/hidimstat/noise_std.py b/hidimstat/noise_std.py index 64080ef..4d953ea 100644 --- a/hidimstat/noise_std.py +++ b/hidimstat/noise_std.py @@ -1,6 +1,6 @@ import numpy as np from numpy.linalg import norm -from scipy.linalg import toeplitz, solve +from scipy.linalg import solve, toeplitz from sklearn.linear_model import LassoCV, MultiTaskLassoCV from sklearn.model_selection import KFold diff --git a/hidimstat/permutation_test.py b/hidimstat/permutation_test.py index a3249bd..3492385 100644 --- a/hidimstat/permutation_test.py +++ b/hidimstat/permutation_test.py @@ -1,11 +1,10 @@ import numpy as np from joblib import Parallel, delayed - from sklearn.base import clone -from sklearn.utils import _safe_indexing -from sklearn.svm import LinearSVR from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline 
+from sklearn.svm import LinearSVR +from sklearn.utils import _safe_indexing from hidimstat.stat_tools import pval_from_two_sided_pval_and_sign diff --git a/hidimstat/standardized_svr.py b/hidimstat/standardized_svr.py index 9158e2d..e6dcca3 100644 --- a/hidimstat/standardized_svr.py +++ b/hidimstat/standardized_svr.py @@ -1,8 +1,8 @@ import numpy as np from numpy.linalg import norm -from sklearn.svm import LinearSVR from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline +from sklearn.svm import LinearSVR def standardized_svr(X, y, Cs=np.logspace(-7, 1, 9), n_jobs=1): diff --git a/hidimstat/test/test_BBI.py b/hidimstat/test/test_BBI.py index b442d27..b7be1f9 100644 --- a/hidimstat/test/test_BBI.py +++ b/hidimstat/test/test_BBI.py @@ -4,10 +4,11 @@ import numpy as np import pandas as pd -from hidimstat.BBI import BlockBasedImportance from sklearn.datasets import make_classification, make_regression from sklearn.model_selection import train_test_split +from hidimstat.BBI import BlockBasedImportance + # Fixing the random seed rng = np.random.RandomState(2024) @@ -112,7 +113,7 @@ def test_BBI_splitting_scheme(): estimator="RF", do_hypertuning=True, dict_hypertuning=None, - sampling_with_repitition=True, + sampling_with_repetition=True, conditional=False, problem_type="regression", k_fold=2, @@ -130,7 +131,7 @@ def test_BBI_splitting_scheme(): estimator="RF", do_hypertuning=True, dict_hypertuning=None, - sampling_with_repitition=True, + sampling_with_repetition=True, split_percentage=0.8, conditional=False, problem_type="regression", diff --git a/hidimstat/test/test_adaptive_permutation_threshold.py b/hidimstat/test/test_adaptive_permutation_threshold.py index 88939a1..7886253 100644 --- a/hidimstat/test/test_adaptive_permutation_threshold.py +++ b/hidimstat/test/test_adaptive_permutation_threshold.py @@ -5,9 +5,9 @@ import numpy as np from numpy.testing import assert_almost_equal +from hidimstat.adaptive_permutation_threshold import 
ada_svr from hidimstat.scenario import multivariate_1D_simulation from hidimstat.stat_tools import pval_from_scale -from hidimstat.adaptive_permutation_threshold import ada_svr def test_ada_svr(): diff --git a/hidimstat/test/test_clustered_inference.py b/hidimstat/test/test_clustered_inference.py index 59f0ad5..94e04ef 100644 --- a/hidimstat/test/test_clustered_inference.py +++ b/hidimstat/test/test_clustered_inference.py @@ -3,13 +3,15 @@ """ import numpy as np +from numpy.testing import assert_almost_equal from sklearn.cluster import FeatureAgglomeration from sklearn.feature_extraction import image -from numpy.testing import assert_almost_equal -from hidimstat.scenario import multivariate_1D_simulation -from hidimstat.scenario import multivariate_temporal_simulation from hidimstat.clustered_inference import clustered_inference +from hidimstat.scenario import ( + multivariate_1D_simulation, + multivariate_temporal_simulation, +) def test_clustered_inference(): diff --git a/hidimstat/test/test_desparsified_lasso.py b/hidimstat/test/test_desparsified_lasso.py index bd70d83..713bd2c 100644 --- a/hidimstat/test/test_desparsified_lasso.py +++ b/hidimstat/test/test_desparsified_lasso.py @@ -6,10 +6,11 @@ from numpy.testing import assert_almost_equal, assert_equal from scipy.linalg import toeplitz -from hidimstat.scenario import multivariate_1D_simulation -from hidimstat.scenario import multivariate_temporal_simulation -from hidimstat.desparsified_lasso import desparsified_lasso -from hidimstat.desparsified_lasso import desparsified_group_lasso +from hidimstat.desparsified_lasso import desparsified_group_lasso, desparsified_lasso +from hidimstat.scenario import ( + multivariate_1D_simulation, + multivariate_temporal_simulation, +) def test_desparsified_lasso(): diff --git a/hidimstat/test/test_ensemble_clustered_inference.py b/hidimstat/test/test_ensemble_clustered_inference.py index bc8a7ed..5129a22 100644 --- a/hidimstat/test/test_ensemble_clustered_inference.py +++ 
b/hidimstat/test/test_ensemble_clustered_inference.py @@ -3,13 +3,15 @@ """ import numpy as np +from numpy.testing import assert_almost_equal from sklearn.cluster import FeatureAgglomeration from sklearn.feature_extraction import image -from numpy.testing import assert_almost_equal -from hidimstat.scenario import multivariate_1D_simulation -from hidimstat.scenario import multivariate_temporal_simulation from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference +from hidimstat.scenario import ( + multivariate_1D_simulation, + multivariate_temporal_simulation, +) def test_ensemble_clustered_inference(): diff --git a/hidimstat/test/test_importance_funtions.py b/hidimstat/test/test_importance_funtions.py index 8ee3127..80f6b32 100644 --- a/hidimstat/test/test_importance_funtions.py +++ b/hidimstat/test/test_importance_funtions.py @@ -2,11 +2,12 @@ Test the importance functions module """ -from hidimstat.importance_functions import compute_loco import numpy as np import pandas as pd from sklearn.datasets import make_classification, make_regression +from hidimstat.importance_functions import compute_loco + # Fixing the random seed rng = np.random.RandomState(2024) diff --git a/hidimstat/test/test_knockoff_aggregation.py b/hidimstat/test/test_knockoff_aggregation.py index d4895b8..a20c8c8 100644 --- a/hidimstat/test/test_knockoff_aggregation.py +++ b/hidimstat/test/test_knockoff_aggregation.py @@ -1,4 +1,5 @@ import numpy as np + from hidimstat import knockoff_aggregation, model_x_knockoff from hidimstat.data_simulation import simu_data from hidimstat.utils import cal_fdp_power diff --git a/hidimstat/test/test_model_x_knockoff.py b/hidimstat/test/test_model_x_knockoff.py index 42bbae7..504e469 100644 --- a/hidimstat/test/test_model_x_knockoff.py +++ b/hidimstat/test/test_model_x_knockoff.py @@ -1,5 +1,5 @@ -from hidimstat.data_simulation import simu_data from hidimstat import model_x_knockoff +from hidimstat.data_simulation import simu_data from 
hidimstat.utils import cal_fdp_power seed = 42 diff --git a/hidimstat/test/test_noise_std.py b/hidimstat/test/test_noise_std.py index 6d5ec6a..cb89052 100644 --- a/hidimstat/test/test_noise_std.py +++ b/hidimstat/test/test_noise_std.py @@ -6,9 +6,11 @@ from numpy.testing import assert_almost_equal from scipy.linalg import toeplitz -from hidimstat.scenario import multivariate_1D_simulation -from hidimstat.scenario import multivariate_temporal_simulation -from hidimstat.noise_std import reid, group_reid, empirical_snr +from hidimstat.noise_std import empirical_snr, group_reid, reid +from hidimstat.scenario import ( + multivariate_1D_simulation, + multivariate_temporal_simulation, +) def test_reid(): diff --git a/hidimstat/test/test_permutation_test.py b/hidimstat/test/test_permutation_test.py index c33ae4e..6e1835d 100644 --- a/hidimstat/test/test_permutation_test.py +++ b/hidimstat/test/test_permutation_test.py @@ -5,8 +5,8 @@ import numpy as np from numpy.testing import assert_almost_equal -from hidimstat.scenario import multivariate_1D_simulation from hidimstat.permutation_test import permutation_test_cv +from hidimstat.scenario import multivariate_1D_simulation def test_permutation_test(): diff --git a/hidimstat/test/test_scenario.py b/hidimstat/test/test_scenario.py index e655e43..503f684 100644 --- a/hidimstat/test/test_scenario.py +++ b/hidimstat/test/test_scenario.py @@ -5,9 +5,11 @@ import numpy as np from numpy.testing import assert_almost_equal, assert_equal -from hidimstat.scenario import multivariate_1D_simulation -from hidimstat.scenario import multivariate_simulation -from hidimstat.scenario import multivariate_temporal_simulation +from hidimstat.scenario import ( + multivariate_1D_simulation, + multivariate_simulation, + multivariate_temporal_simulation, +) ROI_SIZE_2D = 2 SHAPE_2D = (12, 12) diff --git a/hidimstat/test/test_standardized_svr.py b/hidimstat/test/test_standardized_svr.py index 945108e..a6665c5 100644 --- 
a/hidimstat/test/test_standardized_svr.py +++ b/hidimstat/test/test_standardized_svr.py @@ -6,8 +6,8 @@ from numpy.testing import assert_almost_equal from hidimstat.scenario import multivariate_1D_simulation -from hidimstat.stat_tools import pval_from_scale from hidimstat.standardized_svr import standardized_svr +from hidimstat.stat_tools import pval_from_scale def test_standardized_svr(): diff --git a/hidimstat/test/test_stat_tools.py b/hidimstat/test/test_stat_tools.py index 7761110..5e5f8fa 100644 --- a/hidimstat/test/test_stat_tools.py +++ b/hidimstat/test/test_stat_tools.py @@ -8,14 +8,14 @@ from hidimstat.stat_tools import ( _replace_infinity, pval_corr_from_pval, - pval_from_scale, - zscore_from_cb, pval_from_cb, - two_sided_pval_from_zscore, - two_sided_pval_from_cb, - zscore_from_pval, + pval_from_scale, pval_from_two_sided_pval_and_sign, + two_sided_pval_from_cb, two_sided_pval_from_pval, + two_sided_pval_from_zscore, + zscore_from_cb, + zscore_from_pval, ) diff --git a/hidimstat/utils.py b/hidimstat/utils.py index 1e30337..0c9c421 100644 --- a/hidimstat/utils.py +++ b/hidimstat/utils.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- -# Authors: Binh Nguyen & Jerome-Alexis Chevalier & Ahmad Chamma import copy import numpy as np import torch @@ -103,7 +101,7 @@ def _fixed_quantile_aggregation(pvals, gamma=0.5): Parameters ---------- - pvals : 2D ndarray (n_sampling_with_repitition, n_test) + pvals : 2D ndarray (n_sampling_with_repetition, n_test) p-value (adjusted) gamma : float @@ -131,7 +129,7 @@ def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): def create_X_y( X, y, - sampling_with_repitition=True, + sampling_with_repetition=True, split_percentage=0.8, problem_type="regression", list_continuous=None, @@ -146,8 +144,8 @@ def create_X_y( The input samples before the splitting process. y : ndarray, shape (n_samples, ) The output samples before the splitting process. 
- sampling_with_repitition : bool, default=True - Sampling with repitition the train part of the train/valid scheme under + sampling_with_repetition : bool, default=True + Sampling with repetition the train part of the train/valid scheme under the training set. The number of training samples in train is equal to the number of instances in the training set. split_percentage : float, default=0.8 @@ -162,16 +160,16 @@ def create_X_y( Returns ------- X_train_scaled : {array-like, sparse matrix} of shape (n_train_samples, n_features) - The sampling_with_repititionped training input samples with scaled continuous variables. + The resampled training input samples with scaled continuous variables. y_train_scaled : {array-like} of shape (n_train_samples, ) - The sampling_with_repititionped training output samples scaled if continous. - X_valid_scaled : {array-like, sparse matrix} of shape (n_valid_samples, n_features) + The resampled training output samples scaled if continuous. + X_validation_scaled : {array-like, sparse matrix} of shape (n_validation_samples, n_features) The validation input samples with scaled continuous variables. - y_valid_scaled : {array-like} of shape (n_valid_samples, ) + y_validation_scaled : {array-like} of shape (n_validation_samples, ) The validation output samples scaled if continous. X_scaled : {array-like, sparse matrix} of shape (n_samples, n_features) The original input samples with scaled continuous variables. - y_valid : {array-like} of shape (n_samples, ) + y_validation : {array-like} of shape (n_samples, ) The original output samples with validation indices. scaler_x : Scikit-learn StandardScaler The standard scaler encoder for the continuous variables of the input.
@@ -184,7 +182,7 @@ def create_X_y( scaler_x, scaler_y = StandardScaler(), StandardScaler() n = X.shape[0] - if sampling_with_repitition: + if sampling_with_repetition: train_ind = rng.choice(n, n, replace=True) else: train_ind = rng.choice( @@ -192,36 +190,36 @@ def create_X_y( ) valid_ind = np.array([ind for ind in range(n) if ind not in train_ind]) - X_train, X_valid = X[train_ind], X[valid_ind] - y_train, y_valid = y[train_ind], y[valid_ind] + X_train, X_validation = X[train_ind], X[valid_ind] + y_train, y_validation = y[train_ind], y[valid_ind] # Scaling X and y X_train_scaled = X_train.copy() - X_valid_scaled = X_valid.copy() + X_validation_scaled = X_validation.copy() X_scaled = X.copy() if len(list_continuous) > 0: X_train_scaled[:, list_continuous] = scaler_x.fit_transform( X_train[:, list_continuous] ) - X_valid_scaled[:, list_continuous] = scaler_x.transform( - X_valid[:, list_continuous] + X_validation_scaled[:, list_continuous] = scaler_x.transform( + X_validation[:, list_continuous] ) X_scaled[:, list_continuous] = scaler_x.transform(X[:, list_continuous]) if problem_type == "regression": y_train_scaled = scaler_y.fit_transform(y_train) - y_valid_scaled = scaler_y.transform(y_valid) + y_validation_scaled = scaler_y.transform(y_validation) else: y_train_scaled = y_train.copy() - y_valid_scaled = y_valid.copy() + y_validation_scaled = y_validation.copy() return ( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, X_scaled, - y_valid, + y_validation, scaler_x, scaler_y, valid_ind, @@ -315,7 +313,7 @@ def joblib_ensemble_dnnet( activation_outcome=None, list_continuous=None, list_grps=None, - sampling_with_repitition=False, + sampling_with_repetition=False, split_percentage=0.8, group_stacking=False, input_dimensions=None, @@ -349,8 +347,8 @@ def joblib_ensemble_dnnet( list_grps : list of lists, default=None A list collecting the indices of the groups' variables while applying the stacking 
method. - sampling_with_repitition : bool, default=True - Application of sampling_with_repitition sampling for the training set. + sampling_with_repetition : bool, default=False + Whether to sample the training set with repetition. split_percentage : float, default=0.8 The training/validation cut for the provided data. group_stacking : bool, default=False @@ -395,17 +393,17 @@ def joblib_ensemble_dnnet( ( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, X_scaled, - y_valid, + y_validation, scaler_x, scaler_y, valid_ind, ) = create_X_y( X, y, - sampling_with_repitition=sampling_with_repitition, + sampling_with_repetition=sampling_with_repetition, split_percentage=split_percentage, problem_type=problem_type, list_continuous=list_continuous, @@ -415,8 +413,8 @@ def joblib_ensemble_dnnet( current_model = dnn_net( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, problem_type=problem_type, n_epoch=n_epoch, batch_size=batch_size, @@ -476,12 +474,14 @@ def joblib_ensemble_dnnet( pred_v = pred * scaler_y.scale_ + scaler_y.mean_ else: pred_v = activation_outcome[problem_type](pred) - loss = np.std(y_valid) ** 2 - mean_squared_error(y_valid, pred_v[valid_ind]) + loss = np.std(y_validation) ** 2 - mean_squared_error( + y_validation, pred_v[valid_ind] + ) else: pred_v = activation_outcome[problem_type](pred) loss = log_loss( - y_valid, np.ones(y_valid.shape) * np.mean(y_valid, axis=0) - ) - log_loss(y_valid, pred_v[valid_ind]) + y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0) + ) - log_loss(y_validation, pred_v[valid_ind]) return (current_model, scaler_x, scaler_y, pred_v, loss) @@ -649,12 +649,12 @@ def evaluate(model, loader, device, problem_type): def dnn_net( X_train, y_train, - X_valid, - y_valid, + X_validation, + y_validation, problem_type="regression", n_epoch=200, batch_size=32, -
batch_size_val=128, + batch_size_validation=128, beta1=0.9, beta2=0.999, lr=1e-3, @@ -677,9 +677,9 @@ def dnn_net( The training input samples. y_train : {array-like} of shape (n_train_samples, ) The training output samples. - X_valid : {array-like, sparse matrix} of shape (n_valid_samples, n_features) + X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) The validation input samples. - y_valid : {array-like} of shape (n_valid_samples, ) + y_validation : {array-like} of shape (n_validation_samples, ) The validation output samples. problem_type : str, default='regression' A classification or a regression problem. @@ -687,7 +687,7 @@ def dnn_net( The number of epochs for the DNN learner(s). batch_size : int, default=32 The number of samples per batch for training. - batch_size_val : int, default=128 + batch_size_validation : int, default=128 The number of samples per batch for validation. beta1 : float, default=0.9 The exponential decay rate for the first moment estimates. 
@@ -720,7 +720,9 @@ def dnn_net( shuffle=True, batch_size=batch_size, ) - validate_loader = Dataset_Loader(X_valid, y_valid, batch_size=batch_size_val) + validate_loader = Dataset_Loader( + X_validation, y_validation, batch_size=batch_size_validation + ) # Set the seed for PyTorch's random number generator torch.manual_seed(random_state) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..0eff25f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,68 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "HiDimStat" +version = "0.0.1" +dependencies = [ + "joblib>=1.4.2", + "numpy>=2.0.0", + "pandas>=2.2.2", + "scikit-learn>=1.5.1", + "scipy>=1.14.0", + "torch>=2.3.1", + "torchmetrics>=1.4.0.post0", +] +requires-python = ">=3.12" +authors = [{name = "HiDimStat developers"}] +maintainers = [ + {name = "Bertrand Thirion", email = "bertrand.thirion@inria.fr"} +] +description = "High-dimensional statistical inference tools for Python" +readme = "README.md" +classifiers = [ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Operating System :: Microsoft :: Windows", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", +] +license = {file = "LICENSE"} + +[project.optional-dependencies] +# Requirements necessary for building the documentation +doc = [ + "memory_profiler>=0.61.0", + "mne>=1.7.1", + "nilearn>=0.10.4", + "numpydoc>=1.7.0", + "pillow>=10.4.0", + "PyQt5>=5.15.10", + "pyvista>=0.44.0", + "pyvistaqt>=0.11.1", + "sphinx-bootstrap-theme>=0.8.1", + "sphinxcontrib-bibtex>=2.6.2", + "sphinx-gallery>=0.16.0", +] +plotting = [ + "matplotlib>=3.9.1", +] +style = [ + "black>=24.4.2", + "isort>=5.13.2", +] +# For running unit and docstring tests +test = [ + "coverage>=7.6.0", + "pytest>=8.2.2", + "pytest-cov>=5.0.0" +] + +[project.urls] 
+Development = "https://github.com/nilearn/nilearn" +Homepage = "https://mind-inria.github.io/hidimstat" +Repository = "https://github.com/mind-inria/hidimstat" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2372b18..0000000 --- a/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -numpy -joblib -scipy -scikit-learn -torch -torchmetrics -pandas -mne diff --git a/setup.py b/setup.py deleted file mode 100644 index 67a37e1..0000000 --- a/setup.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python - -import os -import sys - -from setuptools import find_packages - -PKG = "hidimstat" -DESCRIPTION = "High-dimensional statistical inference tools for Python" -LONG_DESCRIPTION = open("README.md").read() -MAINTAINER = "Chevalier (ja-che), Nguyen (tbng), Alexandre Blain (alexblnn), Ahmad Chamma (achamma723) and Bertrand Thirion (bthirion)" -MAINTAINER_EMAIL = "bertrand.thirion@inria.fr" -URL = "https://github.com/Parietal-INRIA/hidimstat" -DOWNLOAD_URL = "https://github.com/Parietal-INRIA/hidimstat" -LICENSE = "BSD" - - -def load_version(): - """Executes hidimstat/version.py in a globals dictionary and return it. - Following format from Nilearn repo on github. - """ - # load all vars into globals, otherwise - # the later function call using global vars doesn't work. 
- globals_dict = {} - with open(os.path.join("hidimstat", "version.py")) as fp: - exec(fp.read(), globals_dict) - - return globals_dict - - -def setup_package(version): - local_path = os.path.dirname(os.path.abspath(sys.argv[0])) - - os.chdir(local_path) - sys.path.insert(0, local_path) - - from numpy.distutils.core import setup - - setup( - packages=find_packages(exclude=["contrib", "docs", "tests"]), - name=PKG, - maintainer=MAINTAINER, - include_package_data=True, - maintainer_email=MAINTAINER_EMAIL, - description=DESCRIPTION, - long_description=LONG_DESCRIPTION, - long_description_content_type="text/markdown", - license=LICENSE, - url=URL, - version=version, - # download_url=DOWNLOAD_URL, - zip_safe=False, # the package can run out of an .egg file - classifiers=[ - "Programming Language :: Python", - "Programming Language :: Python :: 3.5", - "Development Status :: 3 - Alpha", - ], - ) - - -_VERSION_GLOBALS = load_version() -VERSION = _VERSION_GLOBALS["__version__"] - -if __name__ == "__main__": - setup_package(VERSION)