From bea5578eb279f409d9632ef687a0e3660e51fddb Mon Sep 17 00:00:00 2001 From: achamma723 Date: Thu, 18 Jul 2024 15:28:34 +0200 Subject: [PATCH] Tackle comments + standardize predict functions --- hidimstat/BBI.py | 239 ++++++++++++++++-------------- hidimstat/Dnn_learner.py | 116 ++++++++------- hidimstat/Dnn_learner_single.py | 117 ++++++++------- hidimstat/__init__.py | 4 +- hidimstat/compute_importance.py | 20 ++- hidimstat/importance_functions.py | 10 +- hidimstat/test/test_BBI.py | 4 +- hidimstat/utils.py | 85 +++++------ 8 files changed, 311 insertions(+), 284 deletions(-) diff --git a/hidimstat/BBI.py b/hidimstat/BBI.py index b6d90a7..521b855 100644 --- a/hidimstat/BBI.py +++ b/hidimstat/BBI.py @@ -22,7 +22,7 @@ from sklearn.utils.validation import check_is_fitted from .compute_importance import joblib_compute_conditional, joblib_compute_permutation -from .Dnn_learner import Dnn_learner +from .Dnn_learner import DnnLearner from .utils import compute_imp_std, convert_predict_proba, create_X_y @@ -72,8 +72,10 @@ class BlockBasedImportance(BaseEstimator, TransformerMixin): inference estimator. problem_type : str, default='regression' A classification or a regression problem. - sampling_with_repitition : bool, default=True - Sampling with repitition the train part of the train/valid scheme under + encoding_input : bool, default=True + To one-hot or ordinal encode the nominal and ordinal input variables. + sampling_with_repetition : bool, default=True + Sampling with repetition the train part of the train/valid scheme under the training set. The number of training samples in train is equal to the number of instances in the training set. split_percentage : float, default=0.8 @@ -108,7 +110,7 @@ class BlockBasedImportance(BaseEstimator, TransformerMixin): random_state : int, default=2023 Fixing the seeds of the random generator. do_compute_importance : boolean, default=True - Whether to Compute the Importance Scores. + Whether to compute the Importance Scores. group_fold : list, default=None The list of group labels to perform GroupKFold to keep subjects within the same training or test set. 
@@ -126,7 +128,8 @@ def __init__( do_hypertuning=True, dict_hypertuning=None, problem_type="regression", - sampling_with_repitition=True, + encoding_input=True, + sampling_with_repetition=True, split_percentage=0.8, conditional=True, variables_categories=None, @@ -150,7 +153,8 @@ def __init__( self.do_hypertuning = do_hypertuning self.dict_hypertuning = dict_hypertuning self.problem_type = problem_type - self.sampling_with_repitition = sampling_with_repitition + self.encoding_input = encoding_input + self.sampling_with_repetition = sampling_with_repetition self.split_percentage = split_percentage self.conditional = conditional self.variables_categories = variables_categories @@ -402,21 +406,23 @@ def fit(self, X, y=None): X_prev = X.copy() X = np.zeros((y.shape[0], len(self.list_grps))) - for ind_fold, (train, test) in enumerate(cv.split(X_prev)): + for index_fold, (train, test) in enumerate(cv.split(X_prev)): X_train, X_test = X_prev[train], X_prev[test] y_train, _ = y[train], y[test] if len(self.coffeine_transformers) > 1: - X_train = self.coffeine_transformers[ind_fold].fit_transform( + X_train = self.coffeine_transformers[index_fold].fit_transform( pd.DataFrame(X_train, columns=self.X_cols), np.ravel(y_train), ) - X_test = self.coffeine_transformers[ind_fold].transform( + X_test = self.coffeine_transformers[index_fold].transform( pd.DataFrame(X_test, columns=self.X_cols) ) for grp_ind, grp in enumerate(self.list_grps): - self.ridge_mods[ind_fold][grp_ind].fit(X_train[:, grp], y_train) + self.ridge_mods[index_fold][grp_ind].fit( + X_train[:, grp], y_train + ) X[test, grp_ind] = ( - self.ridge_mods[ind_fold][grp_ind] + self.ridge_mods[index_fold][grp_ind] .predict(X_test[:, grp]) .ravel() ) @@ -440,9 +446,9 @@ def fit(self, X, y=None): # Initialize the first estimator (block learner) if self.estimator == "DNN": - self.estimator = Dnn_learner( + self.estimator = DnnLearner( + encoding_outcome=True, problem_type=self.problem_type, - encode=True, do_hypertuning=False, list_continuous=self.list_continuous, list_grps=self.list_grps, @@ -482,12 +488,12 @@ def fit(self, X, y=None): ) list_splits = kf.split(X) - for ind_fold, (train_index, test_index) in enumerate(list_splits): - print(f"Processing: {ind_fold+1}") + for index_fold, (train_index, test_index) in enumerate(list_splits): + print(f"Processing: {index_fold+1}") X_fold = X.copy() y_fold = y.copy() - self.X_nominal[ind_fold] = X_nominal_org.iloc[test_index, :] + self.X_nominal[index_fold] = X_nominal_org.iloc[test_index, :] X_train, X_test = ( X_fold[train_index, :], @@ -498,26 +504,26 @@ def fit(self, X, y=None): if not self.apply_ridge: if self.coffeine_transformer is not None: - X_train = self.coffeine_transformers[ind_fold].fit_transform( + X_train = self.coffeine_transformers[index_fold].fit_transform( pd.DataFrame(X_train, columns=self.X_cols), np.ravel(y_train), ) - X_test = self.coffeine_transformers[ind_fold].transform( + X_test = self.coffeine_transformers[index_fold].transform( pd.DataFrame(X_test, columns=self.X_cols) ) - self.X_test[ind_fold] = X_test.copy() - self.y_test[ind_fold] = y_test.copy() - self.y_train[ind_fold] = y_train.copy() + self.X_test[index_fold] = X_test.copy() + self.y_test[index_fold] = y_test.copy() + self.y_train[index_fold] = y_train.copy() # Find the list of optimal sub-models to be used in the # following steps (Default estimator) if self.do_hypertuning: - self.__tuning_hyper(X_train, y_train, ind_fold) + self.__tuning_hyper(X_train, y_train, index_fold) if self.type == "DNN": 
self.estimator.fit(X_train, y_train) - self.list_estimators[ind_fold] = copy(self.estimator) + self.list_estimators[index_fold] = copy(self.estimator) else: self.y_train = y.copy() @@ -546,7 +552,7 @@ def fit(self, X, y=None): self.is_fitted = True return self - def __tuning_hyper(self, X, y, ind_fold=None): + def __tuning_hyper(self, X, y, index_fold=None): """ Tune the hyperparameters of the provided inference estimator. @@ -557,16 +563,16 @@ def __tuning_hyper(self, X, y, ind_fold=None): y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) The target values (class labels in classification, real numbers in regression). - ind_fold : int, default=None - The indice of the corresponding fold. + index_fold : int, default=None + The index of the corresponding fold. """ if not ((self.apply_ridge) and (self.group_stacking)): ( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, X_scaled, __, scaler_x, @@ -575,14 +581,14 @@ def __tuning_hyper(self, X, y, ind_fold=None): ) = create_X_y( X, y, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, problem_type=self.problem_type, list_continuous=self.list_continuous, random_state=self.random_state, ) if self.dict_hypertuning is not None: - list_hyper = list( + list_hypertuning = list( itertools.product(*list(self.dict_hypertuning.values())) ) list_loss = [] @@ -590,32 +596,32 @@ def __tuning_hyper(self, X, y, ind_fold=None): list_loss = self.estimator.hyper_tuning( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, - list_hyper, + X_validation_scaled, + y_validation_scaled, + list_hypertuning, random_state=self.random_state, ) else: if self.dict_hypertuning is None: self.estimator.fit(X_scaled, y) # If not a DNN learner case, need to save the scalers - self.scaler_x[ind_fold] = scaler_x - self.scaler_y[ind_fold] = scaler_y + self.scaler_x[index_fold] = scaler_x + self.scaler_y[index_fold] = scaler_y return else: - for ind_el, el in enumerate(list_hyper): + for ind_el, el in enumerate(list_hypertuning): curr_params = dict( (k, v) for v, k in zip(el, list(self.dict_hypertuning.keys())) ) - list_hyper[ind_el] = curr_params + list_hypertuning[ind_el] = curr_params self.estimator.set_params(**curr_params) if self.problem_type == "regression": y_train_curr = ( y_train_scaled * scaler_y.scale_ + scaler_y.mean_ ) - y_valid_curr = ( - y_valid_scaled * scaler_y.scale_ + scaler_y.mean_ + y_validation_curr = ( + y_validation_scaled * scaler_y.scale_ + scaler_y.mean_ ) def func(x): @@ -623,7 +629,7 @@ def func(x): else: y_train_curr = y_train_scaled.copy() - y_valid_curr = y_valid_scaled.copy() + y_validation_curr = y_validation_scaled.copy() def func(x): return self.estimator.predict_proba(x) @@ -633,17 +639,19 @@ def func(x): if self.problem_type == "classification": list_loss.append( self.loss( - y_valid_curr, - func(X_valid_scaled)[:, np.unique(y_valid_curr)], + y_validation_curr, + func(X_validation_scaled)[ + :, np.unique(y_validation_curr) + ], ) ) else: list_loss.append( - self.loss(y_valid_curr, func(X_valid_scaled)) + self.loss(y_validation_curr, func(X_validation_scaled)) ) ind_min = np.argmin(list_loss) - best_hyper = list_hyper[ind_min] + best_hyper = list_hypertuning[ind_min] if not isinstance(best_hyper, dict): best_hyper = dict(zip(self.dict_hypertuning.keys(), best_hyper)) @@ -651,13 +659,13 @@ def func(x): self.estimator.fit(X_scaled, y) # If not 
a DNN learner case, need to save the scalers - self.scaler_x[ind_fold] = scaler_x - self.scaler_y[ind_fold] = scaler_y + self.scaler_x[index_fold] = scaler_x + self.scaler_y[index_fold] = scaler_y else: self.estimator.fit(X, y) - def predict(self, X=None, encoding=True): + def predict(self, X=None): """ This function predicts the regression target for the input samples X. @@ -666,12 +674,11 @@ def predict(self, X=None, encoding=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), defaut=None The input samples. - encoding : bool, default=True - Whether to encode the non-continuous input variables. Returns ------- - Average predictions across all samples. + predictions: array-like of shape (n_test_samples,) + The average predictions across all folds. """ if not isinstance(X, list): list_X = [X.copy() for el in range(max(self.k_fold, 1))] @@ -680,37 +687,37 @@ def predict(self, X=None, encoding=True): list_X = X.copy() mean_pred = False - for ind_fold, curr_X in enumerate(list_X): + for index_fold, curr_X in enumerate(list_X): # Prepare the test set for the prediction - if encoding: - X_tmp = self.__encode_input(curr_X) + if self.encoding_input: + X_tmp = self._encode_input(curr_X) else: X_tmp = curr_X.copy() if self.type != "DNN": if not isinstance(curr_X, np.ndarray): X_tmp = np.array(X_tmp) - if self.scaler_x[ind_fold] is not None: - X_tmp[:, self.list_continuous] = self.scaler_x[ind_fold].transform( - X_tmp[:, self.list_continuous] - ) - self.X_proc[ind_fold] = [X_tmp.copy()] + if self.scaler_x[index_fold] is not None: + X_tmp[:, self.list_continuous] = self.scaler_x[ + index_fold + ].transform(X_tmp[:, self.list_continuous]) + self.X_proc[index_fold] = [X_tmp.copy()] - self.org_pred[ind_fold] = self.list_estimators[ind_fold].predict(X_tmp) + self.org_pred[index_fold] = self.list_estimators[index_fold].predict(X_tmp) # Convert to the (n_samples x n_outputs) format - if len(self.org_pred[ind_fold].shape) != 2: - self.org_pred[ind_fold] = self.org_pred[ind_fold].reshape(-1, 1) + if len(self.org_pred[index_fold].shape) != 2: + self.org_pred[index_fold] = self.org_pred[index_fold].reshape(-1, 1) if self.type == "DNN": - self.X_proc[ind_fold] = np.array( - self.list_estimators[ind_fold].X_test.copy() + self.X_proc[index_fold] = np.array( + self.list_estimators[index_fold].X_test.copy() ).swapaxes(0, 1) if mean_pred: return np.mean(np.array(self.org_pred), axis=0) - def predict_proba(self, X=None, encoding=True): + def predict_proba(self, X=None): """ This function predicts the class probabilities for the input samples X. @@ -719,12 +726,11 @@ def predict_proba(self, X=None, encoding=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - encoding : bool, default=True - Whether to encode the non-continuous input variables. Returns ------- - Average predictions across all samples. + predictions: array-like of shape (n_test_samples,) + The average predictions across all folds. 
""" if not isinstance(X, list): list_X = [X.copy() for el in range(max(self.k_fold, 1))] @@ -733,37 +739,39 @@ def predict_proba(self, X=None, encoding=True): list_X = X.copy() mean_pred = False - for ind_fold, curr_X in enumerate(list_X): + for index_fold, curr_X in enumerate(list_X): # Prepare the test set for the prediction - if encoding: - X_tmp = self.__encode_input(curr_X) + if self.encoding_input: + X_tmp = self._encode_input(curr_X) else: X_tmp = curr_X.copy() if self.type != "DNN": if not isinstance(curr_X, np.ndarray): X_tmp = np.array(X_tmp) - if self.scaler_x[ind_fold] is not None: - X_tmp[:, self.list_continuous] = self.scaler_x[ind_fold].transform( - X_tmp[:, self.list_continuous] - ) - self.X_proc[ind_fold] = [X_tmp.copy()] + if self.scaler_x[index_fold] is not None: + X_tmp[:, self.list_continuous] = self.scaler_x[ + index_fold + ].transform(X_tmp[:, self.list_continuous]) + self.X_proc[index_fold] = [X_tmp.copy()] - self.org_pred[ind_fold] = self.list_estimators[ind_fold].predict_proba( + self.org_pred[index_fold] = self.list_estimators[index_fold].predict_proba( X_tmp ) if self.type == "DNN": - self.X_proc[ind_fold] = np.array( - self.list_estimators[ind_fold].X_test.copy() + self.X_proc[index_fold] = np.array( + self.list_estimators[index_fold].X_test.copy() ).swapaxes(0, 1) else: - self.org_pred[ind_fold] = convert_predict_proba(self.org_pred[ind_fold]) + self.org_pred[index_fold] = convert_predict_proba( + self.org_pred[index_fold] + ) if mean_pred: return np.mean(np.array(self.org_pred), axis=0) - def __encode_input(self, X): + def _encode_input(self, X): """ This function encodes the non-continuous variables in the design matrix X. @@ -821,12 +829,12 @@ def compute_importance(self, X=None, y=None): """ # Check is fit had been called check_is_fitted(self, ["is_fitted"]) - encoding = True + self.encoding_input = True if self.k_fold != 0: X = self.X_test.copy() y = self.y_test.copy() - encoding = False + self.encoding_input = False else: if self.coffeine_transformer is not None: X = self.coffeine_transformers[0].transform( @@ -870,36 +878,35 @@ def compute_importance(self, X=None, y=None): # Compute original predictions if self.problem_type == "regression": output_dimension = y[0].shape[1] - self.predict(X, encoding=encoding) + self.predict(X) else: output_dimension = 1 - self.predict_proba(X, encoding=encoding) - + self.predict_proba(X) list_seeds_imp = self.rng.randint(1e5, size=self.n_permutations) parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose) score_imp_l = [] results = {} # n_features x n_permutations x n_samples - for ind_fold, estimator in enumerate(self.list_estimators): + for index_fold, estimator in enumerate(self.list_estimators): if self.type == "DNN": - for y_col in range(y[ind_fold].shape[-1]): + for y_col in range(y[index_fold].shape[-1]): _ = self.estimator.encode_outcome( - self.y_train[ind_fold], train=True + self.y_train[index_fold], train=True )[y_col] - y[ind_fold] = self.estimator.encode_outcome( - y[ind_fold], train=False + y[index_fold] = self.estimator.encode_outcome( + y[index_fold], train=False )[y_col] else: if self.problem_type in ("classification", "binary"): one_hot = OneHotEncoder(handle_unknown="ignore").fit( - self.y_train[ind_fold].reshape(-1, 1) + self.y_train[index_fold].reshape(-1, 1) ) - y[ind_fold] = one_hot.transform( - y[ind_fold].reshape(-1, 1) + y[index_fold] = one_hot.transform( + y[index_fold].reshape(-1, 1) ).toarray() if self.do_compute_importance: if not self.conditional: - self.pred_scores[ind_fold], 
score_cur = list( + self.pred_scores[index_fold], score_cur = list( zip( *parallel( delayed(joblib_compute_permutation)( @@ -907,14 +914,14 @@ def compute_importance(self, X=None, y=None): permutation, estimator, self.type, - self.X_proc[ind_fold], - y[ind_fold], + self.X_proc[index_fold], + y[index_fold], self.problem_type, - self.org_pred[ind_fold], + self.org_pred[index_fold], dict_continuous=self.dict_continuous, dict_nominal=self.dict_nominal, processed_column=variables_interest, - iteration_index=ind_fold + 1, + iteration_index=index_fold + 1, group_stacking=self.group_stacking, random_state=list_seeds_imp[permutation], verbose=self.verbose, @@ -924,18 +931,18 @@ def compute_importance(self, X=None, y=None): ) ) ) - self.pred_scores[ind_fold] = np.array( - self.pred_scores[ind_fold] + self.pred_scores[index_fold] = np.array( + self.pred_scores[index_fold] ).reshape( ( len(self.list_columns), self.n_permutations, - y[ind_fold].shape[0], + y[index_fold].shape[0], output_dimension, ) ) else: - self.pred_scores[ind_fold], score_cur = list( + self.pred_scores[index_fold], score_cur = list( zip( *parallel( delayed(joblib_compute_conditional)( @@ -944,18 +951,18 @@ def compute_importance(self, X=None, y=None): estimator, self.type, self.importance_estimator, - self.X_proc[ind_fold], - y[ind_fold], + self.X_proc[index_fold], + y[index_fold], self.problem_type, - self.org_pred[ind_fold], + self.org_pred[index_fold], seed=self.random_state, dict_continuous=self.dict_continuous, dict_nominal=self.dict_nominal, - X_nominal=self.X_nominal[ind_fold], + X_nominal=self.X_nominal[index_fold], variables_categories=self.variables_categories, encoder=self.dict_enc, processed_column=variables_interest, - iteration_index=ind_fold + 1, + iteration_index=index_fold + 1, group_stacking=self.group_stacking, sub_groups=[self.list_columns, self.sub_groups], list_seeds=list_seeds_imp, @@ -967,19 +974,21 @@ def compute_importance(self, X=None, y=None): ) ) ) - self.pred_scores[ind_fold] = np.array(self.pred_scores[ind_fold]) + self.pred_scores[index_fold] = np.array( + self.pred_scores[index_fold] + ) score_imp_l.append(score_cur[0]) else: if self.problem_type in ("classification", "binary"): - nonzero_cols = np.where(y[ind_fold].any(axis=0))[0] + nonzero_cols = np.where(y[index_fold].any(axis=0))[0] score = roc_auc_score( - y[ind_fold][:, nonzero_cols], - self.org_pred[ind_fold][:, nonzero_cols], + y[index_fold][:, nonzero_cols], + self.org_pred[index_fold][:, nonzero_cols], ) else: score = ( - mean_absolute_error(y[ind_fold], self.org_pred[ind_fold]), - r2_score(y[ind_fold], self.org_pred[ind_fold]), + mean_absolute_error(y[index_fold], self.org_pred[index_fold]), + r2_score(y[index_fold], self.org_pred[index_fold]), ) score_imp_l.append(score) @@ -995,8 +1004,8 @@ def compute_importance(self, X=None, y=None): # Compute Importance and P-values pred_scores_full = [ - np.mean(self.pred_scores[ind_fold], axis=1) - for ind_fold in range(max(self.k_fold, 1)) + np.mean(self.pred_scores[index_fold], axis=1) + for index_fold in range(max(self.k_fold, 1)) ] results["importance"] = compute_imp_std(pred_scores_full)[0] results["std"] = compute_imp_std(pred_scores_full)[1] diff --git a/hidimstat/Dnn_learner.py b/hidimstat/Dnn_learner.py index c986aa1..48a476d 100644 --- a/hidimstat/Dnn_learner.py +++ b/hidimstat/Dnn_learner.py @@ -1,17 +1,19 @@ import numpy as np from sklearn.base import BaseEstimator -from .Dnn_learner_single import Dnn_learner_single +from .Dnn_learner_single import DnnLearnerSingle -class 
Dnn_learner(BaseEstimator): +class DnnLearner(BaseEstimator): """ This class implements the high-level of the Multi-Layer Perceptron (MLP) - learner. + learner across multi-outputs. Parameters ---------- - encode : bool, default=False + preparing_test : bool, default=True + Whether to prepare the test set especially after stacking. + encoding_outcome : bool, default=False Whether to encode the categorical outcome. do_hypertuning : bool, default=True Tuning the hyperparameters of the provided estimator. @@ -23,14 +25,12 @@ class Dnn_learner(BaseEstimator): The minimal number of sub-DNNs to keep if > 10. batch_size : int, default=32 The number of samples per batch for training. - batch_size_val : int, default=128 + batch_size_validation : int, default=128 The number of samples per batch for validation. n_epoch : int, default=200 The number of epochs for the DNN learner(s). - verbose : int, default=0 - If verbose > 0, the fitted iterations will be printed. - sampling_with_repitition : bool, default=True - Application of sampling_with_repitition sampling for the training set. + sampling_with_repetition : bool, default=True + Application of sampling_with_repetition sampling for the training set. split_percentage : float, default=0.8 The training/validation cut for the provided data. problem_type : str, default='regression' @@ -60,20 +60,22 @@ class Dnn_learner(BaseEstimator): The cumsum of inputs after the linear sub-layers. random_state : int, default=2023 Fixing the seeds of the random generator. + verbose : int, default=0 + If verbose > 0, the fitted iterations will be printed. """ def __init__( self, - encode=False, + preparing_test=True, + encoding_outcome=False, do_hypertuning=False, dict_hypertuning=None, n_ensemble=10, min_keep=10, batch_size=32, - batch_size_val=128, + batch_size_validation=128, n_epoch=200, - verbose=0, - sampling_with_repitition=True, + sampling_with_repetition=True, split_percentage=0.8, problem_type="regression", list_continuous=None, @@ -88,18 +90,19 @@ def __init__( group_stacking=False, input_dimensions=None, random_state=2023, + verbose=0, ): self.list_estimators = [] - self.encode = encode + self.preparing_test = preparing_test + self.encoding_outcome = encoding_outcome self.do_hypertuning = do_hypertuning self.dict_hypertuning = dict_hypertuning self.n_ensemble = n_ensemble self.min_keep = min_keep self.batch_size = batch_size - self.batch_size_val = batch_size_val + self.batch_size_validation = batch_size_validation self.n_epoch = n_epoch - self.verbose = verbose - self.sampling_with_repitition = sampling_with_repitition + self.sampling_with_repetition = sampling_with_repetition self.split_percentage = split_percentage self.problem_type = problem_type self.list_grps = list_grps @@ -114,10 +117,9 @@ def __init__( self.group_stacking = group_stacking self.input_dimensions = input_dimensions self.random_state = random_state + self.verbose = verbose self.pred = [None] * n_ensemble - self.enc_y = [] - self.is_encoded = False - self.dim_repeat = 1 + self.dimension_repeat = 1 def fit(self, X, y=None): """ @@ -141,23 +143,24 @@ def fit(self, X, y=None): if (len(X.shape) != 3) or (X.shape[0] != y.shape[-1]): X = np.squeeze(X) X = np.array([X for i in range(y.shape[-1])]) - self.dim_repeat = y.shape[-1] + self.dimension_repeat = y.shape[-1] self.list_estimators = [None] * y.shape[-1] self.X_test = [None] * y.shape[-1] for y_col in range(y.shape[-1]): - self.list_estimators[y_col] = Dnn_learner_single( - encode=self.encode, + self.list_estimators[y_col] = 
DnnLearnerSingle( + preparing_test=self.preparing_test, + encoding_outcome=self.encoding_outcome, do_hypertuning=self.do_hypertuning, dict_hypertuning=self.dict_hypertuning, n_ensemble=self.n_ensemble, min_keep=self.min_keep, batch_size=self.batch_size, - batch_size_val=self.batch_size_val, + batch_size_validation=self.batch_size_validation, n_epoch=self.n_epoch, verbose=self.verbose, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, problem_type=self.problem_type, list_continuous=self.list_continuous, @@ -182,9 +185,9 @@ def hyper_tuning( self, X_train, y_train, - X_valid, - y_valid, - list_hyper=None, + X_validation, + y_validation, + list_hypertuning=None, random_state=None, ): """ @@ -197,27 +200,28 @@ def hyper_tuning( y_train : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) The target values (class labels in classification, real numbers in regression) for the training samples. - X_train : {array-like, sparse matrix} of shape (n_valid_samples, n_features) + X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) The validation input samples. - y_train : array-like of shape (n_valid_samples,) or (n_valid_samples, n_outputs) + y_validation : array-like of shape (n_validation_samples,) or (n_validation_samples, n_outputs) The target values (class labels in classification, real numbers in regression) for the validation samples. - list_hyper : list of tuples, default=None + list_hypertuning : list of tuples, default=None The list of tuples for the hyperparameters values. random_state : int, default=None Fixing the seeds of the random generator. """ - estimator = Dnn_learner_single( - encode=self.encode, + estimator = DnnLearnerSingle( + preparing_test=self.preparing_test, + encoding_outcome=self.encoding_outcome, do_hypertuning=self.do_hypertuning, dict_hypertuning=self.dict_hypertuning, n_ensemble=self.n_ensemble, min_keep=self.min_keep, batch_size=self.batch_size, - batch_size_val=self.batch_size_val, + batch_size_validation=self.batch_size_validation, n_epoch=self.n_epoch, verbose=self.verbose, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, problem_type=self.problem_type, list_continuous=self.list_continuous, @@ -234,10 +238,10 @@ def hyper_tuning( random_state=self.random_state, ) return estimator.hyper_tuning( - X_train, y_train, X_valid, y_valid, list_hyper, random_state + X_train, y_train, X_validation, y_validation, list_hypertuning, random_state ) - def predict(self, X, scale=True): + def predict(self, X): """ This function predicts the regression target for the input samples X. @@ -246,29 +250,28 @@ def predict(self, X, scale=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - scale : bool, default=True - Whether to scale the continuous input variables. Returns ------- - predictions : {array-like, sparse matrix) - The average predictions across the sub-DNN models. + predictions : {array-like, sparse matrix) of shape (n_test_samples, n_outputs) + The predictions across multi-outputs. 
""" if isinstance(X, list): - X = [self.check_X_dim(el) for el in X] + X = [self.check_X_dimension(el) for el in X] else: - X = self.check_X_dim(X) + X = self.check_X_dimension(X) list_res = [] for estimator_ind, estimator in enumerate(self.list_estimators): + estimator.preparing_test = self.preparing_test if isinstance(X, list): curr_X = [el[estimator_ind, ...] for el in X] - list_res.append(estimator.predict(curr_X, scale)) + list_res.append(estimator.predict(curr_X)) else: - list_res.append(estimator.predict(X[estimator_ind, ...], scale)) + list_res.append(estimator.predict(X[estimator_ind, ...])) self.X_test[estimator_ind] = estimator.X_test.copy() return np.array(list_res) - def predict_proba(self, X, scale=True): + def predict_proba(self, X): """ This function predicts the class probabilities for the input samples X. @@ -277,26 +280,25 @@ def predict_proba(self, X, scale=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - scale : bool, default=True - Whether to scale the continuous input variables. Returns ------- - predictions : {array-like, sparse matrix) - The average predictions across the sub-DNN models. + predictions : {array-like, sparse matrix) of shape (n_test_samples, n_outputs) + The predictions across multi-outputs. """ if isinstance(X, list): - X = [self.check_X_dim(el) for el in X] + X = [self.check_X_dimension(el) for el in X] else: - X = self.check_X_dim(X) + X = self.check_X_dimension(X) list_res = [] for estimator_ind, estimator in enumerate(self.list_estimators): + estimator.preparing_test = self.preparing_test if isinstance(X, list): curr_X = [el[estimator_ind, ...] for el in X] - list_res.append(estimator.predict_proba(curr_X, scale)) + list_res.append(estimator.predict_proba(curr_X)) else: - list_res.append(estimator.predict_proba(X[estimator_ind, ...], scale)) + list_res.append(estimator.predict_proba(X[estimator_ind, ...])) self.X_test[estimator_ind] = estimator.X_test.copy() return np.squeeze(np.array(list_res)) @@ -309,13 +311,13 @@ def set_params(self, **kwargs): for estimator in self.list_estimators: setattr(estimator, key, value) - def check_X_dim(self, X): + def check_X_dimension(self, X): """ This function checks for the compatibility of the dimensions of X """ - if (len(X.shape) != 3) or (X.shape[0] != self.dim_repeat): + if (len(X.shape) != 3) or (X.shape[0] != self.dimension_repeat): X = np.squeeze(X) - X = np.array([X for i in range(self.dim_repeat)]) + X = np.array([X for _ in range(self.dimension_repeat)]) return X diff --git a/hidimstat/Dnn_learner_single.py b/hidimstat/Dnn_learner_single.py index 64b7a7b..ddffe5b 100644 --- a/hidimstat/Dnn_learner_single.py +++ b/hidimstat/Dnn_learner_single.py @@ -19,14 +19,16 @@ ) -class Dnn_learner_single(BaseEstimator): +class DnnLearnerSingle(BaseEstimator): """ This class implements the Multi-Layer Perceptron (MLP) default inference learner for Block-Based Importance (BBI) framework. Parameters ---------- - encode : bool, default=False + preparing_test : bool, default=True + Whether to scale the continuous variables in the test set. + encoding_outcome : bool, default=False Whether to encode the categorical outcome. do_hypertuning : bool, default=True Tuning the hyperparameters of the provided estimator. @@ -38,14 +40,12 @@ class Dnn_learner_single(BaseEstimator): The minimal number of sub-DNNs to keep if > 10. batch_size : int, default=32 The number of samples per batch for training. 
- batch_size_val : int, default=128 + batch_size_validation : int, default=128 The number of samples per batch for validation. n_epoch : int, default=200 The number of epochs for the DNN learner(s). - verbose : int, default=0 - If verbose > 0, the fitted iterations will be printed. - sampling_with_repitition : bool, default=True - Application of sampling_with_repitition sampling for the training set + sampling_with_repetition : bool, default=True + Application of sampling_with_repetition sampling for the training set split_percentage : float, default=0.8 The training/validation cut for the provided data. problem_type : str, default='regression' @@ -75,20 +75,22 @@ class Dnn_learner_single(BaseEstimator): The cumsum of inputs after the linear sub-layers. random_state : int, default=2023 Fixing the seeds of the random generator. + verbose : int, default=0 + If verbose > 0, the fitted iterations will be printed. """ def __init__( self, - encode=False, + preparing_test=True, + encoding_outcome=False, do_hypertuning=False, dict_hypertuning=None, n_ensemble=10, min_keep=10, batch_size=32, - batch_size_val=128, + batch_size_validation=128, n_epoch=200, - verbose=0, - sampling_with_repitition=True, + sampling_with_repetition=True, split_percentage=0.8, problem_type="regression", list_continuous=None, @@ -103,17 +105,18 @@ def __init__( group_stacking=False, input_dimensions=None, random_state=2023, + verbose=0, ): - self.encode = encode + self.preparing_test = preparing_test + self.encoding_outcome = encoding_outcome self.do_hypertuning = do_hypertuning self.dict_hypertuning = dict_hypertuning self.n_ensemble = n_ensemble self.min_keep = min_keep self.batch_size = batch_size - self.batch_size_val = batch_size_val + self.batch_size_validation = batch_size_validation self.n_epoch = n_epoch - self.verbose = verbose - self.sampling_with_repitition = sampling_with_repitition + self.sampling_with_repetition = sampling_with_repetition self.split_percentage = split_percentage self.problem_type = problem_type self.list_grps = list_grps @@ -128,6 +131,7 @@ def __init__( self.group_stacking = group_stacking self.input_dimensions = input_dimensions self.random_state = random_state + self.verbose = verbose self.enc_y = [] self.activation_outcome = { "classification": softmax, @@ -159,9 +163,9 @@ def fit(self, X, y=None): y = y.reshape(-1, 1) # Disabling the encoding parameter with the regression case if self.problem_type == "regression": - self.encode = False + self.encoding_outcome = False - if self.encode: + if self.encoding_outcome: y_encoded = self.encode_outcome(y) self.is_encoded = True y_encoded = np.squeeze(y_encoded, axis=0) @@ -215,7 +219,7 @@ def fit(self, X, y=None): activation_outcome=self.activation_outcome, list_continuous=self.list_continuous, list_grps=self.list_grps, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, group_stacking=self.group_stacking, input_dimensions=self.input_dimensions, @@ -305,8 +309,8 @@ def hyper_tuning( self, X_train, y_train, - X_valid, - y_valid, + X_validation, + y_validation, list_hyper=None, random_state=None, ): @@ -320,9 +324,9 @@ def hyper_tuning( y_train : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) The target values (class labels in classification, real numbers in regression) for the training samples. 
- X_train : {array-like, sparse matrix} of shape (n_valid_samples, n_features) + X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) The validation input samples. - y_train : array-like of shape (n_valid_samples,) or (n_valid_samples, n_outputs) + y_validation : array-like of shape (n_validation_samples,) or (n_validation_samples, n_outputs) The target values (class labels in classification, real numbers in regression) for the validation samples. list_hyper : list of tuples, default=None @@ -334,7 +338,7 @@ def hyper_tuning( n_jobs=min(self.n_jobs, self.n_ensemble), verbose=self.verbose ) y_train = self.encode_outcome(y_train) - y_valid = self.encode_outcome(y_valid, train=False) + y_validation = self.encode_outcome(y_validation, train=False) return [ list( zip( @@ -342,8 +346,8 @@ def hyper_tuning( delayed(dnn_net)( X_train, y_train[i, ...], - X_valid, - y_valid[i, ...], + X_validation, + y_validation[i, ...], problem_type=self.problem_type, n_epoch=self.n_epoch, batch_size=self.batch_size, @@ -380,8 +384,8 @@ def __tuning_hyper(self, X, y): ( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, X_scaled, __, scaler_x, @@ -390,7 +394,7 @@ def __tuning_hyper(self, X, y): ) = create_X_y( X, y, - sampling_with_repitition=self.sampling_with_repitition, + sampling_with_repetition=self.sampling_with_repetition, split_percentage=self.split_percentage, problem_type=self.problem_type, list_continuous=self.list_continuous, @@ -400,8 +404,8 @@ def __tuning_hyper(self, X, y): list_loss = self.hyper_tuning( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, list_hyper, random_state=self.random_state, ) @@ -411,7 +415,7 @@ def __tuning_hyper(self, X, y): best_hyper = dict(zip(self.dict_hypertuning.keys(), best_hyper)) self.set_params(**best_hyper) - def predict(self, X, scale=True): + def predict(self, X): """ This function predicts the regression target for the input samples X. @@ -420,37 +424,35 @@ def predict(self, X, scale=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - scale : bool, default=True - Whether to scale the continuous input variables. Returns ------- - res_pred : {array-like, sparse matrix) + predictions : {array-like, sparse matrix) of shape (n_test_samples,) The average predictions across the sub-DNN models. """ if self.problem_type != "regression": raise Exception("Use the predict_proba function for classification") # Prepare the test set for the prediction - if scale: - X = self.__scale_test(X) + if self.preparing_test: + X = self._prepare_test(X) # Process the common prediction part - self.__pred_common(X) + self._pred_common(X) - res_pred = np.zeros((self.pred[0].shape)) + predictions = np.zeros((self.pred[0].shape)) total_n_elements = 0 for ind_mod, pred in enumerate(self.pred): - res_pred += ( + predictions += ( pred * self.optimal_list[ind_mod][1][1].scale_ + self.optimal_list[ind_mod][1][1].mean_ ) total_n_elements += 1 - res_pred = res_pred.copy() / total_n_elements + predictions = predictions.copy() / total_n_elements - return res_pred + return predictions - def predict_proba(self, X, scale=True): + def predict_proba(self, X): """ This function predicts the class probabilities for the input samples X. 
@@ -459,38 +461,39 @@ def predict_proba(self, X, scale=True): X : {array-like, sparse matrix} of shape (n_test_samples, n_features), default=None The input samples. - scale : bool, default=True - Whether to scale the continuous input variables. Returns ------- - res_pred : {array-like, sparse matrix) + predictions : {array-like, sparse matrix) of shape (n_test_samples,) The average predictions across the sub-DNN models. """ if self.problem_type == "regression": raise Exception("Use the predict function for classification") # Prepare the test set for the prediction - if scale: - X = self.__scale_test(X) + if self.preparing_test: + X = self._prepare_test(X) # Process the common prediction part - self.__pred_common(X) + self._pred_common(X) - res_pred = np.zeros((self.pred[0].shape)) + predictions = np.zeros((self.pred[0].shape)) total_n_elements = 0 for pred in self.pred: - res_pred += self.activation_outcome[self.problem_type](pred) + predictions += self.activation_outcome[self.problem_type](pred) total_n_elements += 1 - res_pred = res_pred.copy() / total_n_elements + predictions = predictions.copy() / total_n_elements if self.problem_type == "binary": - res_pred = np.array( - [[1 - res_pred[i][0], res_pred[i][0]] for i in range(res_pred.shape[0])] + predictions = np.array( + [ + [1 - predictions[i][0], predictions[i][0]] + for i in range(predictions.shape[0]) + ] ) - return res_pred + return predictions - def __scale_test(self, X): + def _prepare_test(self, X): """ This function prepares the input of the DNN estimator either in the default case or after applying the stacking method @@ -584,7 +587,7 @@ def __scale_test(self, X): self.X_test = X_test_n.copy() return X_test_n - def __pred_common(self, X): + def _pred_common(self, X): """ This function performs the prediction for the DNN learner @@ -594,7 +597,7 @@ def __pred_common(self, X): The input samples. 
""" if not self.group_stacking: - X = [X[0].copy() for i in range(self.n_ensemble)] + X = [X[0].copy() for _ in range(self.n_ensemble)] n_layer = len(self.optimal_list[0][0][0]) - 1 for ind_mod, mod in enumerate(self.optimal_list): diff --git a/hidimstat/__init__.py b/hidimstat/__init__.py index f17c584..9eb5f9d 100644 --- a/hidimstat/__init__.py +++ b/hidimstat/__init__.py @@ -2,7 +2,7 @@ from .BBI import BlockBasedImportance from .clustered_inference import clustered_inference, hd_inference from .desparsified_lasso import desparsified_group_lasso, desparsified_lasso -from .Dnn_learner_single import Dnn_learner_single +from .Dnn_learner_single import DnnLearnerSingle from .ensemble_clustered_inference import ensemble_clustered_inference from .importance_functions import compute_loco from .knockoff_aggregation import knockoff_aggregation @@ -23,7 +23,7 @@ "compute_loco", "desparsified_lasso", "desparsified_group_lasso", - "Dnn_learner_single", + "DnnLearnerSingle", "ensemble_clustered_inference", "group_reid", "hd_inference", diff --git a/hidimstat/compute_importance.py b/hidimstat/compute_importance.py index 8e20d71..457f52b 100644 --- a/hidimstat/compute_importance.py +++ b/hidimstat/compute_importance.py @@ -618,7 +618,10 @@ def joblib_compute_conditional( if problem_type == "regression": if type_predictor == "DNN": - pred_i = estimator.predict(current_X_test_list, scale=False) + tmp_prepare_test_state = estimator.preparing_test + estimator.preparing_test = False + pred_i = estimator.predict(current_X_test_list) + estimator.preparing_test = tmp_prepare_test_state else: pred_i = estimator.predict(current_X_test_list[0].squeeze()) @@ -631,7 +634,10 @@ def joblib_compute_conditional( ) ** 2 else: if type_predictor == "DNN": - pred_i = estimator.predict_proba(current_X_test_list, scale=False) + tmp_prepare_test_state = estimator.preparing_test + estimator.preparing_test = False + pred_i = estimator.predict_proba(current_X_test_list) + estimator.preparing_test = tmp_prepare_test_state else: pred_i = convert_predict_proba( estimator.predict_proba(current_X_test_list[0].squeeze()) @@ -740,7 +746,10 @@ def joblib_compute_permutation( ) if type_predictor == "DNN": - pred_i = estimator.predict(current_X_test_list, scale=False) + tmp_prepare_test_state = estimator.preparing_test + estimator.preparing_test = False + pred_i = estimator.predict(current_X_test_list) + estimator.preparing_test = tmp_prepare_test_state else: pred_i = estimator.predict(current_X_test_list[0]) @@ -755,7 +764,10 @@ def joblib_compute_permutation( y_test[:, nonzero_cols], original_predictions[:, nonzero_cols] ) if type_predictor == "DNN": - pred_i = estimator.predict_proba(current_X_test_list, scale=False) + tmp_prepare_test_state = estimator.preparing_test + estimator.preparing_test = False + pred_i = estimator.predict_proba(current_X_test_list) + estimator.preparing_test = tmp_prepare_test_state else: pred_i = convert_predict_proba( estimator.predict_proba(current_X_test_list[0]) diff --git a/hidimstat/importance_functions.py b/hidimstat/importance_functions.py index b912d97..a77e79c 100644 --- a/hidimstat/importance_functions.py +++ b/hidimstat/importance_functions.py @@ -4,7 +4,7 @@ from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import OneHotEncoder -from hidimstat.Dnn_learner_single import Dnn_learner_single +from hidimstat.Dnn_learner_single import DnnLearnerSingle def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed=2024): @@ -39,8 +39,8 @@ def compute_loco(X, 
y, ntree=100, problem_type="regression", use_dnn=True, seed= dict_encode_outcome = {"regression": False, "classification": True} if use_dnn: - clf_rf_full = Dnn_learner_single( - encode=dict_encode_outcome[problem_type], + clf_rf_full = DnnLearnerSingle( + encoding_outcome=dict_encode_outcome[problem_type], problem_type=problem_type, do_hypertuning=True, random_state=seed, @@ -86,8 +86,8 @@ def compute_loco(X, y, ntree=100, problem_type="regression", use_dnn=True, seed= # Retrain model for col in range(X.shape[1]): if use_dnn: - clf_rf_retrain = Dnn_learner_single( - encode=dict_encode_outcome[problem_type], + clf_rf_retrain = DnnLearnerSingle( + encoding_outcome=dict_encode_outcome[problem_type], problem_type=problem_type, do_hypertuning=True, random_state=seed, diff --git a/hidimstat/test/test_BBI.py b/hidimstat/test/test_BBI.py index 3674eab..b7be1f9 100644 --- a/hidimstat/test/test_BBI.py +++ b/hidimstat/test/test_BBI.py @@ -113,7 +113,7 @@ def test_BBI_splitting_scheme(): estimator="RF", do_hypertuning=True, dict_hypertuning=None, - sampling_with_repitition=True, + sampling_with_repetition=True, conditional=False, problem_type="regression", k_fold=2, @@ -131,7 +131,7 @@ def test_BBI_splitting_scheme(): estimator="RF", do_hypertuning=True, dict_hypertuning=None, - sampling_with_repitition=True, + sampling_with_repetition=True, split_percentage=0.8, conditional=False, problem_type="regression", diff --git a/hidimstat/utils.py b/hidimstat/utils.py index 4fd16ce..0c9c421 100644 --- a/hidimstat/utils.py +++ b/hidimstat/utils.py @@ -1,7 +1,4 @@ -# -*- coding: utf-8 -*- -# Authors: Binh Nguyen & Jerome-Alexis Chevalier & Ahmad Chamma import copy - import numpy as np import torch import torch.nn as nn @@ -104,7 +101,7 @@ def _fixed_quantile_aggregation(pvals, gamma=0.5): Parameters ---------- - pvals : 2D ndarray (n_sampling_with_repitition, n_test) + pvals : 2D ndarray (n_sampling_with_repetition, n_test) p-value (adjusted) gamma : float @@ -132,7 +129,7 @@ def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): def create_X_y( X, y, - sampling_with_repitition=True, + sampling_with_repetition=True, split_percentage=0.8, problem_type="regression", list_continuous=None, @@ -147,8 +144,8 @@ def create_X_y( The input samples before the splitting process. y : ndarray, shape (n_samples, ) The output samples before the splitting process. - sampling_with_repitition : bool, default=True - Sampling with repitition the train part of the train/valid scheme under + sampling_with_repetition : bool, default=True + Sampling with repetition the train part of the train/valid scheme under the training set. The number of training samples in train is equal to the number of instances in the training set. split_percentage : float, default=0.8 @@ -163,16 +160,16 @@ def create_X_y( Returns ------- X_train_scaled : {array-like, sparse matrix} of shape (n_train_samples, n_features) - The sampling_with_repititionped training input samples with scaled continuous variables. + The sampling_with_repetitionped training input samples with scaled continuous variables. y_train_scaled : {array-like} of shape (n_train_samples, ) - The sampling_with_repititionped training output samples scaled if continous. - X_valid_scaled : {array-like, sparse matrix} of shape (n_valid_samples, n_features) + The sampling_with_repetitionped training output samples scaled if continous. + X_validation_scaled : {array-like, sparse matrix} of shape (n_validation_samples, n_features) The validation input samples with scaled continuous variables. 
- y_valid_scaled : {array-like} of shape (n_valid_samples, ) + y_validation_scaled : {array-like} of shape (n_validation_samples, ) The validation output samples scaled if continous. X_scaled : {array-like, sparse matrix} of shape (n_samples, n_features) The original input samples with scaled continuous variables. - y_valid : {array-like} of shape (n_samples, ) + y_validation : {array-like} of shape (n_samples, ) The original output samples with validation indices. scaler_x : Scikit-learn StandardScaler The standard scaler encoder for the continuous variables of the input. @@ -185,7 +182,7 @@ def create_X_y( scaler_x, scaler_y = StandardScaler(), StandardScaler() n = X.shape[0] - if sampling_with_repitition: + if sampling_with_repetition: train_ind = rng.choice(n, n, replace=True) else: train_ind = rng.choice( @@ -193,36 +190,36 @@ def create_X_y( ) valid_ind = np.array([ind for ind in range(n) if ind not in train_ind]) - X_train, X_valid = X[train_ind], X[valid_ind] - y_train, y_valid = y[train_ind], y[valid_ind] + X_train, X_validation = X[train_ind], X[valid_ind] + y_train, y_validation = y[train_ind], y[valid_ind] # Scaling X and y X_train_scaled = X_train.copy() - X_valid_scaled = X_valid.copy() + X_validation_scaled = X_validation.copy() X_scaled = X.copy() if len(list_continuous) > 0: X_train_scaled[:, list_continuous] = scaler_x.fit_transform( X_train[:, list_continuous] ) - X_valid_scaled[:, list_continuous] = scaler_x.transform( - X_valid[:, list_continuous] + X_validation_scaled[:, list_continuous] = scaler_x.transform( + X_validation[:, list_continuous] ) X_scaled[:, list_continuous] = scaler_x.transform(X[:, list_continuous]) if problem_type == "regression": y_train_scaled = scaler_y.fit_transform(y_train) - y_valid_scaled = scaler_y.transform(y_valid) + y_validation_scaled = scaler_y.transform(y_validation) else: y_train_scaled = y_train.copy() - y_valid_scaled = y_valid.copy() + y_validation_scaled = y_validation.copy() return ( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, X_scaled, - y_valid, + y_validation, scaler_x, scaler_y, valid_ind, @@ -316,7 +313,7 @@ def joblib_ensemble_dnnet( activation_outcome=None, list_continuous=None, list_grps=None, - sampling_with_repitition=False, + sampling_with_repetition=False, split_percentage=0.8, group_stacking=False, input_dimensions=None, @@ -350,8 +347,8 @@ def joblib_ensemble_dnnet( list_grps : list of lists, default=None A list collecting the indices of the groups' variables while applying the stacking method. - sampling_with_repitition : bool, default=True - Application of sampling_with_repitition sampling for the training set. + sampling_with_repetition : bool, default=True + Application of sampling_with_repetition sampling for the training set. split_percentage : float, default=0.8 The training/validation cut for the provided data. 
group_stacking : bool, default=False @@ -396,17 +393,17 @@ def joblib_ensemble_dnnet( ( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, X_scaled, - y_valid, + y_validation, scaler_x, scaler_y, valid_ind, ) = create_X_y( X, y, - sampling_with_repitition=sampling_with_repitition, + sampling_with_repetition=sampling_with_repetition, split_percentage=split_percentage, problem_type=problem_type, list_continuous=list_continuous, @@ -416,8 +413,8 @@ def joblib_ensemble_dnnet( current_model = dnn_net( X_train_scaled, y_train_scaled, - X_valid_scaled, - y_valid_scaled, + X_validation_scaled, + y_validation_scaled, problem_type=problem_type, n_epoch=n_epoch, batch_size=batch_size, @@ -477,12 +474,14 @@ def joblib_ensemble_dnnet( pred_v = pred * scaler_y.scale_ + scaler_y.mean_ else: pred_v = activation_outcome[problem_type](pred) - loss = np.std(y_valid) ** 2 - mean_squared_error(y_valid, pred_v[valid_ind]) + loss = np.std(y_validation) ** 2 - mean_squared_error( + y_validation, pred_v[valid_ind] + ) else: pred_v = activation_outcome[problem_type](pred) loss = log_loss( - y_valid, np.ones(y_valid.shape) * np.mean(y_valid, axis=0) - ) - log_loss(y_valid, pred_v[valid_ind]) + y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0) + ) - log_loss(y_validation, pred_v[valid_ind]) return (current_model, scaler_x, scaler_y, pred_v, loss) @@ -650,12 +649,12 @@ def evaluate(model, loader, device, problem_type): def dnn_net( X_train, y_train, - X_valid, - y_valid, + X_validation, + y_validation, problem_type="regression", n_epoch=200, batch_size=32, - batch_size_val=128, + batch_size_validation=128, beta1=0.9, beta2=0.999, lr=1e-3, @@ -678,9 +677,9 @@ def dnn_net( The training input samples. y_train : {array-like} of shape (n_train_samples, ) The training output samples. - X_valid : {array-like, sparse matrix} of shape (n_valid_samples, n_features) + X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) The validation input samples. - y_valid : {array-like} of shape (n_valid_samples, ) + y_validation : {array-like} of shape (n_validation_samples, ) The validation output samples. problem_type : str, default='regression' A classification or a regression problem. @@ -688,7 +687,7 @@ def dnn_net( The number of epochs for the DNN learner(s). batch_size : int, default=32 The number of samples per batch for training. - batch_size_val : int, default=128 + batch_size_validation : int, default=128 The number of samples per batch for validation. beta1 : float, default=0.9 The exponential decay rate for the first moment estimates. @@ -721,7 +720,9 @@ def dnn_net( shuffle=True, batch_size=batch_size, ) - validate_loader = Dataset_Loader(X_valid, y_valid, batch_size=batch_size_val) + validate_loader = Dataset_Loader( + X_validation, y_validation, batch_size=batch_size_validation + ) # Set the seed for PyTorch's random number generator torch.manual_seed(random_state)
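
A minimal usage sketch of the renamed API introduced by this patch, assuming the package is importable as hidimstat and using illustrative toy data; the dataset and parameter values below are chosen for illustration only (they mirror the renamed keyword arguments above, not any example shipped with the repository):

# Sketch only: shows the renamed arguments (sampling_with_repetition,
# encoding_input, encoding_outcome, batch_size_validation) and the fact that
# predict()/predict_proba() no longer take an `encoding`/`scale` argument.
import numpy as np
from hidimstat import BlockBasedImportance, DnnLearnerSingle

rng = np.random.RandomState(2023)
X = rng.randn(100, 5)
y = X[:, 0] + 0.1 * rng.randn(100)

# Block-Based Importance: `sampling_with_repitition` is now
# `sampling_with_repetition`, and the old `encoding` flag of predict()
# is set once in the constructor as `encoding_input`.
bbi = BlockBasedImportance(
    estimator="RF",
    problem_type="regression",
    sampling_with_repetition=True,
    encoding_input=True,
    conditional=False,
    k_fold=2,
)
bbi.fit(X, y)
results = bbi.compute_importance()  # predict()/predict_proba() now take only X

# Single-output DNN learner: `encode` is now `encoding_outcome`,
# `batch_size_val` is `batch_size_validation`, and scaling of the test set
# is controlled by the `preparing_test` attribute rather than `scale=`.
dnn = DnnLearnerSingle(
    encoding_outcome=False,
    problem_type="regression",
    batch_size_validation=128,
    random_state=2023,
)
dnn.fit(X, y)
predictions = dnn.predict(X)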