From ef94df2d5b6e289d1ce1ce9ac8ab71feb2abc55c Mon Sep 17 00:00:00 2001 From: jpaillard Date: Thu, 7 Nov 2024 11:53:58 +0100 Subject: [PATCH] revert changes in utils --- hidimstat/utils.py | 1210 ++++++++++++++++++++++---------------------- 1 file changed, 604 insertions(+), 606 deletions(-) diff --git a/hidimstat/utils.py b/hidimstat/utils.py index d0d9cd1..a15c4d8 100644 --- a/hidimstat/utils.py +++ b/hidimstat/utils.py @@ -1,14 +1,12 @@ import copy import numpy as np - -# import torch -# import torch.nn as nn -# import torch.nn.functional as F +import torch +import torch.nn as nn +import torch.nn.functional as F from sklearn.metrics import log_loss, mean_squared_error from sklearn.preprocessing import StandardScaler - -# from torchmetrics import Accuracy +from torchmetrics import Accuracy def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False): @@ -284,603 +282,603 @@ def _check_vim_predict_method(method): ) -# def sigmoid(x): -# """ -# This function applies the sigmoid function element-wise to the input array x -# """ -# return 1 / (1 + np.exp(-x)) - - -# def softmax(x): -# """ -# This function applies the softmax function element-wise to the input array x -# """ -# # Ensure numerical stability by subtracting the maximum value of x from each element of x -# # This prevents overflow errors when exponentiating large numbers -# x = x - np.max(x, axis=-1, keepdims=True) -# exp_x = np.exp(x) -# return exp_x / np.sum(exp_x, axis=-1, keepdims=True) - - -# def relu(x): -# """ -# This function applies the relu function element-wise to the input array x -# """ -# return (abs(x) + x) / 2 - - -# def relu_(x): -# """ -# This function applies the derivative of the relu function element-wise -# to the input array x -# """ -# return (x > 0) * 1 - - -# def convert_predict_proba(list_probs): -# """ -# If the classification is done using a one-hot encoded variable, the list of -# probabilites will be a list of lists for the probabilities of each of the categories. -# This function takes the probabilities of having each category (=1 with binary) and stack -# them into one ndarray. -# """ -# if len(list_probs.shape) == 3: -# list_probs = np.array(list_probs)[..., 1].T -# return list_probs - - -# def ordinal_encode(y): -# """ -# This function encodes the ordinal variable with a special gradual encoding storing also -# the natural order information. 
-# """ -# list_y = [] -# for y_col in range(y.shape[-1]): -# # Retrieve the unique values -# unique_vals = np.unique(y[:, y_col]) -# # Mapping each unique value to its corresponding index -# mapping_dict = {} -# for i, val in enumerate(unique_vals): -# mapping_dict[val] = i + 1 -# # create a zero-filled array for the ordinal encoding -# y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col])))) -# # set the appropriate indices to 1 for each ordinal value and all lower ordinal values -# for ind_el, el in enumerate(y[:, y_col]): -# y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1 -# list_y.append(y_ordinal[:, 1:]) - -# return list_y - - -# def sample_predictions(predictions, random_state=None): -# """ -# This function samples from the same leaf node of the input sample -# in both the regression and the classification cases -# """ -# rng = np.random.RandomState(random_state) -# # print(predictions[..., rng.randint(predictions.shape[2]), :]) -# # print(predictions.shape) -# # exit(0) -# return predictions[..., rng.randint(predictions.shape[2]), :] - - -# def joblib_ensemble_dnnet( -# X, -# y, -# problem_type="regression", -# activation_outcome=None, -# list_continuous=None, -# list_grps=None, -# sampling_with_repetition=False, -# split_percentage=0.8, -# group_stacking=False, -# input_dimensions=None, -# n_epoch=200, -# batch_size=32, -# beta1=0.9, -# beta2=0.999, -# lr=1e-3, -# l1_weight=1e-2, -# l2_weight=1e-2, -# epsilon=1e-8, -# random_state=None, -# ): -# """ -# This function implements the ensemble learning of the sub-DNN models - -# Parameters -# ---------- -# X : {array-like, sparse matrix} of shape (n_train_samples, n_features) -# The input samples. -# y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) -# The target values (class labels in classification, real numbers in -# regression). -# problem_type : str, default='regression' -# A classification or a regression problem. -# activation_outcome : str, default=None -# The activation function to apply in the outcome layer, "softmax" for -# classification and "sigmoid" for both ordinal and binary cases. -# list_continuous : list, default=None -# The list of continuous variables. -# list_grps : list of lists, default=None -# A list collecting the indices of the groups' variables -# while applying the stacking method. -# sampling_with_repetition : bool, default=True -# Application of sampling_with_repetition sampling for the training set. -# split_percentage : float, default=0.8 -# The training/validation cut for the provided data. -# group_stacking : bool, default=False -# Apply the stacking-based method for the provided groups. -# input_dimensions : list, default=None -# The cumsum of inputs after the linear sub-layers. -# n_epoch : int, default=200 -# The number of epochs for the DNN learner(s). -# batch_size : int, default=32 -# The number of samples per batch for training. -# beta1 : float, default=0.9 -# The exponential decay rate for the first moment estimates. -# beta2 : float, default=0.999 -# The exponential decay rate for the second moment estimates. -# lr : float, default=1e-3 -# The learning rate. -# l1_weight : float, default=1e-2 -# The L1-regularization paramter for weight decay. -# l2_weight : float, default=0 -# The L2-regularization paramter for weight decay. -# epsilon : float, default=1e-8 -# A small constant added to the denominator to prevent division by zero. -# random_state : int, default=2023 -# Fixing the seeds of the random generator. 
- -# Returns -# ------- -# current_model : list -# The parameters of the sub-DNN model -# scaler_x : list of Scikit-learn StandardScalers -# The scalers for the continuous input variables. -# scaler_y : Scikit-learn StandardScaler -# The scaler for the continuous output variable. -# pred_v : ndarray -# The predictions of the sub-DNN model. -# loss : float -# The loss score of the sub-DNN model. -# """ - -# pred_v = np.empty(X.shape[0]) -# # Sampling and Train/Validate splitting -# ( -# X_train_scaled, -# y_train_scaled, -# X_validation_scaled, -# y_validation_scaled, -# X_scaled, -# y_validation, -# scaler_x, -# scaler_y, -# valid_ind, -# ) = create_X_y( -# X, -# y, -# sampling_with_repetition=sampling_with_repetition, -# split_percentage=split_percentage, -# problem_type=problem_type, -# list_continuous=list_continuous, -# random_state=random_state, -# ) - -# current_model = dnn_net( -# X_train_scaled, -# y_train_scaled, -# X_validation_scaled, -# y_validation_scaled, -# problem_type=problem_type, -# n_epoch=n_epoch, -# batch_size=batch_size, -# beta1=beta1, -# beta2=beta2, -# lr=lr, -# l1_weight=l1_weight, -# l2_weight=l2_weight, -# epsilon=epsilon, -# list_grps=list_grps, -# group_stacking=group_stacking, -# input_dimensions=input_dimensions, -# random_state=random_state, -# ) - -# if not group_stacking: -# X_scaled_n = X_scaled.copy() -# else: -# X_scaled_n = np.zeros((X_scaled.shape[0], input_dimensions[-1])) -# for grp_ind in range(len(list_grps)): -# n_layer_stacking = len(current_model[3][grp_ind]) - 1 -# curr_pred = X_scaled[:, list_grps[grp_ind]].copy() -# for ind_w_b in range(n_layer_stacking): -# if ind_w_b == 0: -# curr_pred = relu( -# X_scaled[:, list_grps[grp_ind]].dot( -# current_model[3][grp_ind][ind_w_b] -# ) -# + current_model[4][grp_ind][ind_w_b] -# ) -# else: -# curr_pred = relu( -# curr_pred.dot(current_model[3][grp_ind][ind_w_b]) -# + current_model[4][grp_ind][ind_w_b] -# ) -# X_scaled_n[ -# :, -# list( -# np.arange(input_dimensions[grp_ind], input_dimensions[grp_ind + 1]) -# ), -# ] = ( -# curr_pred.dot(current_model[3][grp_ind][n_layer_stacking]) -# + current_model[4][grp_ind][n_layer_stacking] -# ) - -# n_layer = len(current_model[0]) - 1 -# for j in range(n_layer): -# if j == 0: -# pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j]) -# else: -# pred = relu(pred.dot(current_model[0][j]) + current_model[1][j]) - -# pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer] - -# if problem_type not in ("classification", "binary"): -# if problem_type != "ordinal": -# pred_v = pred * scaler_y.scale_ + scaler_y.mean_ -# else: -# pred_v = activation_outcome[problem_type](pred) -# loss = np.std(y_validation) ** 2 - mean_squared_error( -# y_validation, pred_v[valid_ind] -# ) -# else: -# pred_v = activation_outcome[problem_type](pred) -# loss = log_loss( -# y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0) -# ) - log_loss(y_validation, pred_v[valid_ind]) - -# return (current_model, scaler_x, scaler_y, pred_v, loss) - - -# def initialize_weights(layer): -# if isinstance(layer, nn.Linear): -# layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2 -# layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1 - - -# def Dataset_Loader(X, y, shuffle=False, batch_size=50): -# if y.shape[-1] == 2: -# y = y[:, [1]] -# dataset = torch.utils.data.TensorDataset( -# torch.from_numpy(X).float(), torch.from_numpy(y).float() -# ) - -# loader = torch.utils.data.DataLoader( -# dataset, batch_size=batch_size, shuffle=shuffle -# 
) -# return loader - - -# class DNN(nn.Module): -# """ -# Feedfoward Neural Network with 4 hidden layers -# """ - -# def __init__( -# self, input_dim, group_stacking, list_grps, output_dimension, problem_type -# ): -# super().__init__() -# if problem_type == "classification": -# self.accuracy = Accuracy(task="multiclass", num_classes=output_dimension) -# else: -# self.accuracy = Accuracy(task="binary") -# self.list_grps = list_grps -# self.group_stacking = group_stacking -# if group_stacking: -# self.layers_stacking = nn.ModuleList( -# [ -# nn.Linear( -# in_features=len(grp), -# out_features=input_dim[grp_ind + 1] - input_dim[grp_ind], -# ) -# # nn.Sequential( -# # nn.Linear( -# # in_features=len(grp), -# # # out_features=max(1, int(0.1 * len(grp))), -# # out_features=input_dim[grp_ind + 1] -# # - input_dim[grp_ind], -# # ), -# # nn.ReLU(), -# # nn.Linear( -# # in_features=max(1, int(0.1 * len(grp))), -# # out_features=input_dim[grp_ind + 1] -# # - input_dim[grp_ind], -# # ), -# # nn.ReLU(), -# # nn.Linear( -# # in_features=max(1, int(0.1 * len(grp))), -# # out_features=input_dim[grp_ind + 1] -# # - input_dim[grp_ind], -# # ), -# # ) -# for grp_ind, grp in enumerate(list_grps) -# ] -# ) -# input_dim = input_dim[-1] -# self.layers = nn.Sequential( -# # hidden layers -# nn.Linear(input_dim, 50), -# nn.ReLU(), -# nn.Linear(50, 40), -# nn.ReLU(), -# nn.Linear(40, 30), -# nn.ReLU(), -# nn.Linear(30, 20), -# nn.ReLU(), -# # output layer -# nn.Linear(20, output_dimension), -# ) -# self.loss = 0 - -# def forward(self, x): -# if self.group_stacking: -# list_stacking = [None] * len(self.layers_stacking) -# for ind_layer, layer in enumerate(self.layers_stacking): -# list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]]) -# x = torch.cat(list_stacking, dim=1) -# return self.layers(x) - -# def training_step(self, batch, device, problem_type): -# X, y = batch[0].to(device), batch[1].to(device) -# y_pred = self(X) # Generate predictions -# if problem_type == "regression": -# loss = F.mse_loss(y_pred, y) -# elif problem_type == "classification": -# loss = F.cross_entropy(y_pred, y) # Calculate loss -# else: -# loss = F.binary_cross_entropy_with_logits(y_pred, y) -# return loss - -# def validation_step(self, batch, device, problem_type): -# X, y = batch[0].to(device), batch[1].to(device) -# y_pred = self(X) # Generate predictions -# if problem_type == "regression": -# loss = F.mse_loss(y_pred, y) -# return { -# "val_mse": loss, -# "batch_size": len(X), -# } -# else: -# if problem_type == "classification": -# loss = F.cross_entropy(y_pred, y) # Calculate loss -# else: -# loss = F.binary_cross_entropy_with_logits(y_pred, y) -# acc = self.accuracy(y_pred, y.int()) -# return { -# "val_loss": loss, -# "val_acc": acc, -# "batch_size": len(X), -# } - -# def validation_epoch_end(self, outputs, problem_type): -# if problem_type in ("classification", "binary"): -# batch_losses = [] -# batch_accs = [] -# batch_sizes = [] -# for x in outputs: -# batch_losses.append(x["val_loss"] * x["batch_size"]) -# batch_accs.append(x["val_acc"] * x["batch_size"]) -# batch_sizes.append(x["batch_size"]) -# self.loss = torch.stack(batch_losses).sum().item() / np.sum( -# batch_sizes -# ) # Combine losses -# epoch_acc = torch.stack(batch_accs).sum().item() / np.sum( -# batch_sizes -# ) # Combine accuracies -# return {"val_loss": self.loss, "val_acc": epoch_acc} -# else: -# batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs] -# batch_sizes = [x["batch_size"] for x in outputs] -# self.loss = 
torch.stack(batch_losses).sum().item() / np.sum( -# batch_sizes -# ) # Combine losses -# return {"val_mse": self.loss} - -# def epoch_end(self, epoch, result): -# if len(result) == 2: -# print( -# "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format( -# epoch + 1, result["val_loss"], result["val_acc"] -# ) -# ) -# else: -# print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"])) - - -# def evaluate(model, loader, device, problem_type): -# outputs = [model.validation_step(batch, device, problem_type) for batch in loader] -# return model.validation_epoch_end(outputs, problem_type) - - -# def dnn_net( -# X_train, -# y_train, -# X_validation, -# y_validation, -# problem_type="regression", -# n_epoch=200, -# batch_size=32, -# batch_size_validation=128, -# beta1=0.9, -# beta2=0.999, -# lr=1e-3, -# l1_weight=1e-2, -# l2_weight=1e-2, -# epsilon=1e-8, -# list_grps=None, -# group_stacking=False, -# input_dimensions=None, -# random_state=2023, -# verbose=0, -# ): -# """ -# This function implements the training/validation process of the sub-DNN -# models - -# Parameters -# ---------- -# X_train : {array-like, sparse matrix} of shape (n_train_samples, n_features) -# The training input samples. -# y_train : {array-like} of shape (n_train_samples, ) -# The training output samples. -# X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) -# The validation input samples. -# y_validation : {array-like} of shape (n_validation_samples, ) -# The validation output samples. -# problem_type : str, default='regression' -# A classification or a regression problem. -# n_epoch : int, default=200 -# The number of epochs for the DNN learner(s). -# batch_size : int, default=32 -# The number of samples per batch for training. -# batch_size_validation : int, default=128 -# The number of samples per batch for validation. -# beta1 : float, default=0.9 -# The exponential decay rate for the first moment estimates. -# beta2 : float, default=0.999 -# The exponential decay rate for the second moment estimates. -# lr : float, default=1e-3 -# The learning rate. -# l1_weight : float, default=1e-2 -# The L1-regularization paramter for weight decay. -# l2_weight : float, default=0 -# The L2-regularization paramter for weight decay. -# epsilon : float, default=1e-8 -# A small constant added to the denominator to prevent division by zero. -# list_grps : list of lists, default=None -# A list collecting the indices of the groups' variables -# while applying the stacking method. -# group_stacking : bool, default=False -# Apply the stacking-based method for the provided groups. -# input_dimensions : list, default=None -# The cumsum of inputs after the linear sub-layers. -# random_state : int, default=2023 -# Fixing the seeds of the random generator. -# verbose : int, default=0 -# If verbose > 0, the fitted iterations will be printed. 
-# """ -# # Creating DataLoaders -# train_loader = Dataset_Loader( -# X_train, -# y_train, -# shuffle=True, -# batch_size=batch_size, -# ) -# validate_loader = Dataset_Loader( -# X_validation, y_validation, batch_size=batch_size_validation -# ) -# # Set the seed for PyTorch's random number generator -# torch.manual_seed(random_state) - -# # Set the seed for PyTorch's CUDA random number generator(s), if available -# if torch.cuda.is_available(): -# torch.cuda.manual_seed(random_state) -# torch.cuda.manual_seed_all(random_state) - -# # Specify whether to use GPU or CPU -# # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -# device = torch.device("cpu") - -# if problem_type in ("regression", "binary"): -# output_dimension = 1 -# else: -# output_dimension = y_train.shape[-1] - -# # DNN model -# input_dim = input_dimensions.copy() if group_stacking else X_train.shape[1] -# model = DNN(input_dim, group_stacking, list_grps, output_dimension, problem_type) -# model.to(device) -# # Initializing weights/bias -# model.apply(initialize_weights) -# # Adam Optimizer -# optimizer = torch.optim.Adam( -# model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon -# ) - -# best_loss = 1e100 -# for epoch in range(n_epoch): -# # Training Phase -# model.train() -# for batch in train_loader: -# optimizer.zero_grad() -# loss = model.training_step(batch, device, problem_type) - -# loss.backward() -# optimizer.step() -# for name, param in model.named_parameters(): -# if "bias" not in name: -# # if name.split(".")[0] == "layers_stacking": -# # param.data -= l2_weight * param.data -# # else: -# param.data -= ( -# l1_weight * torch.sign(param.data) + l2_weight * param.data -# ) -# # Validation Phase -# model.eval() -# result = evaluate(model, validate_loader, device, problem_type) -# if model.loss < best_loss: -# best_loss = model.loss -# dict_params = copy.deepcopy(model.state_dict()) -# if verbose >= 2: -# model.epoch_end(epoch, result) - -# best_weight = [] -# best_bias = [] -# best_weight_stack = [[].copy() for _ in range(len(list_grps))] -# best_bias_stack = [[].copy() for _ in range(len(list_grps))] - -# for name, param in dict_params.items(): -# if name.split(".")[0] == "layers": -# if name.split(".")[-1] == "weight": -# best_weight.append(param.numpy().T) -# if name.split(".")[-1] == "bias": -# best_bias.append(param.numpy()[np.newaxis, :]) -# if name.split(".")[0] == "layers_stacking": -# curr_ind = int(name.split(".")[1]) -# if name.split(".")[-1] == "weight": -# best_weight_stack[curr_ind].append(param.numpy().T) -# if name.split(".")[-1] == "bias": -# best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :]) - -# return [ -# best_weight, -# best_bias, -# best_loss, -# best_weight_stack, -# best_bias_stack, -# ] - - -# def compute_imp_std(pred_scores): -# weights = np.array([el.shape[-2] for el in pred_scores]) -# # Compute the mean of each fold over the number of observations -# pred_mean = np.array([np.mean(el.copy(), axis=-2) for el in pred_scores]) - -# # Weighted average -# imp = np.average(pred_mean, axis=0, weights=weights) - -# # Compute the standard deviation of each fold -# # over the number of observations -# pred_std = np.array( -# [ -# np.mean( -# (el - imp[..., np.newaxis]) ** 2, -# axis=-2, -# ) -# for el in pred_scores -# ] -# ) -# std = np.sqrt(np.average(pred_std, axis=0, weights=weights) / (np.sum(weights) - 1)) -# return (imp, std) +def sigmoid(x): + """ + This function applies the sigmoid function element-wise to the input array x + """ + return 1 / (1 + 
np.exp(-x)) + + +def softmax(x): + """ + This function applies the softmax function element-wise to the input array x + """ + # Ensure numerical stability by subtracting the maximum value of x from each element of x + # This prevents overflow errors when exponentiating large numbers + x = x - np.max(x, axis=-1, keepdims=True) + exp_x = np.exp(x) + return exp_x / np.sum(exp_x, axis=-1, keepdims=True) + + +def relu(x): + """ + This function applies the relu function element-wise to the input array x + """ + return (abs(x) + x) / 2 + + +def relu_(x): + """ + This function applies the derivative of the relu function element-wise + to the input array x + """ + return (x > 0) * 1 + + +def convert_predict_proba(list_probs): + """ + If the classification is done using a one-hot encoded variable, the list of + probabilites will be a list of lists for the probabilities of each of the categories. + This function takes the probabilities of having each category (=1 with binary) and stack + them into one ndarray. + """ + if len(list_probs.shape) == 3: + list_probs = np.array(list_probs)[..., 1].T + return list_probs + + +def ordinal_encode(y): + """ + This function encodes the ordinal variable with a special gradual encoding storing also + the natural order information. + """ + list_y = [] + for y_col in range(y.shape[-1]): + # Retrieve the unique values + unique_vals = np.unique(y[:, y_col]) + # Mapping each unique value to its corresponding index + mapping_dict = {} + for i, val in enumerate(unique_vals): + mapping_dict[val] = i + 1 + # create a zero-filled array for the ordinal encoding + y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col])))) + # set the appropriate indices to 1 for each ordinal value and all lower ordinal values + for ind_el, el in enumerate(y[:, y_col]): + y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1 + list_y.append(y_ordinal[:, 1:]) + + return list_y + + +def sample_predictions(predictions, random_state=None): + """ + This function samples from the same leaf node of the input sample + in both the regression and the classification cases + """ + rng = np.random.RandomState(random_state) + # print(predictions[..., rng.randint(predictions.shape[2]), :]) + # print(predictions.shape) + # exit(0) + return predictions[..., rng.randint(predictions.shape[2]), :] + + +def joblib_ensemble_dnnet( + X, + y, + problem_type="regression", + activation_outcome=None, + list_continuous=None, + list_grps=None, + sampling_with_repetition=False, + split_percentage=0.8, + group_stacking=False, + input_dimensions=None, + n_epoch=200, + batch_size=32, + beta1=0.9, + beta2=0.999, + lr=1e-3, + l1_weight=1e-2, + l2_weight=1e-2, + epsilon=1e-8, + random_state=None, +): + """ + This function implements the ensemble learning of the sub-DNN models + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_train_samples, n_features) + The input samples. + y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + problem_type : str, default='regression' + A classification or a regression problem. + activation_outcome : str, default=None + The activation function to apply in the outcome layer, "softmax" for + classification and "sigmoid" for both ordinal and binary cases. + list_continuous : list, default=None + The list of continuous variables. + list_grps : list of lists, default=None + A list collecting the indices of the groups' variables + while applying the stacking method. 
+ sampling_with_repetition : bool, default=True + Application of sampling_with_repetition sampling for the training set. + split_percentage : float, default=0.8 + The training/validation cut for the provided data. + group_stacking : bool, default=False + Apply the stacking-based method for the provided groups. + input_dimensions : list, default=None + The cumsum of inputs after the linear sub-layers. + n_epoch : int, default=200 + The number of epochs for the DNN learner(s). + batch_size : int, default=32 + The number of samples per batch for training. + beta1 : float, default=0.9 + The exponential decay rate for the first moment estimates. + beta2 : float, default=0.999 + The exponential decay rate for the second moment estimates. + lr : float, default=1e-3 + The learning rate. + l1_weight : float, default=1e-2 + The L1-regularization paramter for weight decay. + l2_weight : float, default=0 + The L2-regularization paramter for weight decay. + epsilon : float, default=1e-8 + A small constant added to the denominator to prevent division by zero. + random_state : int, default=2023 + Fixing the seeds of the random generator. + + Returns + ------- + current_model : list + The parameters of the sub-DNN model + scaler_x : list of Scikit-learn StandardScalers + The scalers for the continuous input variables. + scaler_y : Scikit-learn StandardScaler + The scaler for the continuous output variable. + pred_v : ndarray + The predictions of the sub-DNN model. + loss : float + The loss score of the sub-DNN model. + """ + + pred_v = np.empty(X.shape[0]) + # Sampling and Train/Validate splitting + ( + X_train_scaled, + y_train_scaled, + X_validation_scaled, + y_validation_scaled, + X_scaled, + y_validation, + scaler_x, + scaler_y, + valid_ind, + ) = create_X_y( + X, + y, + sampling_with_repetition=sampling_with_repetition, + split_percentage=split_percentage, + problem_type=problem_type, + list_continuous=list_continuous, + random_state=random_state, + ) + + current_model = dnn_net( + X_train_scaled, + y_train_scaled, + X_validation_scaled, + y_validation_scaled, + problem_type=problem_type, + n_epoch=n_epoch, + batch_size=batch_size, + beta1=beta1, + beta2=beta2, + lr=lr, + l1_weight=l1_weight, + l2_weight=l2_weight, + epsilon=epsilon, + list_grps=list_grps, + group_stacking=group_stacking, + input_dimensions=input_dimensions, + random_state=random_state, + ) + + if not group_stacking: + X_scaled_n = X_scaled.copy() + else: + X_scaled_n = np.zeros((X_scaled.shape[0], input_dimensions[-1])) + for grp_ind in range(len(list_grps)): + n_layer_stacking = len(current_model[3][grp_ind]) - 1 + curr_pred = X_scaled[:, list_grps[grp_ind]].copy() + for ind_w_b in range(n_layer_stacking): + if ind_w_b == 0: + curr_pred = relu( + X_scaled[:, list_grps[grp_ind]].dot( + current_model[3][grp_ind][ind_w_b] + ) + + current_model[4][grp_ind][ind_w_b] + ) + else: + curr_pred = relu( + curr_pred.dot(current_model[3][grp_ind][ind_w_b]) + + current_model[4][grp_ind][ind_w_b] + ) + X_scaled_n[ + :, + list( + np.arange(input_dimensions[grp_ind], input_dimensions[grp_ind + 1]) + ), + ] = ( + curr_pred.dot(current_model[3][grp_ind][n_layer_stacking]) + + current_model[4][grp_ind][n_layer_stacking] + ) + + n_layer = len(current_model[0]) - 1 + for j in range(n_layer): + if j == 0: + pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j]) + else: + pred = relu(pred.dot(current_model[0][j]) + current_model[1][j]) + + pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer] + + if problem_type not in 
("classification", "binary"): + if problem_type != "ordinal": + pred_v = pred * scaler_y.scale_ + scaler_y.mean_ + else: + pred_v = activation_outcome[problem_type](pred) + loss = np.std(y_validation) ** 2 - mean_squared_error( + y_validation, pred_v[valid_ind] + ) + else: + pred_v = activation_outcome[problem_type](pred) + loss = log_loss( + y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0) + ) - log_loss(y_validation, pred_v[valid_ind]) + + return (current_model, scaler_x, scaler_y, pred_v, loss) + + +def initialize_weights(layer): + if isinstance(layer, nn.Linear): + layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2 + layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1 + + +def Dataset_Loader(X, y, shuffle=False, batch_size=50): + if y.shape[-1] == 2: + y = y[:, [1]] + dataset = torch.utils.data.TensorDataset( + torch.from_numpy(X).float(), torch.from_numpy(y).float() + ) + + loader = torch.utils.data.DataLoader( + dataset, batch_size=batch_size, shuffle=shuffle + ) + return loader + + +class DNN(nn.Module): + """ + Feedfoward Neural Network with 4 hidden layers + """ + + def __init__( + self, input_dim, group_stacking, list_grps, output_dimension, problem_type + ): + super().__init__() + if problem_type == "classification": + self.accuracy = Accuracy(task="multiclass", num_classes=output_dimension) + else: + self.accuracy = Accuracy(task="binary") + self.list_grps = list_grps + self.group_stacking = group_stacking + if group_stacking: + self.layers_stacking = nn.ModuleList( + [ + nn.Linear( + in_features=len(grp), + out_features=input_dim[grp_ind + 1] - input_dim[grp_ind], + ) + # nn.Sequential( + # nn.Linear( + # in_features=len(grp), + # # out_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # nn.ReLU(), + # nn.Linear( + # in_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # nn.ReLU(), + # nn.Linear( + # in_features=max(1, int(0.1 * len(grp))), + # out_features=input_dim[grp_ind + 1] + # - input_dim[grp_ind], + # ), + # ) + for grp_ind, grp in enumerate(list_grps) + ] + ) + input_dim = input_dim[-1] + self.layers = nn.Sequential( + # hidden layers + nn.Linear(input_dim, 50), + nn.ReLU(), + nn.Linear(50, 40), + nn.ReLU(), + nn.Linear(40, 30), + nn.ReLU(), + nn.Linear(30, 20), + nn.ReLU(), + # output layer + nn.Linear(20, output_dimension), + ) + self.loss = 0 + + def forward(self, x): + if self.group_stacking: + list_stacking = [None] * len(self.layers_stacking) + for ind_layer, layer in enumerate(self.layers_stacking): + list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]]) + x = torch.cat(list_stacking, dim=1) + return self.layers(x) + + def training_step(self, batch, device, problem_type): + X, y = batch[0].to(device), batch[1].to(device) + y_pred = self(X) # Generate predictions + if problem_type == "regression": + loss = F.mse_loss(y_pred, y) + elif problem_type == "classification": + loss = F.cross_entropy(y_pred, y) # Calculate loss + else: + loss = F.binary_cross_entropy_with_logits(y_pred, y) + return loss + + def validation_step(self, batch, device, problem_type): + X, y = batch[0].to(device), batch[1].to(device) + y_pred = self(X) # Generate predictions + if problem_type == "regression": + loss = F.mse_loss(y_pred, y) + return { + "val_mse": loss, + "batch_size": len(X), + } + else: + if problem_type == "classification": + loss = F.cross_entropy(y_pred, y) # Calculate loss + else: + loss = 
F.binary_cross_entropy_with_logits(y_pred, y) + acc = self.accuracy(y_pred, y.int()) + return { + "val_loss": loss, + "val_acc": acc, + "batch_size": len(X), + } + + def validation_epoch_end(self, outputs, problem_type): + if problem_type in ("classification", "binary"): + batch_losses = [] + batch_accs = [] + batch_sizes = [] + for x in outputs: + batch_losses.append(x["val_loss"] * x["batch_size"]) + batch_accs.append(x["val_acc"] * x["batch_size"]) + batch_sizes.append(x["batch_size"]) + self.loss = torch.stack(batch_losses).sum().item() / np.sum( + batch_sizes + ) # Combine losses + epoch_acc = torch.stack(batch_accs).sum().item() / np.sum( + batch_sizes + ) # Combine accuracies + return {"val_loss": self.loss, "val_acc": epoch_acc} + else: + batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs] + batch_sizes = [x["batch_size"] for x in outputs] + self.loss = torch.stack(batch_losses).sum().item() / np.sum( + batch_sizes + ) # Combine losses + return {"val_mse": self.loss} + + def epoch_end(self, epoch, result): + if len(result) == 2: + print( + "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format( + epoch + 1, result["val_loss"], result["val_acc"] + ) + ) + else: + print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"])) + + +def evaluate(model, loader, device, problem_type): + outputs = [model.validation_step(batch, device, problem_type) for batch in loader] + return model.validation_epoch_end(outputs, problem_type) + + +def dnn_net( + X_train, + y_train, + X_validation, + y_validation, + problem_type="regression", + n_epoch=200, + batch_size=32, + batch_size_validation=128, + beta1=0.9, + beta2=0.999, + lr=1e-3, + l1_weight=1e-2, + l2_weight=1e-2, + epsilon=1e-8, + list_grps=None, + group_stacking=False, + input_dimensions=None, + random_state=2023, + verbose=0, +): + """ + This function implements the training/validation process of the sub-DNN + models + + Parameters + ---------- + X_train : {array-like, sparse matrix} of shape (n_train_samples, n_features) + The training input samples. + y_train : {array-like} of shape (n_train_samples, ) + The training output samples. + X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features) + The validation input samples. + y_validation : {array-like} of shape (n_validation_samples, ) + The validation output samples. + problem_type : str, default='regression' + A classification or a regression problem. + n_epoch : int, default=200 + The number of epochs for the DNN learner(s). + batch_size : int, default=32 + The number of samples per batch for training. + batch_size_validation : int, default=128 + The number of samples per batch for validation. + beta1 : float, default=0.9 + The exponential decay rate for the first moment estimates. + beta2 : float, default=0.999 + The exponential decay rate for the second moment estimates. + lr : float, default=1e-3 + The learning rate. + l1_weight : float, default=1e-2 + The L1-regularization paramter for weight decay. + l2_weight : float, default=0 + The L2-regularization paramter for weight decay. + epsilon : float, default=1e-8 + A small constant added to the denominator to prevent division by zero. + list_grps : list of lists, default=None + A list collecting the indices of the groups' variables + while applying the stacking method. + group_stacking : bool, default=False + Apply the stacking-based method for the provided groups. + input_dimensions : list, default=None + The cumsum of inputs after the linear sub-layers. 
+ random_state : int, default=2023 + Fixing the seeds of the random generator. + verbose : int, default=0 + If verbose > 0, the fitted iterations will be printed. + """ + # Creating DataLoaders + train_loader = Dataset_Loader( + X_train, + y_train, + shuffle=True, + batch_size=batch_size, + ) + validate_loader = Dataset_Loader( + X_validation, y_validation, batch_size=batch_size_validation + ) + # Set the seed for PyTorch's random number generator + torch.manual_seed(random_state) + + # Set the seed for PyTorch's CUDA random number generator(s), if available + if torch.cuda.is_available(): + torch.cuda.manual_seed(random_state) + torch.cuda.manual_seed_all(random_state) + + # Specify whether to use GPU or CPU + # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + device = torch.device("cpu") + + if problem_type in ("regression", "binary"): + output_dimension = 1 + else: + output_dimension = y_train.shape[-1] + + # DNN model + input_dim = input_dimensions.copy() if group_stacking else X_train.shape[1] + model = DNN(input_dim, group_stacking, list_grps, output_dimension, problem_type) + model.to(device) + # Initializing weights/bias + model.apply(initialize_weights) + # Adam Optimizer + optimizer = torch.optim.Adam( + model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon + ) + + best_loss = 1e100 + for epoch in range(n_epoch): + # Training Phase + model.train() + for batch in train_loader: + optimizer.zero_grad() + loss = model.training_step(batch, device, problem_type) + + loss.backward() + optimizer.step() + for name, param in model.named_parameters(): + if "bias" not in name: + # if name.split(".")[0] == "layers_stacking": + # param.data -= l2_weight * param.data + # else: + param.data -= ( + l1_weight * torch.sign(param.data) + l2_weight * param.data + ) + # Validation Phase + model.eval() + result = evaluate(model, validate_loader, device, problem_type) + if model.loss < best_loss: + best_loss = model.loss + dict_params = copy.deepcopy(model.state_dict()) + if verbose >= 2: + model.epoch_end(epoch, result) + + best_weight = [] + best_bias = [] + best_weight_stack = [[].copy() for _ in range(len(list_grps))] + best_bias_stack = [[].copy() for _ in range(len(list_grps))] + + for name, param in dict_params.items(): + if name.split(".")[0] == "layers": + if name.split(".")[-1] == "weight": + best_weight.append(param.numpy().T) + if name.split(".")[-1] == "bias": + best_bias.append(param.numpy()[np.newaxis, :]) + if name.split(".")[0] == "layers_stacking": + curr_ind = int(name.split(".")[1]) + if name.split(".")[-1] == "weight": + best_weight_stack[curr_ind].append(param.numpy().T) + if name.split(".")[-1] == "bias": + best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :]) + + return [ + best_weight, + best_bias, + best_loss, + best_weight_stack, + best_bias_stack, + ] + + +def compute_imp_std(pred_scores): + weights = np.array([el.shape[-2] for el in pred_scores]) + # Compute the mean of each fold over the number of observations + pred_mean = np.array([np.mean(el.copy(), axis=-2) for el in pred_scores]) + + # Weighted average + imp = np.average(pred_mean, axis=0, weights=weights) + + # Compute the standard deviation of each fold + # over the number of observations + pred_std = np.array( + [ + np.mean( + (el - imp[..., np.newaxis]) ** 2, + axis=-2, + ) + for el in pred_scores + ] + ) + std = np.sqrt(np.average(pred_std, axis=0, weights=weights) / (np.sum(weights) - 1)) + return (imp, std)
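
For reference, below is a minimal usage sketch of a few of the pure-NumPy helpers this revert restores (sigmoid, relu, softmax, compute_imp_std). It is illustrative only and not part of the patch; it assumes the patch has been applied so the helpers are importable from hidimstat.utils (which, after this revert, again requires torch and torchmetrics at import time), and the (n_features, n_samples_in_fold, 1) score shape fed to compute_imp_std is an assumption made for the demo.

# Usage sketch (illustrative, not part of the patch).
import numpy as np

from hidimstat.utils import compute_imp_std, relu, sigmoid, softmax

x = np.array([[-2.0, 0.0, 3.0]])
print(sigmoid(x))        # element-wise logistic, values in (0, 1)
print(relu(x))           # negative entries clipped to 0
print(softmax(x).sum())  # each row of a softmax sums to 1

# Two folds of per-feature scores; the trailing singleton output axis is an
# assumed shape for this demo.
rng = np.random.RandomState(0)
pred_scores = [rng.randn(4, n_samples, 1) for n_samples in (30, 25)]
imp, std = compute_imp_std(pred_scores)
print(imp.shape, std.shape)  # (4, 1) each: weighted mean and std per feature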