diff --git a/hidimstat/test/test_cpi.py b/hidimstat/test/test_cpi.py
index c7ce617..83da78d 100644
--- a/hidimstat/test/test_cpi.py
+++ b/hidimstat/test/test_cpi.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pandas as pd
+import pytest
 from sklearn.base import clone
+from sklearn.exceptions import NotFittedError
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.metrics import log_loss
 from sklearn.model_selection import train_test_split
@@ -91,3 +93,50 @@ def test_cpi(linear_scenario):
         groups=None,
     )
     vim = cpi.score(X_test, y_test_clf)
+
+
+def test_raises_value_error(
+    linear_scenario,
+):
+    # an unknown method name should raise a ValueError at construction
+    test_str = "unknown method"
+    with pytest.raises(ValueError):
+        CPI(
+            estimator=LinearRegression(),
+            imputation_model=LinearRegression(),
+            method=test_str,
+        )
+    X, y, beta = linear_scenario
+
+    # predict/score before fit should raise NotFittedError
+    with pytest.raises(NotFittedError):
+        cpi = CPI(
+            estimator=LinearRegression(),
+            imputation_model=LinearRegression(),
+            method="predict",
+        )
+        cpi.predict(X)
+    with pytest.raises(NotFittedError):
+        cpi = CPI(
+            estimator=LinearRegression(),
+            imputation_model=LinearRegression(),
+            method="predict",
+        )
+        cpi.score(X, y)
+    # with a fitted estimator but an unfitted CPI, predict/score should raise a ValueError
+    with pytest.raises(ValueError):
+        fitted_model = LinearRegression().fit(X, y)
+        cpi = CPI(
+            estimator=fitted_model,
+            imputation_model=LinearRegression(),
+            method="predict",
+        )
+        cpi.predict(X)
+    with pytest.raises(ValueError):
+        fitted_model = LinearRegression().fit(X, y)
+        cpi = CPI(
+            estimator=fitted_model,
+            imputation_model=LinearRegression(),
+            method="predict",
+        )
+        cpi.score(X, y)
diff --git a/hidimstat/utils.py b/hidimstat/utils.py
index a15c4d8..d0d9cd1 100644
--- a/hidimstat/utils.py
+++ b/hidimstat/utils.py
@@ -1,12 +1,14 @@
 import copy
 
 import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
+
+# import torch
+# import torch.nn as nn
+# import torch.nn.functional as F
 from sklearn.metrics import log_loss, mean_squared_error
 from sklearn.preprocessing import StandardScaler
-from torchmetrics import Accuracy
+
+# from torchmetrics import Accuracy
 
 
 def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False):
@@ -282,603 +284,603 @@ def _check_vim_predict_method(method):
         )
 
 
-def sigmoid(x):
-    """
-    This function applies the sigmoid function element-wise to the input array x
-    """
-    return 1 / (1 + np.exp(-x))
-
-
-def softmax(x):
-    """
-    This function applies the softmax function element-wise to the input array x
-    """
-    # Ensure numerical stability by subtracting the maximum value of x from each element of x
-    # This prevents overflow errors when exponentiating large numbers
-    x = x - np.max(x, axis=-1, keepdims=True)
-    exp_x = np.exp(x)
-    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
-
-
-def relu(x):
-    """
-    This function applies the relu function element-wise to the input array x
-    """
-    return (abs(x) + x) / 2
-
-
-def relu_(x):
-    """
-    This function applies the derivative of the relu function element-wise
-    to the input array x
-    """
-    return (x > 0) * 1
-
-
-def convert_predict_proba(list_probs):
-    """
-    If the classification is done using a one-hot encoded variable, the list of
-    probabilites will be a list of lists for the probabilities of each of the categories.
-    This function takes the probabilities of having each category (=1 with binary) and stack
-    them into one ndarray.
- """ - if len(list_probs.shape) == 3: - list_probs = np.array(list_probs)[..., 1].T - return list_probs - - -def ordinal_encode(y): - """ - This function encodes the ordinal variable with a special gradual encoding storing also - the natural order information. - """ - list_y = [] - for y_col in range(y.shape[-1]): - # Retrieve the unique values - unique_vals = np.unique(y[:, y_col]) - # Mapping each unique value to its corresponding index - mapping_dict = {} - for i, val in enumerate(unique_vals): - mapping_dict[val] = i + 1 - # create a zero-filled array for the ordinal encoding - y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col])))) - # set the appropriate indices to 1 for each ordinal value and all lower ordinal values - for ind_el, el in enumerate(y[:, y_col]): - y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1 - list_y.append(y_ordinal[:, 1:]) - - return list_y - - -def sample_predictions(predictions, random_state=None): - """ - This function samples from the same leaf node of the input sample - in both the regression and the classification cases - """ - rng = np.random.RandomState(random_state) - # print(predictions[..., rng.randint(predictions.shape[2]), :]) - # print(predictions.shape) - # exit(0) - return predictions[..., rng.randint(predictions.shape[2]), :] - - -def joblib_ensemble_dnnet( - X, - y, - problem_type="regression", - activation_outcome=None, - list_continuous=None, - list_grps=None, - sampling_with_repetition=False, - split_percentage=0.8, - group_stacking=False, - input_dimensions=None, - n_epoch=200, - batch_size=32, - beta1=0.9, - beta2=0.999, - lr=1e-3, - l1_weight=1e-2, - l2_weight=1e-2, - epsilon=1e-8, - random_state=None, -): - """ - This function implements the ensemble learning of the sub-DNN models - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_train_samples, n_features) - The input samples. - y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs) - The target values (class labels in classification, real numbers in - regression). - problem_type : str, default='regression' - A classification or a regression problem. - activation_outcome : str, default=None - The activation function to apply in the outcome layer, "softmax" for - classification and "sigmoid" for both ordinal and binary cases. - list_continuous : list, default=None - The list of continuous variables. - list_grps : list of lists, default=None - A list collecting the indices of the groups' variables - while applying the stacking method. - sampling_with_repetition : bool, default=True - Application of sampling_with_repetition sampling for the training set. - split_percentage : float, default=0.8 - The training/validation cut for the provided data. - group_stacking : bool, default=False - Apply the stacking-based method for the provided groups. - input_dimensions : list, default=None - The cumsum of inputs after the linear sub-layers. - n_epoch : int, default=200 - The number of epochs for the DNN learner(s). - batch_size : int, default=32 - The number of samples per batch for training. - beta1 : float, default=0.9 - The exponential decay rate for the first moment estimates. - beta2 : float, default=0.999 - The exponential decay rate for the second moment estimates. - lr : float, default=1e-3 - The learning rate. - l1_weight : float, default=1e-2 - The L1-regularization paramter for weight decay. - l2_weight : float, default=0 - The L2-regularization paramter for weight decay. 
-    epsilon : float, default=1e-8
-        A small constant added to the denominator to prevent division by zero.
-    random_state : int, default=2023
-        Fixing the seeds of the random generator.
-
-    Returns
-    -------
-    current_model : list
-        The parameters of the sub-DNN model
-    scaler_x : list of Scikit-learn StandardScalers
-        The scalers for the continuous input variables.
-    scaler_y : Scikit-learn StandardScaler
-        The scaler for the continuous output variable.
-    pred_v : ndarray
-        The predictions of the sub-DNN model.
-    loss : float
-        The loss score of the sub-DNN model.
-    """
-
-    pred_v = np.empty(X.shape[0])
-    # Sampling and Train/Validate splitting
-    (
-        X_train_scaled,
-        y_train_scaled,
-        X_validation_scaled,
-        y_validation_scaled,
-        X_scaled,
-        y_validation,
-        scaler_x,
-        scaler_y,
-        valid_ind,
-    ) = create_X_y(
-        X,
-        y,
-        sampling_with_repetition=sampling_with_repetition,
-        split_percentage=split_percentage,
-        problem_type=problem_type,
-        list_continuous=list_continuous,
-        random_state=random_state,
-    )
-
-    current_model = dnn_net(
-        X_train_scaled,
-        y_train_scaled,
-        X_validation_scaled,
-        y_validation_scaled,
-        problem_type=problem_type,
-        n_epoch=n_epoch,
-        batch_size=batch_size,
-        beta1=beta1,
-        beta2=beta2,
-        lr=lr,
-        l1_weight=l1_weight,
-        l2_weight=l2_weight,
-        epsilon=epsilon,
-        list_grps=list_grps,
-        group_stacking=group_stacking,
-        input_dimensions=input_dimensions,
-        random_state=random_state,
-    )
-
-    if not group_stacking:
-        X_scaled_n = X_scaled.copy()
-    else:
-        X_scaled_n = np.zeros((X_scaled.shape[0], input_dimensions[-1]))
-        for grp_ind in range(len(list_grps)):
-            n_layer_stacking = len(current_model[3][grp_ind]) - 1
-            curr_pred = X_scaled[:, list_grps[grp_ind]].copy()
-            for ind_w_b in range(n_layer_stacking):
-                if ind_w_b == 0:
-                    curr_pred = relu(
-                        X_scaled[:, list_grps[grp_ind]].dot(
-                            current_model[3][grp_ind][ind_w_b]
-                        )
-                        + current_model[4][grp_ind][ind_w_b]
-                    )
-                else:
-                    curr_pred = relu(
-                        curr_pred.dot(current_model[3][grp_ind][ind_w_b])
-                        + current_model[4][grp_ind][ind_w_b]
-                    )
-            X_scaled_n[
-                :,
-                list(
-                    np.arange(input_dimensions[grp_ind], input_dimensions[grp_ind + 1])
-                ),
-            ] = (
-                curr_pred.dot(current_model[3][grp_ind][n_layer_stacking])
-                + current_model[4][grp_ind][n_layer_stacking]
-            )
-
-    n_layer = len(current_model[0]) - 1
-    for j in range(n_layer):
-        if j == 0:
-            pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j])
-        else:
-            pred = relu(pred.dot(current_model[0][j]) + current_model[1][j])
-
-    pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer]
-
-    if problem_type not in ("classification", "binary"):
-        if problem_type != "ordinal":
-            pred_v = pred * scaler_y.scale_ + scaler_y.mean_
-        else:
-            pred_v = activation_outcome[problem_type](pred)
-        loss = np.std(y_validation) ** 2 - mean_squared_error(
-            y_validation, pred_v[valid_ind]
-        )
-    else:
-        pred_v = activation_outcome[problem_type](pred)
-        loss = log_loss(
-            y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0)
-        ) - log_loss(y_validation, pred_v[valid_ind])
-
-    return (current_model, scaler_x, scaler_y, pred_v, loss)
-
-
-def initialize_weights(layer):
-    if isinstance(layer, nn.Linear):
-        layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2
-        layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1
-
-
-def Dataset_Loader(X, y, shuffle=False, batch_size=50):
-    if y.shape[-1] == 2:
-        y = y[:, [1]]
-    dataset = torch.utils.data.TensorDataset(
-        torch.from_numpy(X).float(), torch.from_numpy(y).float()
-    )
-
-    loader = torch.utils.data.DataLoader(
-        dataset, batch_size=batch_size, shuffle=shuffle
-    )
-    return loader
-
-
-class DNN(nn.Module):
-    """
-    Feedfoward Neural Network with 4 hidden layers
-    """
-
-    def __init__(
-        self, input_dim, group_stacking, list_grps, output_dimension, problem_type
-    ):
-        super().__init__()
-        if problem_type == "classification":
-            self.accuracy = Accuracy(task="multiclass", num_classes=output_dimension)
-        else:
-            self.accuracy = Accuracy(task="binary")
-        self.list_grps = list_grps
-        self.group_stacking = group_stacking
-        if group_stacking:
-            self.layers_stacking = nn.ModuleList(
-                [
-                    nn.Linear(
-                        in_features=len(grp),
-                        out_features=input_dim[grp_ind + 1] - input_dim[grp_ind],
-                    )
-                    # nn.Sequential(
-                    #     nn.Linear(
-                    #         in_features=len(grp),
-                    #         # out_features=max(1, int(0.1 * len(grp))),
-                    #         out_features=input_dim[grp_ind + 1]
-                    #         - input_dim[grp_ind],
-                    #     ),
-                    #     nn.ReLU(),
-                    #     nn.Linear(
-                    #         in_features=max(1, int(0.1 * len(grp))),
-                    #         out_features=input_dim[grp_ind + 1]
-                    #         - input_dim[grp_ind],
-                    #     ),
-                    #     nn.ReLU(),
-                    #     nn.Linear(
-                    #         in_features=max(1, int(0.1 * len(grp))),
-                    #         out_features=input_dim[grp_ind + 1]
-                    #         - input_dim[grp_ind],
-                    #     ),
-                    # )
-                    for grp_ind, grp in enumerate(list_grps)
-                ]
-            )
-            input_dim = input_dim[-1]
-        self.layers = nn.Sequential(
-            # hidden layers
-            nn.Linear(input_dim, 50),
-            nn.ReLU(),
-            nn.Linear(50, 40),
-            nn.ReLU(),
-            nn.Linear(40, 30),
-            nn.ReLU(),
-            nn.Linear(30, 20),
-            nn.ReLU(),
-            # output layer
-            nn.Linear(20, output_dimension),
-        )
-        self.loss = 0
-
-    def forward(self, x):
-        if self.group_stacking:
-            list_stacking = [None] * len(self.layers_stacking)
-            for ind_layer, layer in enumerate(self.layers_stacking):
-                list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]])
-            x = torch.cat(list_stacking, dim=1)
-        return self.layers(x)
-
-    def training_step(self, batch, device, problem_type):
-        X, y = batch[0].to(device), batch[1].to(device)
-        y_pred = self(X)  # Generate predictions
-        if problem_type == "regression":
-            loss = F.mse_loss(y_pred, y)
-        elif problem_type == "classification":
-            loss = F.cross_entropy(y_pred, y)  # Calculate loss
-        else:
-            loss = F.binary_cross_entropy_with_logits(y_pred, y)
-        return loss
-
-    def validation_step(self, batch, device, problem_type):
-        X, y = batch[0].to(device), batch[1].to(device)
-        y_pred = self(X)  # Generate predictions
-        if problem_type == "regression":
-            loss = F.mse_loss(y_pred, y)
-            return {
-                "val_mse": loss,
-                "batch_size": len(X),
-            }
-        else:
-            if problem_type == "classification":
-                loss = F.cross_entropy(y_pred, y)  # Calculate loss
-            else:
-                loss = F.binary_cross_entropy_with_logits(y_pred, y)
-            acc = self.accuracy(y_pred, y.int())
-            return {
-                "val_loss": loss,
-                "val_acc": acc,
-                "batch_size": len(X),
-            }
-
-    def validation_epoch_end(self, outputs, problem_type):
-        if problem_type in ("classification", "binary"):
-            batch_losses = []
-            batch_accs = []
-            batch_sizes = []
-            for x in outputs:
-                batch_losses.append(x["val_loss"] * x["batch_size"])
-                batch_accs.append(x["val_acc"] * x["batch_size"])
-                batch_sizes.append(x["batch_size"])
-            self.loss = torch.stack(batch_losses).sum().item() / np.sum(
-                batch_sizes
-            )  # Combine losses
-            epoch_acc = torch.stack(batch_accs).sum().item() / np.sum(
-                batch_sizes
-            )  # Combine accuracies
-            return {"val_loss": self.loss, "val_acc": epoch_acc}
-        else:
-            batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs]
-            batch_sizes = [x["batch_size"] for x in outputs]
-            self.loss = torch.stack(batch_losses).sum().item() / np.sum(
-                batch_sizes
-            )  # Combine losses
-            return {"val_mse": self.loss}
-
-    def epoch_end(self, epoch, result):
-        if len(result) == 2:
-            print(
-                "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format(
-                    epoch + 1, result["val_loss"], result["val_acc"]
-                )
-            )
-        else:
-            print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"]))
-
-
-def evaluate(model, loader, device, problem_type):
-    outputs = [model.validation_step(batch, device, problem_type) for batch in loader]
-    return model.validation_epoch_end(outputs, problem_type)
-
-
-def dnn_net(
-    X_train,
-    y_train,
-    X_validation,
-    y_validation,
-    problem_type="regression",
-    n_epoch=200,
-    batch_size=32,
-    batch_size_validation=128,
-    beta1=0.9,
-    beta2=0.999,
-    lr=1e-3,
-    l1_weight=1e-2,
-    l2_weight=1e-2,
-    epsilon=1e-8,
-    list_grps=None,
-    group_stacking=False,
-    input_dimensions=None,
-    random_state=2023,
-    verbose=0,
-):
-    """
-    This function implements the training/validation process of the sub-DNN
-    models
-
-    Parameters
-    ----------
-    X_train : {array-like, sparse matrix} of shape (n_train_samples, n_features)
-        The training input samples.
-    y_train : {array-like} of shape (n_train_samples, )
-        The training output samples.
-    X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features)
-        The validation input samples.
-    y_validation : {array-like} of shape (n_validation_samples, )
-        The validation output samples.
-    problem_type : str, default='regression'
-        A classification or a regression problem.
-    n_epoch : int, default=200
-        The number of epochs for the DNN learner(s).
-    batch_size : int, default=32
-        The number of samples per batch for training.
-    batch_size_validation : int, default=128
-        The number of samples per batch for validation.
-    beta1 : float, default=0.9
-        The exponential decay rate for the first moment estimates.
-    beta2 : float, default=0.999
-        The exponential decay rate for the second moment estimates.
-    lr : float, default=1e-3
-        The learning rate.
-    l1_weight : float, default=1e-2
-        The L1-regularization paramter for weight decay.
-    l2_weight : float, default=0
-        The L2-regularization paramter for weight decay.
-    epsilon : float, default=1e-8
-        A small constant added to the denominator to prevent division by zero.
-    list_grps : list of lists, default=None
-        A list collecting the indices of the groups' variables
-        while applying the stacking method.
-    group_stacking : bool, default=False
-        Apply the stacking-based method for the provided groups.
-    input_dimensions : list, default=None
-        The cumsum of inputs after the linear sub-layers.
-    random_state : int, default=2023
-        Fixing the seeds of the random generator.
-    verbose : int, default=0
-        If verbose > 0, the fitted iterations will be printed.
- """ - # Creating DataLoaders - train_loader = Dataset_Loader( - X_train, - y_train, - shuffle=True, - batch_size=batch_size, - ) - validate_loader = Dataset_Loader( - X_validation, y_validation, batch_size=batch_size_validation - ) - # Set the seed for PyTorch's random number generator - torch.manual_seed(random_state) - - # Set the seed for PyTorch's CUDA random number generator(s), if available - if torch.cuda.is_available(): - torch.cuda.manual_seed(random_state) - torch.cuda.manual_seed_all(random_state) - - # Specify whether to use GPU or CPU - # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - device = torch.device("cpu") - - if problem_type in ("regression", "binary"): - output_dimension = 1 - else: - output_dimension = y_train.shape[-1] - - # DNN model - input_dim = input_dimensions.copy() if group_stacking else X_train.shape[1] - model = DNN(input_dim, group_stacking, list_grps, output_dimension, problem_type) - model.to(device) - # Initializing weights/bias - model.apply(initialize_weights) - # Adam Optimizer - optimizer = torch.optim.Adam( - model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon - ) - - best_loss = 1e100 - for epoch in range(n_epoch): - # Training Phase - model.train() - for batch in train_loader: - optimizer.zero_grad() - loss = model.training_step(batch, device, problem_type) - - loss.backward() - optimizer.step() - for name, param in model.named_parameters(): - if "bias" not in name: - # if name.split(".")[0] == "layers_stacking": - # param.data -= l2_weight * param.data - # else: - param.data -= ( - l1_weight * torch.sign(param.data) + l2_weight * param.data - ) - # Validation Phase - model.eval() - result = evaluate(model, validate_loader, device, problem_type) - if model.loss < best_loss: - best_loss = model.loss - dict_params = copy.deepcopy(model.state_dict()) - if verbose >= 2: - model.epoch_end(epoch, result) - - best_weight = [] - best_bias = [] - best_weight_stack = [[].copy() for _ in range(len(list_grps))] - best_bias_stack = [[].copy() for _ in range(len(list_grps))] - - for name, param in dict_params.items(): - if name.split(".")[0] == "layers": - if name.split(".")[-1] == "weight": - best_weight.append(param.numpy().T) - if name.split(".")[-1] == "bias": - best_bias.append(param.numpy()[np.newaxis, :]) - if name.split(".")[0] == "layers_stacking": - curr_ind = int(name.split(".")[1]) - if name.split(".")[-1] == "weight": - best_weight_stack[curr_ind].append(param.numpy().T) - if name.split(".")[-1] == "bias": - best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :]) - - return [ - best_weight, - best_bias, - best_loss, - best_weight_stack, - best_bias_stack, - ] - - -def compute_imp_std(pred_scores): - weights = np.array([el.shape[-2] for el in pred_scores]) - # Compute the mean of each fold over the number of observations - pred_mean = np.array([np.mean(el.copy(), axis=-2) for el in pred_scores]) - - # Weighted average - imp = np.average(pred_mean, axis=0, weights=weights) - - # Compute the standard deviation of each fold - # over the number of observations - pred_std = np.array( - [ - np.mean( - (el - imp[..., np.newaxis]) ** 2, - axis=-2, - ) - for el in pred_scores - ] - ) - std = np.sqrt(np.average(pred_std, axis=0, weights=weights) / (np.sum(weights) - 1)) - return (imp, std) +# def sigmoid(x): +# """ +# This function applies the sigmoid function element-wise to the input array x +# """ +# return 1 / (1 + np.exp(-x)) + + +# def softmax(x): +# """ +# This function applies the softmax function 
+#     """
+#     # Ensure numerical stability by subtracting the maximum value of x from each element of x
+#     # This prevents overflow errors when exponentiating large numbers
+#     x = x - np.max(x, axis=-1, keepdims=True)
+#     exp_x = np.exp(x)
+#     return exp_x / np.sum(exp_x, axis=-1, keepdims=True)
+
+
+# def relu(x):
+#     """
+#     This function applies the relu function element-wise to the input array x
+#     """
+#     return (abs(x) + x) / 2
+
+
+# def relu_(x):
+#     """
+#     This function applies the derivative of the relu function element-wise
+#     to the input array x
+#     """
+#     return (x > 0) * 1
+
+
+# def convert_predict_proba(list_probs):
+#     """
+#     If the classification is done using a one-hot encoded variable, the list of
+#     probabilites will be a list of lists for the probabilities of each of the categories.
+#     This function takes the probabilities of having each category (=1 with binary) and stack
+#     them into one ndarray.
+#     """
+#     if len(list_probs.shape) == 3:
+#         list_probs = np.array(list_probs)[..., 1].T
+#     return list_probs
+
+
+# def ordinal_encode(y):
+#     """
+#     This function encodes the ordinal variable with a special gradual encoding storing also
+#     the natural order information.
+#     """
+#     list_y = []
+#     for y_col in range(y.shape[-1]):
+#         # Retrieve the unique values
+#         unique_vals = np.unique(y[:, y_col])
+#         # Mapping each unique value to its corresponding index
+#         mapping_dict = {}
+#         for i, val in enumerate(unique_vals):
+#             mapping_dict[val] = i + 1
+#         # create a zero-filled array for the ordinal encoding
+#         y_ordinal = np.zeros((len(y[:, y_col]), len(set(y[:, y_col]))))
+#         # set the appropriate indices to 1 for each ordinal value and all lower ordinal values
+#         for ind_el, el in enumerate(y[:, y_col]):
+#             y_ordinal[ind_el, np.arange(mapping_dict[el])] = 1
+#         list_y.append(y_ordinal[:, 1:])
+
+#     return list_y
+
+
+# def sample_predictions(predictions, random_state=None):
+#     """
+#     This function samples from the same leaf node of the input sample
+#     in both the regression and the classification cases
+#     """
+#     rng = np.random.RandomState(random_state)
+#     # print(predictions[..., rng.randint(predictions.shape[2]), :])
+#     # print(predictions.shape)
+#     # exit(0)
+#     return predictions[..., rng.randint(predictions.shape[2]), :]
+
+
+# def joblib_ensemble_dnnet(
+#     X,
+#     y,
+#     problem_type="regression",
+#     activation_outcome=None,
+#     list_continuous=None,
+#     list_grps=None,
+#     sampling_with_repetition=False,
+#     split_percentage=0.8,
+#     group_stacking=False,
+#     input_dimensions=None,
+#     n_epoch=200,
+#     batch_size=32,
+#     beta1=0.9,
+#     beta2=0.999,
+#     lr=1e-3,
+#     l1_weight=1e-2,
+#     l2_weight=1e-2,
+#     epsilon=1e-8,
+#     random_state=None,
+# ):
+#     """
+#     This function implements the ensemble learning of the sub-DNN models
+
+#     Parameters
+#     ----------
+#     X : {array-like, sparse matrix} of shape (n_train_samples, n_features)
+#         The input samples.
+#     y : array-like of shape (n_train_samples,) or (n_train_samples, n_outputs)
+#         The target values (class labels in classification, real numbers in
+#         regression).
+#     problem_type : str, default='regression'
+#         A classification or a regression problem.
+#     activation_outcome : str, default=None
+#         The activation function to apply in the outcome layer, "softmax" for
+#         classification and "sigmoid" for both ordinal and binary cases.
+#     list_continuous : list, default=None
+#         The list of continuous variables.
+#     list_grps : list of lists, default=None
+#         A list collecting the indices of the groups' variables
+#         while applying the stacking method.
+#     sampling_with_repetition : bool, default=True
+#         Application of sampling_with_repetition sampling for the training set.
+#     split_percentage : float, default=0.8
+#         The training/validation cut for the provided data.
+#     group_stacking : bool, default=False
+#         Apply the stacking-based method for the provided groups.
+#     input_dimensions : list, default=None
+#         The cumsum of inputs after the linear sub-layers.
+#     n_epoch : int, default=200
+#         The number of epochs for the DNN learner(s).
+#     batch_size : int, default=32
+#         The number of samples per batch for training.
+#     beta1 : float, default=0.9
+#         The exponential decay rate for the first moment estimates.
+#     beta2 : float, default=0.999
+#         The exponential decay rate for the second moment estimates.
+#     lr : float, default=1e-3
+#         The learning rate.
+#     l1_weight : float, default=1e-2
+#         The L1-regularization paramter for weight decay.
+#     l2_weight : float, default=0
+#         The L2-regularization paramter for weight decay.
+#     epsilon : float, default=1e-8
+#         A small constant added to the denominator to prevent division by zero.
+#     random_state : int, default=2023
+#         Fixing the seeds of the random generator.
+
+#     Returns
+#     -------
+#     current_model : list
+#         The parameters of the sub-DNN model
+#     scaler_x : list of Scikit-learn StandardScalers
+#         The scalers for the continuous input variables.
+#     scaler_y : Scikit-learn StandardScaler
+#         The scaler for the continuous output variable.
+#     pred_v : ndarray
+#         The predictions of the sub-DNN model.
+#     loss : float
+#         The loss score of the sub-DNN model.
+#     """
+
+#     pred_v = np.empty(X.shape[0])
+#     # Sampling and Train/Validate splitting
+#     (
+#         X_train_scaled,
+#         y_train_scaled,
+#         X_validation_scaled,
+#         y_validation_scaled,
+#         X_scaled,
+#         y_validation,
+#         scaler_x,
+#         scaler_y,
+#         valid_ind,
+#     ) = create_X_y(
+#         X,
+#         y,
+#         sampling_with_repetition=sampling_with_repetition,
+#         split_percentage=split_percentage,
+#         problem_type=problem_type,
+#         list_continuous=list_continuous,
+#         random_state=random_state,
+#     )
+
+#     current_model = dnn_net(
+#         X_train_scaled,
+#         y_train_scaled,
+#         X_validation_scaled,
+#         y_validation_scaled,
+#         problem_type=problem_type,
+#         n_epoch=n_epoch,
+#         batch_size=batch_size,
+#         beta1=beta1,
+#         beta2=beta2,
+#         lr=lr,
+#         l1_weight=l1_weight,
+#         l2_weight=l2_weight,
+#         epsilon=epsilon,
+#         list_grps=list_grps,
+#         group_stacking=group_stacking,
+#         input_dimensions=input_dimensions,
+#         random_state=random_state,
+#     )
+
+#     if not group_stacking:
+#         X_scaled_n = X_scaled.copy()
+#     else:
+#         X_scaled_n = np.zeros((X_scaled.shape[0], input_dimensions[-1]))
+#         for grp_ind in range(len(list_grps)):
+#             n_layer_stacking = len(current_model[3][grp_ind]) - 1
+#             curr_pred = X_scaled[:, list_grps[grp_ind]].copy()
+#             for ind_w_b in range(n_layer_stacking):
+#                 if ind_w_b == 0:
+#                     curr_pred = relu(
+#                         X_scaled[:, list_grps[grp_ind]].dot(
+#                             current_model[3][grp_ind][ind_w_b]
+#                         )
+#                         + current_model[4][grp_ind][ind_w_b]
+#                     )
+#                 else:
+#                     curr_pred = relu(
+#                         curr_pred.dot(current_model[3][grp_ind][ind_w_b])
+#                         + current_model[4][grp_ind][ind_w_b]
+#                     )
+#             X_scaled_n[
+#                 :,
+#                 list(
+#                     np.arange(input_dimensions[grp_ind], input_dimensions[grp_ind + 1])
+#                 ),
+#             ] = (
+#                 curr_pred.dot(current_model[3][grp_ind][n_layer_stacking])
+#                 + current_model[4][grp_ind][n_layer_stacking]
+#             )
+
+#     n_layer = len(current_model[0]) - 1
+#     for j in range(n_layer):
+#         if j == 0:
+#             pred = relu(X_scaled_n.dot(current_model[0][j]) + current_model[1][j])
+#         else:
+#             pred = relu(pred.dot(current_model[0][j]) + current_model[1][j])
+
+#     pred = pred.dot(current_model[0][n_layer]) + current_model[1][n_layer]
+
+#     if problem_type not in ("classification", "binary"):
+#         if problem_type != "ordinal":
+#             pred_v = pred * scaler_y.scale_ + scaler_y.mean_
+#         else:
+#             pred_v = activation_outcome[problem_type](pred)
+#         loss = np.std(y_validation) ** 2 - mean_squared_error(
+#             y_validation, pred_v[valid_ind]
+#         )
+#     else:
+#         pred_v = activation_outcome[problem_type](pred)
+#         loss = log_loss(
+#             y_validation, np.ones(y_validation.shape) * np.mean(y_validation, axis=0)
+#         ) - log_loss(y_validation, pred_v[valid_ind])
+
+#     return (current_model, scaler_x, scaler_y, pred_v, loss)
+
+
+# def initialize_weights(layer):
+#     if isinstance(layer, nn.Linear):
+#         layer.weight.data = (layer.weight.data.uniform_() - 0.5) * 0.2
+#         layer.bias.data = (layer.bias.data.uniform_() - 0.5) * 0.1
+
+
+# def Dataset_Loader(X, y, shuffle=False, batch_size=50):
+#     if y.shape[-1] == 2:
+#         y = y[:, [1]]
+#     dataset = torch.utils.data.TensorDataset(
+#         torch.from_numpy(X).float(), torch.from_numpy(y).float()
+#     )
+
+#     loader = torch.utils.data.DataLoader(
+#         dataset, batch_size=batch_size, shuffle=shuffle
+#     )
+#     return loader
+
+
+# class DNN(nn.Module):
+#     """
+#     Feedfoward Neural Network with 4 hidden layers
+#     """
+
+#     def __init__(
+#         self, input_dim, group_stacking, list_grps, output_dimension, problem_type
+#     ):
+#         super().__init__()
+#         if problem_type == "classification":
+#             self.accuracy = Accuracy(task="multiclass", num_classes=output_dimension)
+#         else:
+#             self.accuracy = Accuracy(task="binary")
+#         self.list_grps = list_grps
+#         self.group_stacking = group_stacking
+#         if group_stacking:
+#             self.layers_stacking = nn.ModuleList(
+#                 [
+#                     nn.Linear(
+#                         in_features=len(grp),
+#                         out_features=input_dim[grp_ind + 1] - input_dim[grp_ind],
+#                     )
+#                     # nn.Sequential(
+#                     #     nn.Linear(
+#                     #         in_features=len(grp),
+#                     #         # out_features=max(1, int(0.1 * len(grp))),
+#                     #         out_features=input_dim[grp_ind + 1]
+#                     #         - input_dim[grp_ind],
+#                     #     ),
+#                     #     nn.ReLU(),
+#                     #     nn.Linear(
+#                     #         in_features=max(1, int(0.1 * len(grp))),
+#                     #         out_features=input_dim[grp_ind + 1]
+#                     #         - input_dim[grp_ind],
+#                     #     ),
+#                     #     nn.ReLU(),
+#                     #     nn.Linear(
+#                     #         in_features=max(1, int(0.1 * len(grp))),
+#                     #         out_features=input_dim[grp_ind + 1]
+#                     #         - input_dim[grp_ind],
+#                     #     ),
+#                     # )
+#                     for grp_ind, grp in enumerate(list_grps)
+#                 ]
+#             )
+#             input_dim = input_dim[-1]
+#         self.layers = nn.Sequential(
+#             # hidden layers
+#             nn.Linear(input_dim, 50),
+#             nn.ReLU(),
+#             nn.Linear(50, 40),
+#             nn.ReLU(),
+#             nn.Linear(40, 30),
+#             nn.ReLU(),
+#             nn.Linear(30, 20),
+#             nn.ReLU(),
+#             # output layer
+#             nn.Linear(20, output_dimension),
+#         )
+#         self.loss = 0
+
+#     def forward(self, x):
+#         if self.group_stacking:
+#             list_stacking = [None] * len(self.layers_stacking)
+#             for ind_layer, layer in enumerate(self.layers_stacking):
+#                 list_stacking[ind_layer] = layer(x[:, self.list_grps[ind_layer]])
+#             x = torch.cat(list_stacking, dim=1)
+#         return self.layers(x)
+
+#     def training_step(self, batch, device, problem_type):
+#         X, y = batch[0].to(device), batch[1].to(device)
+#         y_pred = self(X)  # Generate predictions
+#         if problem_type == "regression":
+#             loss = F.mse_loss(y_pred, y)
+#         elif problem_type == "classification":
+#             loss = F.cross_entropy(y_pred, y)  # Calculate loss
+#         else:
+#             loss = F.binary_cross_entropy_with_logits(y_pred, y)
+#         return loss
+
+#     def validation_step(self, batch, device, problem_type):
+#         X, y = batch[0].to(device), batch[1].to(device)
+#         y_pred = self(X)  # Generate predictions
+#         if problem_type == "regression":
+#             loss = F.mse_loss(y_pred, y)
+#             return {
+#                 "val_mse": loss,
+#                 "batch_size": len(X),
+#             }
+#         else:
+#             if problem_type == "classification":
+#                 loss = F.cross_entropy(y_pred, y)  # Calculate loss
+#             else:
+#                 loss = F.binary_cross_entropy_with_logits(y_pred, y)
+#             acc = self.accuracy(y_pred, y.int())
+#             return {
+#                 "val_loss": loss,
+#                 "val_acc": acc,
+#                 "batch_size": len(X),
+#             }
+
+#     def validation_epoch_end(self, outputs, problem_type):
+#         if problem_type in ("classification", "binary"):
+#             batch_losses = []
+#             batch_accs = []
+#             batch_sizes = []
+#             for x in outputs:
+#                 batch_losses.append(x["val_loss"] * x["batch_size"])
+#                 batch_accs.append(x["val_acc"] * x["batch_size"])
+#                 batch_sizes.append(x["batch_size"])
+#             self.loss = torch.stack(batch_losses).sum().item() / np.sum(
+#                 batch_sizes
+#             )  # Combine losses
+#             epoch_acc = torch.stack(batch_accs).sum().item() / np.sum(
+#                 batch_sizes
+#             )  # Combine accuracies
+#             return {"val_loss": self.loss, "val_acc": epoch_acc}
+#         else:
+#             batch_losses = [x["val_mse"] * x["batch_size"] for x in outputs]
+#             batch_sizes = [x["batch_size"] for x in outputs]
+#             self.loss = torch.stack(batch_losses).sum().item() / np.sum(
+#                 batch_sizes
+#             )  # Combine losses
+#             return {"val_mse": self.loss}
+
+#     def epoch_end(self, epoch, result):
+#         if len(result) == 2:
+#             print(
+#                 "Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}".format(
+#                     epoch + 1, result["val_loss"], result["val_acc"]
+#                 )
+#             )
+#         else:
+#             print("Epoch [{}], val_mse: {:.4f}".format(epoch + 1, result["val_mse"]))
+
+
+# def evaluate(model, loader, device, problem_type):
+#     outputs = [model.validation_step(batch, device, problem_type) for batch in loader]
+#     return model.validation_epoch_end(outputs, problem_type)
+
+
+# def dnn_net(
+#     X_train,
+#     y_train,
+#     X_validation,
+#     y_validation,
+#     problem_type="regression",
+#     n_epoch=200,
+#     batch_size=32,
+#     batch_size_validation=128,
+#     beta1=0.9,
+#     beta2=0.999,
+#     lr=1e-3,
+#     l1_weight=1e-2,
+#     l2_weight=1e-2,
+#     epsilon=1e-8,
+#     list_grps=None,
+#     group_stacking=False,
+#     input_dimensions=None,
+#     random_state=2023,
+#     verbose=0,
+# ):
+#     """
+#     This function implements the training/validation process of the sub-DNN
+#     models
+
+#     Parameters
+#     ----------
+#     X_train : {array-like, sparse matrix} of shape (n_train_samples, n_features)
+#         The training input samples.
+#     y_train : {array-like} of shape (n_train_samples, )
+#         The training output samples.
+#     X_validation : {array-like, sparse matrix} of shape (n_validation_samples, n_features)
+#         The validation input samples.
+#     y_validation : {array-like} of shape (n_validation_samples, )
+#         The validation output samples.
+#     problem_type : str, default='regression'
+#         A classification or a regression problem.
+#     n_epoch : int, default=200
+#         The number of epochs for the DNN learner(s).
+#     batch_size : int, default=32
+#         The number of samples per batch for training.
+#     batch_size_validation : int, default=128
+#         The number of samples per batch for validation.
+#     beta1 : float, default=0.9
+#         The exponential decay rate for the first moment estimates.
+#     beta2 : float, default=0.999
+#         The exponential decay rate for the second moment estimates.
+#     lr : float, default=1e-3
+#         The learning rate.
+#     l1_weight : float, default=1e-2
+#         The L1-regularization paramter for weight decay.
+#     l2_weight : float, default=0
+#         The L2-regularization paramter for weight decay.
+#     epsilon : float, default=1e-8
+#         A small constant added to the denominator to prevent division by zero.
+#     list_grps : list of lists, default=None
+#         A list collecting the indices of the groups' variables
+#         while applying the stacking method.
+#     group_stacking : bool, default=False
+#         Apply the stacking-based method for the provided groups.
+#     input_dimensions : list, default=None
+#         The cumsum of inputs after the linear sub-layers.
+#     random_state : int, default=2023
+#         Fixing the seeds of the random generator.
+#     verbose : int, default=0
+#         If verbose > 0, the fitted iterations will be printed.
+#     """
+#     # Creating DataLoaders
+#     train_loader = Dataset_Loader(
+#         X_train,
+#         y_train,
+#         shuffle=True,
+#         batch_size=batch_size,
+#     )
+#     validate_loader = Dataset_Loader(
+#         X_validation, y_validation, batch_size=batch_size_validation
+#     )
+#     # Set the seed for PyTorch's random number generator
+#     torch.manual_seed(random_state)
+
+#     # Set the seed for PyTorch's CUDA random number generator(s), if available
+#     if torch.cuda.is_available():
+#         torch.cuda.manual_seed(random_state)
+#         torch.cuda.manual_seed_all(random_state)
+
+#     # Specify whether to use GPU or CPU
+#     # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#     device = torch.device("cpu")
+
+#     if problem_type in ("regression", "binary"):
+#         output_dimension = 1
+#     else:
+#         output_dimension = y_train.shape[-1]
+
+#     # DNN model
+#     input_dim = input_dimensions.copy() if group_stacking else X_train.shape[1]
+#     model = DNN(input_dim, group_stacking, list_grps, output_dimension, problem_type)
+#     model.to(device)
+#     # Initializing weights/bias
+#     model.apply(initialize_weights)
+#     # Adam Optimizer
+#     optimizer = torch.optim.Adam(
+#         model.parameters(), lr=lr, betas=(beta1, beta2), eps=epsilon
+#     )
+
+#     best_loss = 1e100
+#     for epoch in range(n_epoch):
+#         # Training Phase
+#         model.train()
+#         for batch in train_loader:
+#             optimizer.zero_grad()
+#             loss = model.training_step(batch, device, problem_type)
+
+#             loss.backward()
+#             optimizer.step()
+#             for name, param in model.named_parameters():
+#                 if "bias" not in name:
+#                     # if name.split(".")[0] == "layers_stacking":
+#                     #     param.data -= l2_weight * param.data
+#                     # else:
+#                     param.data -= (
+#                         l1_weight * torch.sign(param.data) + l2_weight * param.data
+#                     )
+#         # Validation Phase
+#         model.eval()
+#         result = evaluate(model, validate_loader, device, problem_type)
+#         if model.loss < best_loss:
+#             best_loss = model.loss
+#             dict_params = copy.deepcopy(model.state_dict())
+#         if verbose >= 2:
+#             model.epoch_end(epoch, result)
+
+#     best_weight = []
+#     best_bias = []
+#     best_weight_stack = [[].copy() for _ in range(len(list_grps))]
+#     best_bias_stack = [[].copy() for _ in range(len(list_grps))]
+
+#     for name, param in dict_params.items():
+#         if name.split(".")[0] == "layers":
+#             if name.split(".")[-1] == "weight":
+#                 best_weight.append(param.numpy().T)
+#             if name.split(".")[-1] == "bias":
+#                 best_bias.append(param.numpy()[np.newaxis, :])
+#         if name.split(".")[0] == "layers_stacking":
+#             curr_ind = int(name.split(".")[1])
+#             if name.split(".")[-1] == "weight":
+#                 best_weight_stack[curr_ind].append(param.numpy().T)
+#             if name.split(".")[-1] == "bias":
+#                 best_bias_stack[curr_ind].append(param.numpy()[np.newaxis, :])
+
+#     return [
+#         best_weight,
+#         best_bias,
+#         best_loss,
+#         best_weight_stack,
+#         best_bias_stack,
+#     ]
+
+
+# def compute_imp_std(pred_scores):
+#     weights = np.array([el.shape[-2] for el in pred_scores])
+#     # Compute the mean of each fold over the number of observations
+#     pred_mean = np.array([np.mean(el.copy(), axis=-2) for el in pred_scores])
+
+#     # Weighted average
+#     imp = np.average(pred_mean, axis=0, weights=weights)
+
+#     # Compute the standard deviation of each fold
+#     # over the number of observations
+#     pred_std = np.array(
+#         [
+#             np.mean(
+#                 (el - imp[..., np.newaxis]) ** 2,
+#                 axis=-2,
+#             )
+#             for el in pred_scores
+#         ]
+#     )
+#     std = np.sqrt(np.average(pred_std, axis=0, weights=weights) / (np.sum(weights) - 1))
+#     return (imp, std)