diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 4eaaa9c..91fd70f 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -14,24 +14,25 @@ jobs: - uses: actions/checkout@v2 - uses: mamba-org/setup-micromamba@v1 with: - environment-file: environment.yml + environment-file: environment_torch.yml activate-environment: test - shell: bash -l {0} run: | - pip install --upgrade keras conda info conda list conda config --show-sources conda config --show printenv | sort - - name: Lint with flake8 + - name: Lint with ruff shell: bash -l {0} run: | - micromamba install flake8 + micromamba install ruff # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + ruff . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=100 --max-line-length=127 --statistics + ruff . --count --exit-zero --max-complexity=100 --max-line-length=127 --statistics + # Checking documentation errors + ruff . --count --select=D --exit-zero --max-complexity=100 --max-line-length=127 --statistics - name: Test with pytest shell: bash -l {0} run: | diff --git a/mlguess/keras/data.py b/mlguess/keras/data.py index 9921f86..6df462c 100644 --- a/mlguess/keras/data.py +++ b/mlguess/keras/data.py @@ -86,8 +86,7 @@ def preprocess_data( groups=[], seed=1000, ): - """ - Function to select features and scale data for ML + """Function to select features and scale data for ML Args: data (dictionary of dataframes for training and validation data): input_features (list): Input features diff --git a/mlguess/keras/deprecated/losses.py b/mlguess/keras/deprecated/losses.py index e43cffb..98f9172 100644 --- a/mlguess/keras/deprecated/losses.py +++ b/mlguess/keras/deprecated/losses.py @@ -15,8 +15,8 @@ class DirichletEvidentialLoss(keras.losses.Loss): - """ - Loss function for an evidential categorical model. + """Loss function for an evidential categorical model. + Args: callback (list): List of callbacks. name (str): reference name @@ -76,8 +76,7 @@ def __call__(self, y, output, sample_weight=None): return ops.mean(A + B + C) class EvidentialRegressionLoss(keras.losses.Loss): - """ - Loss function for an evidential regression model. The total loss is the Negative Log Likelihood of the + """Loss function for an evidential regression model. The total loss is the Negative Log Likelihood of the Normal Inverse Gamma summed with the error and scaled by the evidential coefficient. The coefficient has a strong influence on the uncertainty predictions (less so for the predictions themselves) of the model and must be tuned for individual datasets. @@ -120,8 +119,8 @@ def get_config(self): return config def gaussian_nll(y, y_pred, reduce=True): - """ - Loss function for a parametric Gaussian Loss. + """Loss function for a parametric Gaussian Loss. + Args: y: Training data targets y_pred: Model predicitons @@ -139,8 +138,7 @@ def gaussian_nll(y, y_pred, reduce=True): class EvidentialRegressionCoupledLoss(keras.losses.Loss): def __init__(self, r=1.0, coeff=1.0): - """ - implementation of the loss from meinert and lavin that fixes issues with the original + """Implementation of the loss from meinert and lavin that fixes issues with the original evidential loss for regression. The loss couples the virtual evidence values with coefficient r. 
In this new loss, the regularizer is unnecessary. """ diff --git a/mlguess/keras/deprecated/models.py b/mlguess/keras/deprecated/models.py index e5b296d..0540405 100644 --- a/mlguess/keras/deprecated/models.py +++ b/mlguess/keras/deprecated/models.py @@ -19,8 +19,8 @@ import logging class BaseRegressor(object): - """ - A base class for regression models. + """A base class for regression models. + Attributes: hidden_layers: Number of hidden layers hidden_neurons: Number of neurons in each hidden layer @@ -97,8 +97,7 @@ def __init__( self.history = None def build_neural_network(self, inputs, outputs, last_layer="Dense"): - """ - Create Keras neural network model and compile it. + """Create Keras neural network model and compile it. Args: inputs (int): Number of input predictor variables. @@ -169,8 +168,7 @@ def build_neural_network(self, inputs, outputs, last_layer="Dense"): ) def build_from_sequential(self, model, optimizer="adam", loss="mse", metrics=None): - """ - Build the neural network model using a Keras Sequential model. + """Build the neural network model using a Keras Sequential model. Args: model (tf.keras.Sequential): Keras Sequential model to use. @@ -204,8 +202,8 @@ def fit( shuffle=True, **kwargs, ): - """ - Fit the regression model. + """Fit the regression model. + Args: x: Input data y: Target data @@ -217,7 +215,6 @@ def fit( use_multiprocessing: If True, use ProcessPoolExecutor to load data, which is faster but can cause issues with certain GPU setups. If False, use a ThreadPoolExecutor. **kwargs: Additional arguments to be passed to the `fit` method """ - if self.model is None: raise ValueError("Model has not been built. Call build_neural_network first.") if self.verbose: @@ -240,8 +237,7 @@ def fit( ) def save_model(self): - """ - Save the trained model to a file. + """Save the trained model to a file. """ if not os.path.exists(self.save_path): os.makedirs(self.save_path) @@ -259,8 +255,7 @@ def save_model(self): @classmethod def load_model(cls, conf): - """ - Load a trained model using args from a configuration + """Load a trained model using args from a configuration """ # Check if weights file exists weights = os.path.join(conf["model"]["save_path"], "best.h5") @@ -306,7 +301,7 @@ def load_model(cls, conf): return model_class def mae(self, y_true, y_pred): - """ Compute the MAE """ + """Compute the MAE""" num_splits = y_pred.shape[-1] if num_splits == 4: mu, _, _, _ = ops.split(y_pred, num_splits, axis=-1) @@ -317,7 +312,7 @@ def mae(self, y_true, y_pred): return keras.metrics.mean_absolute_error(y_true, mu) def mse(self, y_true, y_pred): - """ Compute the MSE """ + """Compute the MSE""" num_splits = y_pred.shape[-1] if num_splits == 4: mu, _, _, _ = ops.split(y_pred, num_splits, axis=-1) @@ -329,8 +324,7 @@ def mse(self, y_true, y_pred): return keras.metrics.mean_squared_error(y_true, mu) def predict(self, x, scaler=None, batch_size=None): - """ - Predict target values for input data. + """Predict target values for input data. Args: x (numpy.ndarray): Input data. @@ -350,8 +344,7 @@ def predict(self, x, scaler=None, batch_size=None): return y_out def predict_ensemble(self, x, batch_size=None, scaler=None, num_outputs=1): - """ - Predicts outcomes using an ensemble of trained Keras models. + """Predicts outcomes using an ensemble of trained Keras models. Args: x (numpy.ndarray): Input data for predictions. 
@@ -429,8 +422,7 @@ def predict_ensemble(self, x, batch_size=None, scaler=None, num_outputs=1): return ensemble_mu, ensemble_ale, ensemble_epi def predict_monte_carlo(self, x_test, forward_passes, scaler=None, batch_size=None, num_outputs=1): - """ - Perform Monte Carlo dropout predictions for the model. + """Perform Monte Carlo dropout predictions for the model. Args: x_test (numpy.ndarray): Input data for prediction. @@ -442,7 +434,6 @@ def predict_monte_carlo(self, x_test, forward_passes, scaler=None, batch_size=No Returns: tuple: Tuple of arrays containing predicted target values and specified uncertainties. """ - n_samples = x_test.shape[0] pred_size = self.model.output_shape[-1] _batch_size = self.batch_size if batch_size is None else batch_size @@ -531,8 +522,7 @@ def __init__( class GaussianRegressorDNN(BaseRegressor): - """ - A Dense Neural Network Model that can support arbitrary numbers of hidden layers + """A Dense Neural Network Model that can support arbitrary numbers of hidden layers and provides evidential uncertainty estimation. Inherits from BaseRegressor. @@ -581,8 +571,7 @@ def __init__( metrics=None, eps=1e-7 ): - """ - Initialize the EvidentialRegressorDNN. + """Initialize the EvidentialRegressorDNN. Args: coupling_coef: Coupling coeffient for loss fix @@ -617,8 +606,7 @@ def __init__( self.loss = gaussian_nll def build_neural_network(self, inputs, outputs, last_layer="DenseNormal"): - """ - Create Keras neural network model and compile it. + """Create Keras neural network model and compile it. Args: inputs (int): Number of input predictor variables. @@ -714,8 +702,7 @@ def predict_monte_carlo(self, x_test, forward_passes, scaler=None, batch_size=No class EvidentialRegressorDNN(BaseRegressor): - """ - A Dense Neural Network Model that can support arbitrary numbers of hidden layers + """A Dense Neural Network Model that can support arbitrary numbers of hidden layers and provides evidential uncertainty estimation. Inherits from BaseRegressor. @@ -765,8 +752,7 @@ def __init__( metrics=None, eps=1e-7 ): - """ - Initialize the EvidentialRegressorDNN. + """Initialize the EvidentialRegressorDNN. Args: coupling_coef: Coupling coeffient for loss fix @@ -817,8 +803,7 @@ def __init__( logging.info(f"Using loss: {loss}") def build_neural_network(self, inputs, outputs): - """ - Create Keras neural network model and compile it. + """Create Keras neural network model and compile it. Args: inputs (int): Number of input predictor variables. @@ -932,8 +917,8 @@ def predict_monte_carlo( class CategoricalDNN(object): - """ - A Dense Neural Network Model that can support arbitrary numbers of hidden layers. + """A Dense Neural Network Model that can support arbitrary numbers of hidden layers. + Attributes: hidden_layers: Number of hidden layers hidden_neurons: Number of neurons in each hidden layer @@ -1022,8 +1007,8 @@ def __init__( self.steps_per_epoch = steps_per_epoch def build_neural_network(self, inputs, outputs): - """ - Create Keras neural network model and compile it. + """Create Keras neural network model and compile it. + Args: inputs (int): Number of input predictor variables outputs (int): Number of output predictor variables @@ -1081,8 +1066,7 @@ def build_neural_network(self, inputs, outputs): self.model.compile(optimizer=self.optimizer_obj, loss=self.loss) def build_from_sequential(self, model, optimizer="adam", loss="mse", metrics=None): - """ - Build the neural network model using a Keras Sequential model. 
+ """Build the neural network model using a Keras Sequential model. Args: model (tf.keras.Sequential): Keras Sequential model to use. diff --git a/mlguess/keras/layers.py b/mlguess/keras/layers.py index 7bf018b..0315359 100644 --- a/mlguess/keras/layers.py +++ b/mlguess/keras/layers.py @@ -7,8 +7,7 @@ @keras.saving.register_keras_serializable() class DenseNormalGamma(layers.Layer): - """ - Implements dense output layer for a deep evidential regression model. + """Implements dense output layer for a deep evidential regression model. Reference: https://www.mit.edu/~amini/pubs/pdf/deep-evidential-regression.pdf Source: https://github.com/aamini/evidential-deep-learning @@ -36,8 +35,7 @@ def __init__(self, units: int, self.eps = eps def evidence(self, x): - """ - Converts values from continuous space to greater than 0 using a softplus activation function. + """Converts values from continuous space to greater than 0 using a softplus activation function. Args: x: input value @@ -65,8 +63,7 @@ def get_config(self): class DenseNormal(layers.Layer): - """ - Dense output layer for a Gaussian distribution regression neural network. + """Dense output layer for a Gaussian distribution regression neural network. Args: units (int): Output size of regression tasks diff --git a/mlguess/keras/losses.py b/mlguess/keras/losses.py index bdc96a6..17f1686 100644 --- a/mlguess/keras/losses.py +++ b/mlguess/keras/losses.py @@ -57,8 +57,7 @@ def loss(y, y_pred): @keras.saving.register_keras_serializable() def evidential_reg_loss(evi_coef): - """ - Loss function for an evidential regression model. The total loss is the Negative Log Likelihood of the + """Loss function for an evidential regression model. The total loss is the Negative Log Likelihood of the Normal Inverse Gamma summed with the error and scaled by the evidential coefficient. The coefficient has a strong influence on the uncertainty predictions (less so for the predictions themselves) of the model and must be tuned for individual datasets. @@ -97,8 +96,8 @@ def loss(y, y_pred): @keras.saving.register_keras_serializable() def gaussian_nll(y, y_pred, reduce=True): - """ - Loss function for a parametric Gaussian Loss. + """Loss function for a parametric Gaussian Loss. + Args: y: Training data targets y_pred: Model predicitons @@ -116,8 +115,7 @@ def gaussian_nll(y, y_pred, reduce=True): class EvidentialRegressionCoupledLoss(keras.losses.Loss): def __init__(self, r=1.0, coeff=1.0): - """ - implementation of the loss from meinert and lavin that fixes issues with the original + """Implementation of the loss from meinert and lavin that fixes issues with the original evidential loss for regression. The loss couples the virtual evidence values with coefficient r. In this new loss, the regularizer is unnecessary. """ diff --git a/mlguess/keras/models.py b/mlguess/keras/models.py index 304bb05..e97475f 100644 --- a/mlguess/keras/models.py +++ b/mlguess/keras/models.py @@ -13,8 +13,7 @@ @keras.saving.register_keras_serializable() class CategoricalDNN(keras.models.Model): - """ - A Categorical Dense Neural Network Model that can support arbitrary numbers of hidden layers + """A Categorical Dense Neural Network Model that can support arbitrary numbers of hidden layers and the ability to provide evidential uncertainty estimation. 
Attributes: @@ -42,8 +41,7 @@ class CategoricalDNN(keras.models.Model): verbose: Level of detail to provide during training (0 = None, 1 = Minimal, 2 = All) classifier: (boolean) If training on classes - Example: - + Example: When evidential==True, the output activation and the loss function will be overridden under the hood. When evidential==False, it will use the parameters specified and ignore the annealing_coeff. Note: Model compilation happens under the hood when .fit() is called. @@ -196,12 +194,12 @@ def fit(self, x=None, y=None, **kwargs): return hist def predict(self, x, return_uncertainties=True, **kwargs): - """ - Args: + """Args: x: Input data batch_size: Size of batch to predict return_uncertainties: Returns derived uncertainties from evidential distribution parameters. If False, return the probabilities only. + Returns: If return_uncertainties is True (tuple): (probs, u (evidential uncertainty), aleatoric, epistemic) Else If return_uncertainties is False: probs @@ -262,8 +260,7 @@ def get_config(self): class RegressorDNN(keras.models.Model): - """ - A Dense Neural Network Model that can support arbitrary numbers of hidden layers + """A Dense Neural Network Model that can support arbitrary numbers of hidden layers and the ability to provide evidential uncertainty estimation or uncertainty estimation through a gaussian parametric approach. @@ -285,8 +282,7 @@ class RegressorDNN(keras.models.Model): evi_coeff: Evidential regularization coefficient. metrics: Optional list of metrics to monitor during training. - Example: - + Example: When evidential==True or uncertainty==True, the output activation and the loss function will be overridden under the hood. If both are True, the evidential model will override. When both are set to False, it will train a generic DNN with a linear output activation and the specified loss function. @@ -433,12 +429,12 @@ def fit(self, x=None, y=None, **kwargs): return hist def predict(self, x, return_uncertainties=True, batch_size=1000, **kwargs): - """ - Args: + """Args: x: Input data batch_size: Size of batch to predict return_uncertainties: Returns derived uncertainties from evidential distribution parameters. If False, return the raw parameters themselves (mu, gamma, alpha, beta). + Returns: If return_uncertainties is True: np.array(mu, aleatoric uncertainty, epistemic uncertainty) Else If return_uncertainties is False: np.array(mu, gamma, alpha, beta) diff --git a/mlguess/pbs.py b/mlguess/pbs.py index cba2b3a..117ac4b 100644 --- a/mlguess/pbs.py +++ b/mlguess/pbs.py @@ -7,8 +7,7 @@ def launch_pbs_jobs(config_file, trainer_path, args=''): - """ - Launches a PBS job using the specified configuration file and trainer script. + """Launches a PBS job using the specified configuration file and trainer script. This function reads the configuration file to construct a PBS script, writes the script to a file, submits the job using `qsub`, and then cleans up the script file. @@ -22,7 +21,6 @@ def launch_pbs_jobs(config_file, trainer_path, args=''): ValueError: If the 'pbs' section is not present in the configuration file. """ - # Load configuration file with open(config_file, "r") as f: config = yaml.safe_load(f) @@ -74,8 +72,7 @@ def launch_pbs_jobs(config_file, trainer_path, args=''): def launch_distributed_jobs(config_file, script_path, launch=True): - """ - Launches a distributed job across multiple nodes using PBS and MPI. + """Launches a distributed job across multiple nodes using PBS and MPI. 
This function generates a PBS script based on the provided configuration file, copies the necessary files, and optionally submits the job to the queue. @@ -86,7 +83,6 @@ def launch_distributed_jobs(config_file, script_path, launch=True): launch (bool, optional): If True, submits the job using `qsub`. If False, only generates the script. Defaults to True. """ - with open(config_file) as cf: config = yaml.load(cf, Loader=yaml.FullLoader) diff --git a/mlguess/pit.py b/mlguess/pit.py index c6df00b..24ae2b2 100644 --- a/mlguess/pit.py +++ b/mlguess/pit.py @@ -20,7 +20,6 @@ def pit_histogram(y_true, y_pred, pred_type="ensemble", bins=10): Returns: pit_hist, pit_bins """ - if pred_type == "gaussian": assert len(y_pred.shape) == 2 and y_pred.shape[1] == 2, "Pred shape is incorrect for Gaussian distribution" pit_quantiles = probability_integral_transform_gaussian(y_true, y_pred) @@ -35,8 +34,7 @@ def pit_histogram(y_true, y_pred, pred_type="ensemble", bins=10): def pit_deviation(y_true, y_pred, pred_type="ensemble", bins=10): - """ - Runs pit_histogram then calculates the pit_deviation. See docstring for pit_histogram. + """Runs pit_histogram then calculates the pit_deviation. See docstring for pit_histogram. """ pit_hist, pit_bins = pit_histogram(y_true, y_pred, pred_type=pred_type, bins=bins) @@ -46,16 +44,14 @@ def pit_deviation(y_true, y_pred, pred_type="ensemble", bins=10): return pit_deviation def pit_deviation_worst(n_bins): - """ - Calculate the worst possible PITD score based on the number of bins. Assumes all the forecasts + """Calculate the worst possible PITD score based on the number of bins. Assumes all the forecasts end up in one of the outermost bins. """ return np.sqrt(1 / n_bins * ((n_bins - 1) * (1 / n_bins) ** 2 + (1 - 1 / n_bins) ** 2)) def pit_deviation_skill_score(y_true, y_pred, pred_type="ensemble", bins=10): - """ - Calculate PITD score relative to the worst possible PITD for a given number of bins. + """Calculate PITD score relative to the worst possible PITD for a given number of bins. Ranges from 0 to 1. """ pitd_score = pit_deviation(y_true, y_pred, pred_type=pred_type, bins=bins) @@ -64,14 +60,13 @@ def pit_deviation_skill_score(y_true, y_pred, pred_type="ensemble", bins=10): def probability_integral_transform_ensemble(y_true, y_pred_ens): - """ - Calculate the probability integral transform quantiles for an ensemble of predictions + """Calculate the probability integral transform quantiles for an ensemble of predictions Args: y_true: true values with shape (n_samples,) y_pred_ens: predicted ensemble values with shape (n_samples, n_ensemble_members) - Returns: + Returns: pit_quantiles: for each sample, the true value's quantile in the predicted distribution. """ pit_quantiles = np.zeros(y_true.shape) @@ -81,14 +76,13 @@ def probability_integral_transform_ensemble(y_true, y_pred_ens): def probability_integral_transform_gaussian(y_true, y_pred_gaussian): - """ - Calculate the probability integral transform quantiles for a single Gaussian distribution. + """Calculate the probability integral transform quantiles for a single Gaussian distribution. Args: y_true: true values with shape (n_samples,) y_pred_gaussian: predicted Gaussian parameters (mean, stand. dev.) with shape (n_samples, n_params) - Returns: + Returns: pit_quantiles: for each sample, the true value's quantile in the predicted distribution. 
""" pit_quantiles = np.zeros(y_true.shape) @@ -98,14 +92,13 @@ def probability_integral_transform_gaussian(y_true, y_pred_gaussian): def pit_gaussian_ensemble(y_true, y_pred_gauss_ens): - """ - Calculate the probability integral transform quantile for an ensemble of Gaussian parametric models + """Calculate the probability integral transform quantile for an ensemble of Gaussian parametric models Args: y_true: true values with shape (n_samples,) y_pred_gauss_ens: ensemble of gaussian predictions (mean and standard deviation) with shape (n_samples, n_params, n_members) - Returns: + Returns: pit_quantiles: for each sample, the true value's quantile in the predicted distribution. """ pit_quantiles_members = np.zeros((y_true.shape[0], y_pred_gauss_ens.shape[-1])) diff --git a/mlguess/plotting.py b/mlguess/plotting.py index 76eb869..8560f30 100644 --- a/mlguess/plotting.py +++ b/mlguess/plotting.py @@ -7,8 +7,7 @@ def plot_confusion_matrix(y_true, y_pred, classes, model_name, normalize=False, title=None, cmap=plt.cm.Blues, filename=None): - """ - Function to plot a confusion matrix. + """Function to plot a confusion matrix. """ if not title: if normalize: diff --git a/mlguess/regression_metrics.py b/mlguess/regression_metrics.py index 1114633..17699b2 100644 --- a/mlguess/regression_metrics.py +++ b/mlguess/regression_metrics.py @@ -8,8 +8,7 @@ def regression_metrics(y_true, y_pred, total=None, split="val"): - """ - Compute common regression metrics for continuous data. + """Compute common regression metrics for continuous data. Parameters: y_true (array-like): True target values. diff --git a/mlguess/regression_uq.py b/mlguess/regression_uq.py index 4c30bde..6531ddd 100644 --- a/mlguess/regression_uq.py +++ b/mlguess/regression_uq.py @@ -299,8 +299,7 @@ def plot_uncertainties( def compute_skill_score(y_true, y_pred, y_std, num_bins=10): - """ - Computes the skill score with RMSE on the y-axis and binned spread on the x-axis. + """Computes the skill score with RMSE on the y-axis and binned spread on the x-axis. Parameters ---------- @@ -313,14 +312,13 @@ def compute_skill_score(y_true, y_pred, y_std, num_bins=10): num_bins : int, optional The number of bins to use for binning the spread. - Returns + Returns: ------- ss : array-like A 2D array of skill scores. bins : array-like A 1D array of bin edges for the spread. """ - # Bin the spread spread_min, spread_max = np.percentile(y_std, [5, 95]) if spread_max - spread_min > 20: @@ -350,8 +348,7 @@ def plot_skill_score( legend_cols=None, save_location=False, ): - """ - Plots the skill score with RMSE on the y-axis and binned spread on the x-axis. + """Plots the skill score with RMSE on the y-axis and binned spread on the x-axis. Parameters ---------- diff --git a/mlguess/torch/checkpoint.py b/mlguess/torch/checkpoint.py index 3ced647..e770a01 100644 --- a/mlguess/torch/checkpoint.py +++ b/mlguess/torch/checkpoint.py @@ -21,8 +21,7 @@ # utils def load_model_state(conf, model, device): - """ - Load the model state from a checkpoint file. + """Load the model state from a checkpoint file. This function restores the model state from a saved checkpoint. It supports loading models from different distributed training modes such as Fully Sharded Data Parallel (FSDP), Distributed Data Parallel (DDP), @@ -43,7 +42,6 @@ def load_model_state(conf, model, device): FileNotFoundError: If the checkpoint file does not exist at the specified location. KeyError: If the checkpoint file does not contain the expected keys. 
""" - save_loc = os.path.expandvars(conf['save_loc']) # Load an optimizer, gradient scaler, and learning rate scheduler, the optimizer must come after wrapping model using FSDP ckpt = os.path.join(save_loc, "checkpoint.pt") @@ -63,8 +61,7 @@ def load_model_state(conf, model, device): def save_state_dict(state_dict: dict, checkpoint_file_path: str, use_safetensors: bool) -> None: - """ - Save state dict to checkpoint. + """Save state dict to checkpoint. Args: state_dict (dict): state dict. @@ -87,8 +84,7 @@ def save_state_dict(state_dict: dict, checkpoint_file_path: str, use_safetensors def load_state_dict(checkpoint_file_path: Path): - """ - Load state dict from checkpoint. + """Load state dict from checkpoint. Args: checkpoint_file_path (Path): path to the checkpoint file. @@ -96,7 +92,6 @@ def load_state_dict(checkpoint_file_path: Path): Returns: dict: state dict. """ - assert not is_dtensor_checkpoint( checkpoint_file_path ), f"Cannot load state dict from dtensor checkpoint {checkpoint_file_path}, you should convert the distributed tensors to gathered tensors with our CLI offline." @@ -120,8 +115,7 @@ def load_state_dict(checkpoint_file_path: Path): def is_dtensor_checkpoint(checkpoint_file_path: str) -> bool: - """ - Check whether the checkpoint file is a dtensor checkpoint. + """Check whether the checkpoint file is a dtensor checkpoint. Args: checkpoint_file_path (str): path to the checkpoint file. @@ -136,8 +130,7 @@ def is_dtensor_checkpoint(checkpoint_file_path: str) -> bool: def is_safetensor_checkpoint(checkpoint_file_path: str) -> bool: - """ - Check whether the checkpoint file is a safetensor checkpoint. + """Check whether the checkpoint file is a safetensor checkpoint. Args: checkpoint_file_path (str): path to the checkpoint file. @@ -152,8 +145,7 @@ def is_safetensor_checkpoint(checkpoint_file_path: str) -> bool: def is_safetensors_available() -> bool: - """ - Check whether safetensors is available. + """Check whether safetensors is available. Returns: bool: whether safetensors is available. @@ -165,8 +157,7 @@ def is_safetensors_available() -> bool: class TorchFSDPCheckpointIO: - """ - Handles loading and saving of checkpoints for models and optimizers + """Handles loading and saving of checkpoints for models and optimizers using Fully Sharded Data Parallel (FSDP) in PyTorch. This class provides methods to load unsharded models and optimizers from @@ -199,8 +190,7 @@ def load_unsharded_optimizer(self, optimizer, checkpoint): optimizer.load_state_dict(sharded_osd) def save_unsharded_model(self, model, checkpoint, gather_dtensor, use_safetensors, rank): - """ - Save model to checkpoint but only on master process. + """Save model to checkpoint but only on master process. """ model = model.unwrap() cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) @@ -210,8 +200,7 @@ def save_unsharded_model(self, model, checkpoint, gather_dtensor, use_safetensor save_state_dict(full_model_state, checkpoint_file_path=checkpoint, use_safetensors=use_safetensors) def save_unsharded_optimizer(self, optimizer, checkpoint, gather_dtensor, rank): - """ - Save optimizer to checkpoint but only on master process. + """Save optimizer to checkpoint but only on master process. """ fsdp_model = optimizer.unwrap_model() full_optimizer_state = FSDP.optim_state_dict(fsdp_model, optim=optimizer) @@ -220,8 +209,7 @@ def save_unsharded_optimizer(self, optimizer, checkpoint, gather_dtensor, rank): class ModelWrapper(nn.Module): - """ - A wrapper class to define the common interface used FSDP. 
+ """A wrapper class to define the common interface used FSDP. Args: module (nn.Module): The model to be wrapped. @@ -232,8 +220,7 @@ def __init__(self, module: nn.Module) -> None: self.module = module def unwrap(self): - """ - Unwrap the model to return the original model for checkpoint saving/loading. + """Unwrap the model to return the original model for checkpoint saving/loading. """ if isinstance(self.module, ModelWrapper): return self.module.unwrap() @@ -258,8 +245,7 @@ def concat_and_reshape(self, x1, x2): # to be removed when data is updated class OptimizerWrapper: - """ - A standard interface for optimizers wrapped by the Booster. + """A standard interface for optimizers wrapped by the Booster. Args: optim (Optimizer): The optimizer to be wrapped. @@ -288,20 +274,17 @@ def add_param_group(self, *args, **kwargs): return self.optim.add_param_group(*args, **kwargs) def step(self, *args, **kwargs): - """ - Performs a single optimization step. + """Performs a single optimization step. """ return self.optim.step(*args, **kwargs) def zero_grad(self, *args, **kwargs): - """ - Clears the gradients of all optimized `torch.Tensor`. + """Clears the gradients of all optimized `torch.Tensor`. """ self.optim.zero_grad(*args, **kwargs) def backward(self, loss: Tensor, *args, **kwargs): - """ - Performs a backward pass on the loss. + """Performs a backward pass on the loss. """ loss.backward(*args, **kwargs) @@ -309,20 +292,17 @@ def backward_by_grad(self, tensor: Tensor, grad: Tensor): torch.autograd.backward(tensor, grad) def state_dict(self): - """ - Returns the optimizer state. + """Returns the optimizer state. """ return self.optim.state_dict() def load_state_dict(self, *args, **kwargs): - """ - Loads the optimizer state. + """Loads the optimizer state. """ self.optim.load_state_dict(*args, **kwargs) def clip_grad_by_value(self, clip_value: float, *args, **kwargs) -> None: - """ - Clips gradient of an iterable of parameters at specified min and max values. + """Clips gradient of an iterable of parameters at specified min and max values. Args: clip_value (float or int): maximum allowed value of the gradients. Gradients are clipped in the range @@ -341,8 +321,7 @@ def clip_grad_by_norm( *args, **kwargs, ) -> Tensor: - """ - Clips gradient norm of an iterable of parameters. + """Clips gradient norm of an iterable of parameters. Args: max_norm (float or int): max norm of the gradients @@ -357,8 +336,7 @@ def clip_grad_by_norm( return norm def scale_loss(self, loss: Tensor): - """ - Scales the loss for mixed precision training. + """Scales the loss for mixed precision training. Note: Only available for optimizers with mixed precision training. @@ -370,8 +348,7 @@ def scale_loss(self, loss: Tensor): ) def unscale_grad(self): - """ - Unscale the gradients for mixed precision training. + """Unscale the gradients for mixed precision training. Note: Only available for optimizers with mixed precision training. """ @@ -380,8 +357,7 @@ def unscale_grad(self): ) def unwrap(self): - """ - Unwrap the optimizer for checkpoint saving/loading. + """Unwrap the optimizer for checkpoint saving/loading. """ return self.optim diff --git a/mlguess/torch/class_losses.py b/mlguess/torch/class_losses.py index 8aa02df..dd4856b 100644 --- a/mlguess/torch/class_losses.py +++ b/mlguess/torch/class_losses.py @@ -5,8 +5,7 @@ def get_device(): - """ - Get the device for PyTorch operations. + """Get the device for PyTorch operations. Returns: torch.device: The device to use, either "cuda" if CUDA is available, otherwise "cpu". 
@@ -17,8 +16,7 @@ def get_device(): def relu_evidence(y): - """ - Apply the ReLU activation function to the input tensor. + """Apply the ReLU activation function to the input tensor. Args: y (torch.Tensor): Input tensor. @@ -30,8 +28,7 @@ def relu_evidence(y): def exp_evidence(y): - """ - Apply the exponential function to the input tensor with clamping. + """Apply the exponential function to the input tensor with clamping. Args: y (torch.Tensor): Input tensor. @@ -43,8 +40,7 @@ def exp_evidence(y): def softplus_evidence(y): - """ - Apply the Softplus activation function to the input tensor. + """Apply the Softplus activation function to the input tensor. Args: y (torch.Tensor): Input tensor. @@ -56,8 +52,7 @@ def softplus_evidence(y): def kl_divergence(alpha, num_classes, device=None): - """ - Compute the Kullback-Leibler divergence for a Dirichlet distribution. + """Compute the Kullback-Leibler divergence for a Dirichlet distribution. Args: alpha (torch.Tensor): The Dirichlet parameters (alpha). @@ -87,8 +82,7 @@ def kl_divergence(alpha, num_classes, device=None): def loglikelihood_loss(y, alpha, device=None): - """ - Compute the log-likelihood loss for a Dirichlet distribution. + """Compute the log-likelihood loss for a Dirichlet distribution. Args: y (torch.Tensor): Target values. @@ -112,8 +106,7 @@ def loglikelihood_loss(y, alpha, device=None): def mse_loss(y, alpha, epoch_num, num_classes, annealing_step, device=None): - """ - Compute the mean squared error loss with KL divergence for Dirichlet distributions. + """Compute the mean squared error loss with KL divergence for Dirichlet distributions. Args: y (torch.Tensor): Target values. @@ -143,8 +136,7 @@ def mse_loss(y, alpha, epoch_num, num_classes, annealing_step, device=None): def edl_loss(func, y, alpha, epoch_num, num_classes, annealing_step, weights=None, device=None): - """ - Compute the Evidence Deep Learning (EDL) loss. + """Compute the Evidence Deep Learning (EDL) loss. Args: func (callable): Function to apply to alpha (e.g., log, softplus). @@ -179,8 +171,7 @@ def edl_loss(func, y, alpha, epoch_num, num_classes, annealing_step, weights=Non def edl_mse_loss(output, target, epoch_num, num_classes, annealing_step, weights=None, device=None): - """ - Compute the Evidence Deep Learning (EDL) loss with mean squared error. + """Compute the Evidence Deep Learning (EDL) loss with mean squared error. Args: output (torch.Tensor): Model output tensor. @@ -205,8 +196,7 @@ def edl_mse_loss(output, target, epoch_num, num_classes, annealing_step, weights def edl_log_loss(output, target, epoch_num, num_classes, annealing_step, weights=None, device=None): - """ - Compute the Evidence Deep Learning (EDL) loss with the logarithm of evidence. + """Compute the Evidence Deep Learning (EDL) loss with the logarithm of evidence. Args: output (torch.Tensor): Model output tensor. @@ -235,8 +225,7 @@ def edl_log_loss(output, target, epoch_num, num_classes, annealing_step, weights def edl_digamma_loss( output, target, epoch_num, num_classes, annealing_step, weights=None, device=None ): - """ - Compute the Evidence Deep Learning (EDL) loss with the digamma function of evidence. + """Compute the Evidence Deep Learning (EDL) loss with the digamma function of evidence. Args: output (torch.Tensor): Model output tensor. 
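For orientation on the `class_losses.py` hunks above: these evidential-classification losses operate on Dirichlet parameters derived from non-negative evidence (e.g. `relu_evidence`), with class probabilities and a predictive uncertainty read off the concentration parameters. A minimal, self-contained PyTorch sketch of that standard bookkeeping follows; the helper name `dirichlet_from_logits` and the toy shapes are illustrative and not part of `mlguess` itself.

```python
import torch
import torch.nn.functional as F

def dirichlet_from_logits(logits: torch.Tensor):
    """Map raw logits to Dirichlet parameters and derived quantities.

    Standard evidential deep learning convention (evidence -> alpha = evidence + 1),
    shown here for context only.
    """
    evidence = F.relu(logits)                  # non-negative evidence per class
    alpha = evidence + 1.0                     # Dirichlet concentration parameters
    strength = alpha.sum(dim=-1, keepdim=True)
    probs = alpha / strength                   # expected class probabilities
    uncertainty = logits.shape[-1] / strength  # vacuity-style predictive uncertainty
    return alpha, probs, uncertainty

if __name__ == "__main__":
    logits = torch.randn(8, 4)                 # toy batch: 8 samples, 4 classes
    alpha, probs, u = dirichlet_from_logits(logits)
    print(probs.sum(dim=-1))                   # each row sums to 1
    print(u.squeeze())                         # larger when total evidence is small
```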
diff --git a/mlguess/torch/distributed.py b/mlguess/torch/distributed.py index 914a43a..a1208b9 100644 --- a/mlguess/torch/distributed.py +++ b/mlguess/torch/distributed.py @@ -25,8 +25,7 @@ def distributed_model_wrapper(conf, neural_network, device): - """ - Wraps a neural network model in a distributed training wrapper (FSDP or DDP) based on configuration. + """Wraps a neural network model in a distributed training wrapper (FSDP or DDP) based on configuration. Args: conf (dict): Configuration dictionary specifying the training setup, including the model type, @@ -37,7 +36,6 @@ def distributed_model_wrapper(conf, neural_network, device): Returns: torch.nn.Module: The distributed model wrapped according to the configuration. """ - # convert $USER to the actual user name conf['save_loc'] = os.path.expandvars(conf['save_loc']) diff --git a/mlguess/torch/layers.py b/mlguess/torch/layers.py index 86df96b..053f4c2 100644 --- a/mlguess/torch/layers.py +++ b/mlguess/torch/layers.py @@ -5,9 +5,7 @@ class LinearNormalGamma(nn.Module): - - """ - A linear layer with a Normal-Gamma distribution parameterization. + """A linear layer with a Normal-Gamma distribution parameterization. This module applies a linear transformation to the input, followed by reshaping and parameter extraction for a Normal-Gamma distribution. The @@ -23,15 +21,12 @@ class LinearNormalGamma(nn.Module): """ def __init__(self, in_channels, out_channels, spectral_norm=True): - - """ - Initializes the LinearNormalGamma module. + """Initializes the LinearNormalGamma module. Args: in_channels (int): The number of input features. out_channels (int): The number of output features. """ - super().__init__() if spectral_norm: self.linear = SpectralNorm(nn.Linear(in_channels, out_channels*4)) @@ -39,8 +34,7 @@ def __init__(self, in_channels, out_channels, spectral_norm=True): self.linear = nn.Linear(in_channels, out_channels*4) def evidence(self, x): - """ - Applies a log transformation to the input with a shift. + """Applies a log transformation to the input with a shift. Args: x (torch.Tensor): The input tensor. @@ -51,8 +45,7 @@ def evidence(self, x): return torch.log(torch.exp(x) + 1) def forward(self, x): - """ - Forward pass of the module. + """Forward pass of the module. Args: x (torch.Tensor): The input tensor of shape (batch_size, in_channels). 
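A small aside on the `layers.py` hunk just above: the `evidence` transform written as `torch.log(torch.exp(x) + 1)` is the softplus function. A short sketch with toy inputs, confirming the equivalence against the numerically stabler built-in:

```python
import torch
import torch.nn.functional as F

x = torch.linspace(-5.0, 5.0, steps=11)             # toy inputs
manual = torch.log(torch.exp(x) + 1)                # as written in LinearNormalGamma.evidence
builtin = F.softplus(x)                             # numerically stable equivalent
print(torch.allclose(manual, builtin, atol=1e-6))   # True
```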
diff --git a/mlguess/torch/mc_dropout.py b/mlguess/torch/mc_dropout.py index 2a8a3b6..ffbc6d4 100644 --- a/mlguess/torch/mc_dropout.py +++ b/mlguess/torch/mc_dropout.py @@ -6,7 +6,7 @@ def enable_dropout(model): - """ Function to enable the dropout layers during test-time """ + """Function to enable the dropout layers during test-time""" for m in model.modules(): if m.__class__.__name__.startswith('Dropout'): m.train() @@ -18,8 +18,7 @@ def monte_carlo_dropout(data_loader, n_samples, batch_size=1024, uncertainty=False): - - """ Function to get the monte-carlo samples and uncertainty estimates + """Function to get the monte-carlo samples and uncertainty estimates through multiple forward passes Parameters diff --git a/mlguess/torch/metrics.py b/mlguess/torch/metrics.py index f5068b7..bd84442 100644 --- a/mlguess/torch/metrics.py +++ b/mlguess/torch/metrics.py @@ -3,8 +3,7 @@ from hagelslag.evaluation.ProbabilityMetrics import DistributedROC class MetricsCalculator: - """ - A class to calculate various metrics for model evaluation, including CSI, average accuracy, + """A class to calculate various metrics for model evaluation, including CSI, average accuracy, precision, recall, F1 score, AUC, MCE, and ECE. Args: @@ -29,8 +28,7 @@ class MetricsCalculator: """ def __init__(self, n_bins=10, use_uncertainty=False): - """ - Initializes the MetricsCalculator with the specified number of bins and uncertainty flag. + """Initializes the MetricsCalculator with the specified number of bins and uncertainty flag. Args: n_bins (int, optional): Number of bins for MCE and ECE calculations. Default is 10. @@ -42,8 +40,7 @@ def __init__(self, n_bins=10, use_uncertainty=False): self.bin_uppers = bin_boundaries[1:] def __call__(self, y_true, y_pred, split="train"): - """ - Computes various metrics based on the true and predicted values. + """Computes various metrics based on the true and predicted values. Args: y_true (torch.Tensor): Tensor of true labels (one-hot encoded). @@ -80,8 +77,7 @@ def __call__(self, y_true, y_pred, split="train"): return logs def mean_csi(self, y, pred_probs): - """ - Computes the mean Critical Success Index (CSI) for the predicted probabilities. + """Computes the mean Critical Success Index (CSI) for the predicted probabilities. Args: y (numpy.ndarray): Array of true labels (one-hot encoded). @@ -104,8 +100,7 @@ def mean_csi(self, y, pred_probs): return np.mean(rocs) def ave_acc(self, true_labels, pred_labels): - """ - Computes the average accuracy for the true and predicted labels. + """Computes the average accuracy for the true and predicted labels. Args: true_labels (numpy.ndarray): Array of true labels. @@ -125,8 +120,7 @@ def ave_acc(self, true_labels, pred_labels): ) def mce(self, true_labels, pred_probs): - """ - Computes the Maximum Calibration Error (MCE) for the predicted probabilities. + """Computes the Maximum Calibration Error (MCE) for the predicted probabilities. Args: true_labels (numpy.ndarray): Array of true labels. @@ -158,8 +152,7 @@ def mce(self, true_labels, pred_probs): return mce if mce != 0.0 else self.bin_lowers.shape[0] def ece(self, true_labels, pred_probs): - """ - Computes the Expected Calibration Error (ECE) for the predicted probabilities. + """Computes the Expected Calibration Error (ECE) for the predicted probabilities. Args: true_labels (numpy.ndarray): Array of true labels. 
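The `metrics.py` hunks above touch the calibration docstrings (ECE/MCE). For reference, a minimal NumPy sketch of the standard expected calibration error that this style of confidence binning computes; the function name and toy data are illustrative and not the class's own implementation:

```python
import numpy as np

def expected_calibration_error(confidences, predictions, labels, n_bins=10):
    """Standard ECE: bin-weighted average of |accuracy - confidence|."""
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (confidences > lo) & (confidences <= hi)
        if not in_bin.any():
            continue
        acc = (predictions[in_bin] == labels[in_bin]).mean()   # bin accuracy
        conf = confidences[in_bin].mean()                      # bin mean confidence
        ece += in_bin.mean() * abs(acc - conf)
    return ece

# toy usage
rng = np.random.default_rng(0)
probs = rng.random((100, 4))
probs /= probs.sum(axis=1, keepdims=True)
labels = rng.integers(0, 4, size=100)
print(expected_calibration_error(probs.max(axis=1), probs.argmax(axis=1), labels))
```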
diff --git a/mlguess/torch/models.py b/mlguess/torch/models.py index c80b222..9117b82 100644 --- a/mlguess/torch/models.py +++ b/mlguess/torch/models.py @@ -16,8 +16,7 @@ def get_device(): - """ - Determine the computing device to use. + """Determine the computing device to use. Checks if CUDA is available and returns the appropriate device (either "cuda" or "cpu"). @@ -31,8 +30,7 @@ def get_device(): def seed_everything(seed=1234): - """ - Seed all random number generators for reproducibility. + """Seed all random number generators for reproducibility. Args: seed (int): The seed value to use for all random number generators. Default is 1234. @@ -55,8 +53,7 @@ def seed_everything(seed=1234): def init_weights(net, init_type='normal', init_gain=0.0, verbose=True): - """ - Initialize network weights using the specified method. + """Initialize network weights using the specified method. Args: net (nn.Module): The network whose weights are to be initialized. @@ -75,8 +72,7 @@ def init_weights(net, init_type='normal', init_gain=0.0, verbose=True): - 'orthogonal': Orthogonal initialization. """ def init_func(m): - """ - Initialization function for network layers. + """Initialization function for network layers. Args: m (nn.Module): The module to be initialized. @@ -108,19 +104,18 @@ def init_func(m): class DNN(nn.Module): - """ - Initialize the Deep Neural Network (DNN) model. + """Initialize the Deep Neural Network (DNN) model. - Args: - input_size (int or list of int): Number of input features or a list of sizes for each input. - output_size (int or list of int): Number of output features or a list of sizes for each output. - layer_size (list of int): List of sizes for hidden layers. Default is [1000]. - dr (list of float): Dropout rates for each layer. Default is [0.5]. - batch_norm (bool): Whether to use batch normalization. Default is True. - lng (bool): Whether to use LinearNormalGamma layer at the end. Default is False. - weight_init (bool): Whether to initialize weights. Default is False. - num_layers (int): Number of layers to create if layer_size is a single number. Default is None. - """ + Args: + input_size (int or list of int): Number of input features or a list of sizes for each input. + output_size (int or list of int): Number of output features or a list of sizes for each output. + layer_size (list of int): List of sizes for hidden layers. Default is [1000]. + dr (list of float): Dropout rates for each layer. Default is [0.5]. + batch_norm (bool): Whether to use batch normalization. Default is True. + lng (bool): Whether to use LinearNormalGamma layer at the end. Default is False. + weight_init (bool): Whether to initialize weights. Default is False. + num_layers (int): Number of layers to create if layer_size is a single number. Default is None. + """ def __init__(self, input_size, output_size, @@ -166,8 +161,7 @@ def __init__(self, self.apply(self.init_weights) def block(self, input_size, output_size, dr, batch_norm): - """ - Create a block of layers for the network. + """Create a block of layers for the network. Args: input_size (int): Number of input features. @@ -187,8 +181,7 @@ def block(self, input_size, output_size, dr, batch_norm): return block def forward(self, x): - """ - Perform a forward pass through the network. + """Perform a forward pass through the network. Args: x (torch.Tensor): Input tensor. @@ -200,8 +193,7 @@ def forward(self, x): return x def load_weights(self, weights_path: str) -> None: - """ - Load model weights from a file. 
+ """Load model weights from a file. Args: weights_path (str): Path to the weights file (.pt). @@ -223,8 +215,7 @@ def load_weights(self, weights_path: str) -> None: ) def predict(self, input, y_scaler=None, return_uncertainties=True, return_tuple=False): - """ - Make predictions with the model. + """Make predictions with the model. Args: input (torch.Tensor): Input tensor. @@ -244,8 +235,7 @@ def predict(self, input, y_scaler=None, return_uncertainties=True, return_tuple= return output def predict_uncertainty(self, input, y_scaler=None): - """ - Estimate uncertainties of predictions. + """Estimate uncertainties of predictions. Args: input (tuple of torch.Tensor): Tuple containing (mu, v, alpha, beta) tensors. @@ -286,8 +276,7 @@ def predict_uncertainty(self, input, y_scaler=None): return mu, aleatoric, epistemic, aleatoric + epistemic def predict_dropout(self, x, mc_forward_passes=10, batch_size=16): - """ - Perform Monte Carlo Dropout predictions. + """Perform Monte Carlo Dropout predictions. Args: x (torch.Tensor): Input tensor. @@ -326,8 +315,7 @@ def predict_dropout(self, x, mc_forward_passes=10, batch_size=16): @classmethod def from_config(cls, conf, device="cpu"): - """ - Create a model instance from configuration. + """Create a model instance from configuration. Args: conf (dict): Configuration dictionary with model parameters. diff --git a/mlguess/torch/regression_losses.py b/mlguess/torch/regression_losses.py index 385d7a0..54dd5b5 100644 --- a/mlguess/torch/regression_losses.py +++ b/mlguess/torch/regression_losses.py @@ -1,4 +1,4 @@ -""" Torch losses for regression models """ +"""Torch losses for regression models""" import numpy as np import torch @@ -8,8 +8,7 @@ class EvidentialRegressionLoss: - """ - Class for computing Evidential Regression Loss, which includes the Normal Inverse Gamma negative log likelihood + """Class for computing Evidential Regression Loss, which includes the Normal Inverse Gamma negative log likelihood and a regularization term. Args: @@ -19,8 +18,7 @@ def __init__(self, coef=1.0): self.coef = coef def normal_inverse_gamma_nll(self, y, gamma, v, alpha, beta): - """ - Compute the Normal Inverse Gamma Negative Log Likelihood (NLL) for Deep Evidential Regression. + """Compute the Normal Inverse Gamma Negative Log Likelihood (NLL) for Deep Evidential Regression. Args: y (torch.Tensor): Target values. @@ -45,8 +43,7 @@ def normal_inverse_gamma_nll(self, y, gamma, v, alpha, beta): return nll def normal_inverse_gamma_reg(self, y, gamma, v, alpha, beta): - """ - Compute the Normal Inverse Gamma Regularizer for Deep Evidential Regression. + """Compute the Normal Inverse Gamma Regularizer for Deep Evidential Regression. Args: y (torch.Tensor): Target values. @@ -63,8 +60,7 @@ def normal_inverse_gamma_reg(self, y, gamma, v, alpha, beta): return error * evi def __call__(self, gamma, v, alpha, beta, y): - """ - Compute the total Evidential Regression Loss which is the sum of the negative log likelihood and the regularization term. + """Compute the total Evidential Regression Loss which is the sum of the negative log likelihood and the regularization term. Args: gamma (torch.Tensor): Mean of the Normal-Inverse Gamma distribution. @@ -85,8 +81,7 @@ def __call__(self, gamma, v, alpha, beta, y): def modified_mse(gamma, nu, alpha, beta, target, reduction='mean'): - """ - Compute the Lipschitz Mean Squared Error (MSE) loss as described in "Improving Evidential Deep Learning via Multitask Learning." 
+ """Compute the Lipschitz Mean Squared Error (MSE) loss as described in "Improving Evidential Deep Learning via Multitask Learning." Args: gamma (torch.Tensor): Output of the evidential network. @@ -102,7 +97,6 @@ def modified_mse(gamma, nu, alpha, beta, target, reduction='mean'): Reference: https://www.mit.edu/~amini/pubs/pdf/deep-evidential-regression.pdf Source: https://github.com/deargen/MT-ENet/tree/468822188f52e517b1ee8e386eea607b2b7d8829 """ - mse = (gamma-target)**2 c = get_mse_coef(gamma, nu, alpha, beta, target).detach() mod_mse = mse*c @@ -116,8 +110,7 @@ def modified_mse(gamma, nu, alpha, beta, target, reduction='mean'): def get_mse_coef(gamma, nu, alpha, beta, y): - """ - Return the coefficient of the MSE loss for each prediction. + """Return the coefficient of the MSE loss for each prediction. By assigning the coefficient to each MSE value, it clips the gradient of the MSE based on the threshold values U_nu, U_alpha, which are calculated by check_mse_efficiency_* functions. @@ -130,6 +123,7 @@ def get_mse_coef(gamma, nu, alpha, beta, y): alpha ([FloatTensor]): the output of the ENet. beta ([FloatTensor]): the output of the ENet. y ([FloatTensor]): true labels. + Returns: [FloatTensor]: [0.0-1.0], the coefficient of the MSE for each prediction. """ @@ -142,8 +136,7 @@ def get_mse_coef(gamma, nu, alpha, beta, y): def check_mse_efficiency_alpha(nu, alpha, beta): - """ - Check the MSE loss (gamma - y)^2 can make negative gradients for alpha, which is + """Check the MSE loss (gamma - y)^2 can make negative gradients for alpha, which is a pseudo observation of the normal-inverse-gamma. We can use this to check the MSE loss can success(increase the pseudo observation, alpha). @@ -165,8 +158,7 @@ def check_mse_efficiency_alpha(nu, alpha, beta): def check_mse_efficiency_nu(gamma, nu, alpha, beta): - """ - Check the MSE loss (gamma - y)^2 can make negative gradients for nu, which is + """Check the MSE loss (gamma - y)^2 can make negative gradients for nu, which is a pseudo observation of the normal-inverse-gamma. We can use this to check the MSE loss can success(increase the pseudo observation, nu). @@ -189,8 +181,7 @@ def check_mse_efficiency_nu(gamma, nu, alpha, beta): class EvidentialMarginalLikelihood(torch.nn.modules.loss._Loss): - """ - Marginal likelihood error of prior network. + """Marginal likelihood error of prior network. The target value is not a distribution (mu, std), but a just value. This is a negative log marginal likelihood, with integral mu and sigma. @@ -203,8 +194,7 @@ def __init__(self, size_average=None, reduce=None, reduction: str = 'mean'): def forward(self, gamma: torch.Tensor, nu: torch.Tensor, alpha: torch.Tensor, beta: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """ - Args: + """Args: gamma (torch.Tensor): gamma output value of the evidential network nu (torch.Tensor): nu output value of the evidential network alpha (torch.Tensor): alpha output value of the evidential network @@ -233,8 +223,7 @@ def forward(self, gamma: torch.Tensor, nu: torch.Tensor, alpha: torch.Tensor, be class EvidenceRegularizer(torch.nn.modules.loss._Loss): - """ - Regularization for the regression prior network. + """Regularization for the regression prior network. If self.factor increases, the model output the wider(high confidence interval) predictions. 
Reference: https://www.mit.edu/~amini/pubs/pdf/deep-evidential-regression.pdf @@ -246,8 +235,7 @@ def __init__(self, size_average=None, reduce=None, reduction: str = 'mean', coef def forward(self, gamma: torch.Tensor, nu: torch.Tensor, alpha: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - """ - Args: + """Args: gamma (torch.Tensor): gamma output value of the evidential network nu (torch.Tensor): nu output value of the evidential network alpha (torch.Tensor): alpha output value of the evidential network @@ -268,8 +256,7 @@ def forward(self, gamma: torch.Tensor, nu: torch.Tensor, alpha: torch.Tensor, class LipschitzMSELoss(torch.nn.Module): - """ - Compute the Lipschitz MSE loss, which includes the Evidential Marginal Likelihood, Evidence Regularizer, + """Compute the Lipschitz MSE loss, which includes the Evidential Marginal Likelihood, Evidence Regularizer, and a modified MSE term. Args: @@ -286,8 +273,7 @@ def __init__(self, tol=1e-8, coef=0.1, reduction='mean'): self.evidence_regularizer = EvidenceRegularizer(coef=coef, reduction=reduction) def forward(self, gamma, nu, alpha, beta, target): - """ - Compute the total Lipschitz MSE Loss. + """Compute the total Lipschitz MSE Loss. Args: gamma (torch.Tensor): Output value of the evidential network for gamma. diff --git a/mlguess/torch/scheduler.py b/mlguess/torch/scheduler.py index 3b8e67f..0aaeb01 100644 --- a/mlguess/torch/scheduler.py +++ b/mlguess/torch/scheduler.py @@ -10,8 +10,7 @@ def load_scheduler(optimizer, conf): - """ - Load a learning rate scheduler based on the configuration. + """Load a learning rate scheduler based on the configuration. Parameters: - optimizer: The PyTorch optimizer. @@ -41,8 +40,7 @@ def load_scheduler(optimizer, conf): # Define a half-cosine decay learning rate schedule for the second phase def lr_lambda_phase2(step, total_updates_phase2=299000): - """ - This function implements a half-cosine decay learning rate schedule + """This function implements a half-cosine decay learning rate schedule specifically for the second training phase. Args: @@ -60,8 +58,7 @@ def lr_lambda_phase2(step, total_updates_phase2=299000): # Combine the learning rate schedules def phased_lr_lambda(step, total_updates_phase1=1000, total_updates_phase2=299000): - """ - This function combines two learning rate schedules for a phased training + """This function combines two learning rate schedules for a phased training process. Args: @@ -82,8 +79,7 @@ def phased_lr_lambda(step, total_updates_phase1=1000, total_updates_phase2=29900 def lr_lambda_phase1(epoch, num_epochs=100, warmup_epochs=10): - """ - This function implements a learning rate schedule based on the reference + """This function implements a learning rate schedule based on the reference paper (https://arxiv.org/pdf/2312.03876.pdf) for the first training phase. Args: @@ -107,15 +103,14 @@ def lr_lambda_phase1(epoch, num_epochs=100, warmup_epochs=10): class CosineAnnealingWarmupRestarts(LRScheduler): - """ - optimizer (Optimizer): Wrapped optimizer. - first_cycle_steps (int): First cycle step size. - cycle_mult(float): Cycle steps magnification. Default: -1. - max_lr(float): First cycle's max learning rate. Default: 0.1. - min_lr(float): Min learning rate. Default: 0.001. - warmup_steps(int): Linear warmup step size. Default: 0. - gamma(float): Decrease rate of max learning rate by cycle. Default: 1. - last_epoch (int): The index of last epoch. Default: -1. + """optimizer (Optimizer): Wrapped optimizer. + first_cycle_steps (int): First cycle step size. 
+ cycle_mult(float): Cycle steps magnification. Default: -1. + max_lr(float): First cycle's max learning rate. Default: 0.1. + min_lr(float): Min learning rate. Default: 0.001. + warmup_steps(int): Linear warmup step size. Default: 0. + gamma(float): Decrease rate of max learning rate by cycle. Default: 1. + last_epoch (int): The index of last epoch. Default: -1. """ def __init__( @@ -193,8 +188,7 @@ def step(self, epoch=None): def annealed_probability(epoch, max_epochs=100, min_probability=0.01, max_probability=1.0): - """ - Anneal the termination probability from 1 to a small value. + """Anneal the termination probability from 1 to a small value. Parameters: - epoch: The current epoch. @@ -205,7 +199,6 @@ def annealed_probability(epoch, max_epochs=100, min_probability=0.01, max_probab Returns: - termination_probability: The annealed termination probability. """ - # Linear annealing schedule termination_probability = 1.0 - (epoch / max_epochs) * (1.0 - min_probability) diff --git a/mlguess/torch/trainer_classifier.py b/mlguess/torch/trainer_classifier.py index ab9f7eb..e691060 100644 --- a/mlguess/torch/trainer_classifier.py +++ b/mlguess/torch/trainer_classifier.py @@ -16,8 +16,7 @@ def cleanup(): - """ - Clean up and destroy the process group for distributed training. + """Clean up and destroy the process group for distributed training. This function is used to release resources and finalize the distributed training environment by destroying the process group. It should be called at the end of distributed training. @@ -25,13 +24,11 @@ def cleanup(): Returns: None """ - dist.destroy_process_group() def accum_log(log, new_logs): - """ - Accumulate new log values into the existing log dictionary. + """Accumulate new log values into the existing log dictionary. Args: log (dict): The existing log dictionary to which new values will be added. @@ -46,7 +43,6 @@ def accum_log(log, new_logs): updated_log = accum_log(old_log, new_log) # updated_log will be {'loss': 1.5, 'accuracy': 1.7} """ - for key, new_value in new_logs.items(): old_value = log.get(key, 0.) log[key] = old_value + new_value @@ -61,15 +57,13 @@ def one_hot_embedding(labels, num_classes=4): class Trainer: def __init__(self, model, rank, module=False, uncertainty=False): - """ - Initialize the Trainer class. + """Initialize the Trainer class. Args: model (nn.Module): The model to be trained. rank (int): The rank of the current process (used for distributed training). module (bool): Whether the model is wrapped in a `DistributedDataParallel` module. Default is False. """ - super(Trainer, self).__init__() self.model = model self.rank = rank @@ -92,8 +86,7 @@ def train_one_epoch( scheduler, metrics ): - """ - Train the model for one epoch. + """Train the model for one epoch. Args: epoch (int): The current epoch number. @@ -109,7 +102,6 @@ def train_one_epoch( Returns: dict: Dictionary containing training metrics for the epoch. """ - batches_per_epoch = conf['trainer']['batches_per_epoch'] grad_accum_every = conf['trainer']['grad_accum_every'] amp = conf['trainer']['amp'] @@ -222,8 +214,7 @@ def validate( criterion, metrics ): - """ - Validate the model on the validation dataset. + """Validate the model on the validation dataset. Args: epoch (int): The current epoch number. @@ -236,7 +227,6 @@ def validate( Returns: dict: Dictionary containing validation metrics for the epoch. 
""" - self.model.eval() valid_batches_per_epoch = conf['trainer']['valid_batches_per_epoch'] @@ -323,8 +313,7 @@ def predict( metrics, split=None ): - """ - Make predictions with the model on the test dataset. + """Make predictions with the model on the test dataset. Args: conf (dict): Configuration dictionary containing prediction settings. @@ -339,7 +328,6 @@ def predict( - predictions (torch.Tensor): The model's predictions on the test dataset. - metrics (dict): Dictionary containing evaluation metrics. """ - self.model.eval() distributed = True if conf["trainer"]["mode"] in ["fsdp", "ddp"] else False @@ -437,8 +425,7 @@ def fit( metrics, trial=False ): - """ - Train and validate the model. + """Train and validate the model. Args: conf (dict): Configuration dictionary containing training and validation settings. @@ -456,7 +443,6 @@ def fit( Returns: dict: Dictionary containing training and validation metrics. """ - save_loc = conf['save_loc'] start_epoch = conf['trainer']['start_epoch'] epochs = conf['trainer']['epochs'] diff --git a/mlguess/torch/trainer_regression.py b/mlguess/torch/trainer_regression.py index e330dfb..8d1bd44 100644 --- a/mlguess/torch/trainer_regression.py +++ b/mlguess/torch/trainer_regression.py @@ -16,8 +16,7 @@ def cleanup(): - """ - Clean up and destroy the process group for distributed training. + """Clean up and destroy the process group for distributed training. This function is used to release resources and finalize the distributed training environment by destroying the process group. It should be called at the end of distributed training. @@ -25,13 +24,11 @@ def cleanup(): Returns: None """ - dist.destroy_process_group() def accum_log(log, new_logs): - """ - Accumulate new log values into the existing log dictionary. + """Accumulate new log values into the existing log dictionary. Args: log (dict): The existing log dictionary to which new values will be added. @@ -46,7 +43,6 @@ def accum_log(log, new_logs): updated_log = accum_log(old_log, new_log) # updated_log will be {'loss': 1.5, 'accuracy': 1.7} """ - for key, new_value in new_logs.items(): old_value = log.get(key, 0.) log[key] = old_value + new_value @@ -55,15 +51,13 @@ def accum_log(log, new_logs): class Trainer: def __init__(self, model, rank, module=False): - """ - Initialize the Trainer class. + """Initialize the Trainer class. Args: model (nn.Module): The model to be trained. rank (int): The rank of the current process (used for distributed training). module (bool): Whether the model is wrapped in a `DistributedDataParallel` module. Default is False. """ - super(Trainer, self).__init__() self.model = model self.rank = rank @@ -85,8 +79,7 @@ def train_one_epoch( metrics, transform=None ): - """ - Train the model for one epoch. + """Train the model for one epoch. Args: epoch (int): The current epoch number. @@ -102,7 +95,6 @@ def train_one_epoch( Returns: dict: Dictionary containing training metrics for the epoch. """ - batches_per_epoch = conf['trainer']['batches_per_epoch'] grad_accum_every = conf['trainer']['grad_accum_every'] amp = conf['trainer']['amp'] @@ -212,8 +204,7 @@ def validate( metrics, transform=None ): - """ - Validate the model on the validation dataset. + """Validate the model on the validation dataset. Args: epoch (int): The current epoch number. @@ -226,7 +217,6 @@ def validate( Returns: dict: Dictionary containing validation metrics for the epoch. 
""" - self.model.eval() valid_batches_per_epoch = conf['trainer']['valid_batches_per_epoch'] @@ -308,8 +298,7 @@ def predict( transform=None, split=None ): - """ - Make predictions with the model on the test dataset. + """Make predictions with the model on the test dataset. Args: conf (dict): Configuration dictionary containing prediction settings. @@ -324,7 +313,6 @@ def predict( - predictions (torch.Tensor): The model's predictions on the test dataset. - metrics (dict): Dictionary containing evaluation metrics. """ - self.model.eval() distributed = True if conf["trainer"]["mode"] in ["fsdp", "ddp"] else False @@ -416,8 +404,7 @@ def fit( transform=None, trial=False ): - """ - Train and validate the model. + """Train and validate the model. Args: conf (dict): Configuration dictionary containing training and validation settings. @@ -435,7 +422,6 @@ def fit( Returns: dict: Dictionary containing training and validation metrics. """ - save_loc = conf['save_loc'] start_epoch = conf['trainer']['start_epoch'] epochs = conf['trainer']['epochs'] diff --git a/pyproject.toml b/pyproject.toml index efc9996..2b1be53 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,3 +53,6 @@ packages = ["mlguess", "mlguess.keras", "mlguess.torch"] [tool.setuptools.dynamic] version = {file = "mlguess/VERSION"} readme = {file = ["README.md"]} + +[tool.ruff] +pydocstyle.convention = "google"