From add9eaa9287a9a9b69acc4138751f6b18cba5721 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 13 Aug 2024 20:07:36 +0200 Subject: [PATCH 01/63] refactor: DeepGP --- .../bayesian_optimization/models/deepGP.py | 634 ------------------ neps/optimizers/models/__init__.py | 0 neps/optimizers/models/deepGP.py | 572 ++++++++++++++++ 3 files changed, 572 insertions(+), 634 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/models/deepGP.py create mode 100644 neps/optimizers/models/__init__.py create mode 100644 neps/optimizers/models/deepGP.py diff --git a/neps/optimizers/bayesian_optimization/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py deleted file mode 100644 index d5145043..00000000 --- a/neps/optimizers/bayesian_optimization/models/deepGP.py +++ /dev/null @@ -1,634 +0,0 @@ -from __future__ import annotations - -import logging -import os -from copy import deepcopy -from pathlib import Path - -import gpytorch -import numpy as np -import torch -import torch.nn as nn - -from ....search_spaces.search_space import ( - CategoricalParameter, - FloatParameter, - IntegerParameter, - SearchSpace, -) - - -def count_non_improvement_steps(root_directory: Path | str) -> int: - root_directory = Path(root_directory) - - all_losses_file = root_directory / "all_losses_and_configs.txt" - best_loss_fiel = root_directory / "best_loss_trajectory.txt" - - # Read all losses from the file in the order they are explored - losses = [ - float(line[6:]) - for line in all_losses_file.read_text(encoding="utf-8").splitlines() - if "Loss: " in line - ] - # Get the best seen loss value - best_loss = float(best_loss_fiel.read_text(encoding="utf-8").splitlines()[-1].strip()) - - # Count the non-improvement - count = 0 - for loss in reversed(losses): - if np.greater(loss, best_loss): - count += 1 - else: - break - - return count - - -class NeuralFeatureExtractor(nn.Module): - """ - Neural network to be used in the DeepGP - """ - - def __init__(self, input_size: int, **kwargs): - super().__init__() - - # Set number of hyperparameters - self.input_size = input_size - - self.n_layers = kwargs.get("n_layers", 2) - self.activation = nn.LeakyReLU() - - layer1_units = kwargs.get("layer1_units", 128) - self.fc1 = nn.Linear(input_size, layer1_units) - self.bn1 = nn.BatchNorm1d(layer1_units) - - previous_layer_units = layer1_units - for i in range(2, self.n_layers): - next_layer_units = kwargs.get(f"layer{i}_units", 256) - setattr( - self, - f"fc{i}", - nn.Linear(previous_layer_units, next_layer_units), - ) - setattr( - self, - f"bn{i}", - nn.BatchNorm1d(next_layer_units), - ) - previous_layer_units = next_layer_units - - setattr( - self, - f"fc{self.n_layers}", - nn.Linear( - previous_layer_units + kwargs.get("cnn_nr_channels", 4), - # accounting for the learning curve features - kwargs.get(f"layer{self.n_layers}_units", 256), - ), - ) - self.cnn = nn.Sequential( - nn.Conv1d( - in_channels=1, - kernel_size=(kwargs.get("cnn_kernel_size", 3),), - out_channels=4, - ), - nn.AdaptiveMaxPool1d(1), - ) - - def forward(self, x, budgets, learning_curves): - # add an extra dimensionality for the budget - # making it nr_rows x 1. 
- budgets = torch.unsqueeze(budgets, dim=1) - # concatenate budgets with examples - x = torch.cat((x, budgets), dim=1) - x = self.fc1(x) - x = self.activation(self.bn1(x)) - - for i in range(2, self.n_layers): - x = self.activation(getattr(self, f"bn{i}")(getattr(self, f"fc{i}")(x))) - - # add an extra dimensionality for the learning curve - # making it nr_rows x 1 x lc_values. - learning_curves = torch.unsqueeze(learning_curves, 1) - lc_features = self.cnn(learning_curves) - # revert the output from the cnn into nr_rows x nr_kernels. - lc_features = torch.squeeze(lc_features, 2) - - # put learning curve features into the last layer along with the higher level features. - x = torch.cat((x, lc_features), dim=1) - x = self.activation(getattr(self, f"fc{self.n_layers}")(x)) - - return x - - -class GPRegressionModel(gpytorch.models.ExactGP): - """ - A simple GP model. - """ - - def __init__( - self, - train_x: torch.Tensor, - train_y: torch.Tensor, - likelihood: gpytorch.likelihoods.GaussianLikelihood, - ): - """ - Constructor of the GPRegressionModel. - - Args: - train_x: The initial train examples for the GP. - train_y: The initial train labels for the GP. - likelihood: The likelihood to be used. - """ - super().__init__(train_x, train_y, likelihood) - - self.mean_module = gpytorch.means.ConstantMean() - self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()) - - def forward(self, x): - mean_x = self.mean_module(x) - covar_x = self.covar_module(x) - - return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) - - -class DeepGP: - """ - Gaussian process with a deep kernel - """ - - def __init__( - self, - pipeline_space: SearchSpace, - neural_network_args: dict | None = None, - logger=None, - surrogate_model_fit_args: dict | None = None, - # IMPORTANT: Checkpointing does not use file locking, - # IMPORTANT: hence, it is not suitable for multiprocessing settings - checkpointing: bool = False, - root_directory: Path | str | None = None, - checkpoint_file: Path | str = "surrogate_checkpoint.pth", - refine_epochs: int = 50, - **kwargs, - ): - self.surrogate_model_fit_args = ( - surrogate_model_fit_args if surrogate_model_fit_args is not None else {} - ) - - self.checkpointing = checkpointing - self.refine_epochs = refine_epochs - if checkpointing: - assert ( - root_directory is not None - ), "neps root_directory must be provided for the checkpointing" - self.root_dir = Path(os.getcwd(), root_directory) - self.checkpoint_path = Path(os.getcwd(), root_directory, checkpoint_file) - - super().__init__() - self.__preprocess_search_space(pipeline_space) - # set the categories array for the encoder - self.categories_array = np.array(self.categories) - - if neural_network_args is None: - neural_network_args = {} - self.nn_args = neural_network_args - - self.device = ( - torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - ) - # self.device = torch.device("cpu") - - # Save the NN args, necessary for preprocessing - self.cnn_kernel_size = neural_network_args.get("cnn_kernel_size", 3) - self.model, self.likelihood, self.mll = self.__initialize_gp_model( - neural_network_args.get("n_layers", 2) - ) - - # build the neural network - self.nn = NeuralFeatureExtractor(self.input_size, **neural_network_args) - - self.logger = logger or logging.getLogger("neps") - - def __initialize_gp_model( - self, - train_size: int, - ) -> tuple[ - GPRegressionModel, - gpytorch.likelihoods.GaussianLikelihood, - gpytorch.mlls.ExactMarginalLogLikelihood, - ]: - """ - Called 
when the surrogate is first initialized or restarted. - - Args: - train_size: The size of the current training set. - - Returns: - model, likelihood, mll - The GP model, the likelihood and - the marginal likelihood. - """ - train_x = torch.ones(train_size, train_size).to(self.device) - train_y = torch.ones(train_size).to(self.device) - - likelihood = gpytorch.likelihoods.GaussianLikelihood().to(self.device) - model = GPRegressionModel( - train_x=train_x, train_y=train_y, likelihood=likelihood - ).to(self.device) - mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model).to(self.device) - return model, likelihood, mll - - def __preprocess_search_space(self, pipeline_space: SearchSpace): - self.categories = [] - self.categorical_hps = [] - - parameter_count = 0 - for hp_name, hp in pipeline_space.items(): - # Collect all categories in a list for the encoder - if isinstance(hp, CategoricalParameter): - self.categorical_hps.append(hp_name) - self.categories.extend(hp.choices) - parameter_count += len(hp.choices) - else: - parameter_count += 1 - - # add 1 for budget - self.input_size = parameter_count - self.continuous_params_size = self.input_size - len(self.categories) - self.min_fidelity = pipeline_space.fidelity.lower - self.max_fidelity = pipeline_space.fidelity.upper - - def __encode_config(self, config: SearchSpace): - categorical_encoding = np.zeros_like(self.categories_array) - continuous_values = [] - - for hp_name, hp in config.items(): - if hp.is_fidelity: - continue # Ignore fidelity - if hp_name in self.categorical_hps: - label = hp.value - categorical_encoding[np.argwhere(self.categories_array == label)] = 1 - else: - continuous_values.append(hp.value_to_normalized(hp.value)) - - continuous_encoding = np.array(continuous_values) - - encoding = np.concatenate([categorical_encoding, continuous_encoding]) - return encoding - - def __extract_budgets( - self, x_train: list[SearchSpace], normalized: bool = True - ) -> np.ndarray: - budgets = np.array([config.fidelity.value for config in x_train], dtype=np.single) - if normalized: - normalized_budgets = (budgets - self.min_fidelity) / ( - self.max_fidelity - self.min_fidelity - ) - budgets = normalized_budgets - return budgets - - def __preprocess_learning_curves( - self, learning_curves: list[list[float]], padding_value: float = 0.0 - ) -> np.ndarray: - # Add padding to the learning curves to make them the same size - - # Get max learning_curve length - max_length = 0 - for lc in learning_curves: - length = len(lc) - if length > max_length: - max_length = length - - for lc in learning_curves: - # add padding to the learning curve to fit the cnn kernel or - # the max_length depending on which is the largest - padding_length = max([max_length - len(lc), self.cnn_kernel_size - len(lc)]) - lc.extend([padding_value] * padding_length) - - # TODO: check if the lc values are within bounds [0, 1] (karibbov) - # TODO: add normalize_lcs option in the future - - return np.array(learning_curves, dtype=np.single) - - def __reset_xy( - self, - x_train: list[SearchSpace], - y_train: list[float], - learning_curves: list[list[float]], - normalize_y: bool = False, - normalize_budget: bool = True, - ): - self.normalize_budget = normalize_budget - self.normalize_y = normalize_y - - x_train, train_budgets, learning_curves = self._preprocess_input( - x_train, learning_curves, normalize_budget - ) - - y_train = self._preprocess_y(y_train, normalize_y) - - self.x_train = x_train - self.train_budgets = train_budgets - self.learning_curves = 
learning_curves - self.y_train = y_train - - def _preprocess_input( - self, - x: list[SearchSpace], - learning_curves: list[list[float]], - normalize_budget: bool = True, - ): - budgets = self.__extract_budgets(x, normalize_budget) - learning_curves = self.__preprocess_learning_curves(learning_curves) - - x = np.array([self.__encode_config(config) for config in x], dtype=np.single) - - x = torch.tensor(x).to(device=self.device) - budgets = torch.tensor(budgets).to(device=self.device) - learning_curves = torch.tensor(learning_curves).to(device=self.device) - - return x, budgets, learning_curves - - def _preprocess_y(self, y_train: list[float], normalize_y: bool = False): - y_train_array = np.array(y_train, dtype=np.single) - self.min_y = y_train_array.min() - self.max_y = y_train_array.max() - if normalize_y: - y_train_array = (y_train_array - self.min_y) / (self.max_y - self.min_y) - y_train_array = torch.tensor(y_train_array).to(device=self.device) - return y_train_array - - def fit( - self, - x_train: list[SearchSpace], - y_train: list[float], - learning_curves: list[list[float]], - ): - self._fit(x_train, y_train, learning_curves, **self.surrogate_model_fit_args) - - def _fit( - self, - x_train: list[SearchSpace], - y_train: list[float], - learning_curves: list[list[float]], - normalize_y: bool = False, - normalize_budget: bool = True, - n_epochs: int = 1000, - batch_size: int = 64, - optimizer_args: dict | None = None, - early_stopping: bool = True, - patience: int = 10, - perf_patience: int = 10, - ): - self.__reset_xy( - x_train, - y_train, - learning_curves, - normalize_y=normalize_y, - normalize_budget=normalize_budget, - ) - self.model, self.likelihood, self.mll = self.__initialize_gp_model(len(y_train)) - self.nn = NeuralFeatureExtractor(self.input_size, **self.nn_args) - self.model.to(self.device) - self.likelihood.to(self.device) - self.nn.to(self.device) - - if self.checkpointing and self.checkpoint_path.exists(): - non_improvement_steps = count_non_improvement_steps(self.root_dir) - # If checkpointing and patience is not exhausted load a partial model - if non_improvement_steps < perf_patience: - n_epochs = self.refine_epochs - self.load_checkpoint() - self.logger.debug(f"No improvement for: {non_improvement_steps} evaulations") - self.logger.debug(f"N Epochs for the full training: {n_epochs}") - - initial_state = self.get_state() - try: - self.__train_model( - self.x_train, - self.train_budgets, - self.learning_curves, - self.y_train, - n_epochs=n_epochs, - batch_size=batch_size, - optimizer_args=optimizer_args, - early_stopping=early_stopping, - patience=patience, - ) - if self.checkpointing: - self.save_checkpoint() - except gpytorch.utils.errors.NotPSDError: - self.logger.info("Model training failed loading the untrained model") - self.load_checkpoint(initial_state) - # Delete checkpoint to restart training - self.delete_checkpoint() - - def __train_model( - self, - x_train: torch.Tensor, - train_budgets: torch.Tensor, - learning_curves: torch.Tensor, - y_train: torch.Tensor, - n_epochs: int = 1000, - batch_size: int = 64, - optimizer_args: dict | None = None, - early_stopping: bool = True, - patience: int = 10, - ): - if optimizer_args is None: - optimizer_args = {"lr": 0.001} - - self.model.train() - self.likelihood.train() - self.nn.train() - self.optimizer = torch.optim.Adam( - [ - dict({"params": self.model.parameters()}, **optimizer_args), - dict({"params": self.nn.parameters()}, **optimizer_args), - ] - ) - - count_down = patience - min_avg_loss_val = np.inf - 
average_loss: float = 0.0 - - for epoch_nr in range(0, n_epochs): - if early_stopping and count_down == 0: - self.logger.info( - f"Epoch: {epoch_nr - 1} surrogate training stops due to early " - f"stopping with the patience: {patience} and " - f"the minimum average loss of {min_avg_loss_val} and " - f"the final average loss of {average_loss}" - ) - break - - n_examples_batch = x_train.size(dim=0) - - # get a random permutation for mini-batches - permutation = torch.randperm(n_examples_batch) - - # optimize over mini-batches - total_scaled_loss = 0.0 - for batch_idx, start_index in enumerate( - range(0, n_examples_batch, batch_size) - ): - end_index = start_index + batch_size - if end_index > n_examples_batch: - end_index = n_examples_batch - indices = permutation[start_index:end_index] - batch_x, batch_budget, batch_lc, batch_y = ( - x_train[indices], - train_budgets[indices], - learning_curves[indices], - y_train[indices], - ) - - minibatch_size = end_index - start_index - # if only one example in the batch, skip the batch. - # Otherwise, the code will fail because of batchnorm - if minibatch_size <= 1: - continue - - # Zero backprop gradients - self.optimizer.zero_grad() - - projected_x = self.nn(batch_x, batch_budget, batch_lc) - self.model.set_train_data(projected_x, batch_y, strict=False) - output = self.model(projected_x) - - # try: - # Calc loss and backprop derivatives - loss = -self.mll(output, self.model.train_targets) - episodic_loss_value: float = loss.detach().to("cpu").item() - # weighted sum over losses in the batch - total_scaled_loss = ( - total_scaled_loss + episodic_loss_value * minibatch_size - ) - - mse = gpytorch.metrics.mean_squared_error( - output, self.model.train_targets - ) - self.logger.debug( - f"Epoch {epoch_nr} Batch {batch_idx} - MSE {mse:.5f}, " - f"Loss: {episodic_loss_value:.3f}, " - f"lengthscale: {self.model.covar_module.base_kernel.lengthscale.item():.3f}, " - f"noise: {self.model.likelihood.noise.item():.3f}, " - ) - - loss.backward() - self.optimizer.step() - - # Get average weighted loss over every batch - average_loss = total_scaled_loss / n_examples_batch - if average_loss < min_avg_loss_val: - min_avg_loss_val = average_loss - count_down = patience - elif early_stopping: - self.logger.debug( - f"No improvement over the minimum loss value of {min_avg_loss_val} " - f"for the past {patience - count_down} epochs " - f"the training will stop in {count_down} epochs" - ) - count_down -= 1 - # except Exception as training_error: - # self.logger.error( - # f'The following error happened while training: {training_error}') - # # An error has happened, trigger the restart of the optimization and restart - # # the model with default hyperparameters. 
- # self.restart = True - # training_errored = True - # break - - def set_prediction_learning_curves(self, learning_curves: list[list[float]]): - self.prediction_learning_curves = learning_curves - - def predict( - self, x: list[SearchSpace], learning_curves: list[list[float]] | None = None - ): - # Preprocess input - if learning_curves is None: - learning_curves = self.prediction_learning_curves - x_test, test_budgets, learning_curves = self._preprocess_input( - x, learning_curves, self.normalize_budget - ) - - self.model.eval() - self.nn.eval() - self.likelihood.eval() - - with torch.no_grad(): - projected_train_x = self.nn( - self.x_train, self.train_budgets, self.learning_curves - ) - self.model.set_train_data( - inputs=projected_train_x, targets=self.y_train, strict=False - ) - - projected_test_x = self.nn(x_test, test_budgets, learning_curves) - - preds = self.likelihood(self.model(projected_test_x)) - - means = preds.mean.detach().cpu() - - if self.normalize_y: - means = (means + self.min_y) * (self.max_y - self.min_y) - - cov = torch.diag(torch.pow(preds.stddev.detach(), 2)).cpu() - - return means, cov - - def load_checkpoint(self, state: dict | None = None): - """ - Load the state from a previous checkpoint. - """ - if state is None: - checkpoint = torch.load(self.checkpoint_path) - else: - checkpoint = state - self.model.load_state_dict(checkpoint["gp_state_dict"]) - self.nn.load_state_dict(checkpoint["nn_state_dict"]) - self.likelihood.load_state_dict(checkpoint["likelihood_state_dict"]) - - self.model.to(self.device) - self.likelihood.to(self.device) - self.nn.to(self.device) - - def save_checkpoint(self, state: dict | None = None): - """ - Save the given state or the current state in a - checkpoint file. - - Args: - checkpoint_path: path to the checkpoint file - state: The state to save, if none, it will - save the current state. - """ - - if state is None: - torch.save( - self.get_state(), - self.checkpoint_path, - ) - else: - torch.save( - state, - self.checkpoint_path, - ) - - def get_state(self) -> dict[str, dict]: - """ - Get the current state of the surrogate. - - Returns: - current_state: A dictionary that represents - the current state of the surrogate model. 
- """ - current_state = { - "gp_state_dict": deepcopy(self.model.state_dict()), - "nn_state_dict": deepcopy(self.nn.state_dict()), - "likelihood_state_dict": deepcopy(self.likelihood.state_dict()), - } - - return current_state - - def delete_checkpoint(self): - self.checkpoint_path.unlink(missing_ok=True) diff --git a/neps/optimizers/models/__init__.py b/neps/optimizers/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neps/optimizers/models/deepGP.py b/neps/optimizers/models/deepGP.py new file mode 100644 index 00000000..e0c225e6 --- /dev/null +++ b/neps/optimizers/models/deepGP.py @@ -0,0 +1,572 @@ +from __future__ import annotations +from dataclasses import dataclass, field + +import logging +import os +from copy import deepcopy +from pathlib import Path + +import gpytorch +import numpy as np +import torch +import torch.nn as nn +from neps.search_spaces.architecture.graph_grammar import GraphParameter + +from neps.exceptions import SurrogateFailedToFit + +from neps.search_spaces.search_space import ( + CategoricalParameter, + FloatParameter, + IntegerParameter, + SearchSpace, +) + +logger = logging.getLogger(__name__) + + +def count_non_improvement_steps(root_directory: Path | str) -> int: + root_directory = Path(root_directory) + + all_losses_file = root_directory / "all_losses_and_configs.txt" + best_loss_fiel = root_directory / "best_loss_trajectory.txt" + + # Read all losses from the file in the order they are explored + losses = [ + float(line[6:]) + for line in all_losses_file.read_text(encoding="utf-8").splitlines() + if "Loss: " in line + ] + # Get the best seen loss value + best_loss = float(best_loss_fiel.read_text(encoding="utf-8").splitlines()[-1].strip()) + + # Count the non-improvement + count = 0 + for loss in reversed(losses): + if np.greater(loss, best_loss): + count += 1 + else: + break + + return count + + +class NeuralFeatureExtractor(nn.Module): + """ + Neural network to be used in the DeepGP + """ + + def __init__(self, input_size: int, **kwargs): + super().__init__() + + # Set number of hyperparameters + self.input_size = input_size + + self.n_layers = kwargs.get("n_layers", 2) + self.activation = nn.LeakyReLU() + + layer1_units = kwargs.get("layer1_units", 128) + self.fc1 = nn.Linear(input_size, layer1_units) + self.bn1 = nn.BatchNorm1d(layer1_units) + + previous_layer_units = layer1_units + for i in range(2, self.n_layers): + next_layer_units = kwargs.get(f"layer{i}_units", 256) + setattr( + self, + f"fc{i}", + nn.Linear(previous_layer_units, next_layer_units), + ) + setattr( + self, + f"bn{i}", + nn.BatchNorm1d(next_layer_units), + ) + previous_layer_units = next_layer_units + + setattr( + self, + f"fc{self.n_layers}", + nn.Linear( + previous_layer_units + kwargs.get("cnn_nr_channels", 4), + # accounting for the learning curve features + kwargs.get(f"layer{self.n_layers}_units", 256), + ), + ) + self.cnn = nn.Sequential( + nn.Conv1d( + in_channels=1, + kernel_size=(kwargs.get("cnn_kernel_size", 3),), + out_channels=4, + ), + nn.AdaptiveMaxPool1d(1), + ) + + def forward(self, x, budgets, learning_curves): + # add an extra dimensionality for the budget + # making it nr_rows x 1. 
+ budgets = torch.unsqueeze(budgets, dim=1) + # concatenate budgets with examples + x = torch.cat((x, budgets), dim=1) + x = self.fc1(x) + x = self.activation(self.bn1(x)) + + for i in range(2, self.n_layers): + x = self.activation(getattr(self, f"bn{i}")(getattr(self, f"fc{i}")(x))) + + # add an extra dimensionality for the learning curve + # making it nr_rows x 1 x lc_values. + learning_curves = torch.unsqueeze(learning_curves, 1) + lc_features = self.cnn(learning_curves) + # revert the output from the cnn into nr_rows x nr_kernels. + lc_features = torch.squeeze(lc_features, 2) + + # put learning curve features into the last layer along with the higher level features. + x = torch.cat((x, lc_features), dim=1) + x = self.activation(getattr(self, f"fc{self.n_layers}")(x)) + + return x + + +class GPRegressionModel(gpytorch.models.ExactGP): + """ + A simple GP model. + """ + + def __init__( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + likelihood: gpytorch.likelihoods.GaussianLikelihood, + ): + """ + Constructor of the GPRegressionModel. + + Args: + train_x: The initial train examples for the GP. + train_y: The initial train labels for the GP. + likelihood: The likelihood to be used. + """ + super().__init__(train_x, train_y, likelihood) + + self.mean_module = gpytorch.means.ConstantMean() + self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + + +@dataclass +class DeepGPDataTransformer: + # TODO: This class could be used for other models as well + space: SearchSpace + fidelity_bounds: tuple[int | float, int | float] | None + normalize_y: bool + min_learning_curve_length: int + learning_curve_pad_value: float + device: torch.device + + numericals: dict[str, FloatParameter | IntegerParameter] = field(init=False) + categoricals: dict[str, CategoricalParameter] = field(init=False) + output_dim: int = field(init=False) + + def __post_init__(self) -> None: + self.numericals = { + name: h + for name, h in self.space.items() + if isinstance(h, (FloatParameter, IntegerParameter)) + } + self.categoricals = { + name: h + for name, h in self.space.items() + if isinstance(h, CategoricalParameter) + } + self.output_dim = len(self.numericals) + sum( + len(c.choices) for c in self.categoricals.values() + ) + + def encode_configs( + self, + configs: list[SearchSpace], + ) -> tuple[torch.Tensor, torch.Tensor]: + x_buffer = torch.empty( + (len(configs), self.output_dim), + device=self.device, + dtype=torch.float32, + ) + + # Normals are just fill the columns with the normalized values + for i, (hp_name, hp) in enumerate(self.numericals.items()): + budget_tensor = torch.tensor( + [config[hp_name].value for config in configs], + device=self.device, + dtype=torch.float32, + ) + + x_buffer[:, i] = (budget_tensor - hp.lower) / (hp.upper - hp.lower) + + # Categoricals is a bit harder, we create a tensor with all the indices (values) + # as we did above, but then we sub-select the portion of the buffer for that categorical + # before inserting the one-hot encoding. + offset = len(self.numericals) + for hp_name, hp in self.categoricals.items(): + budget_tensor = torch.tensor( + [config[hp_name]._value_index for config in configs], # type: ignore + device=self.device, + dtype=torch.float64, + ) + + # .. 
and insert one-hot encoding (ChatGPT solution, verified locally) + portion = x_buffer[:, offset : offset + len(hp.choices)] + portion.scatter_(1, budget_tensor.unsqueeze(1), 1) + + offset += len(hp.choices) + + # Finally, ... budgets + budgets = [config.fidelity.value for config in configs] # type: ignore + budget_tensor = torch.tensor(budgets, device=self.device, dtype=torch.float32) + if self.fidelity_bounds: + assert self.space.fidelity is not None + _min = self.space.fidelity.lower + _max = self.space.fidelity.upper + budget_tensor.sub_(_min).div_(_max - _min) + + return x_buffer, budget_tensor + + def encode_learning_curves(self, learning_curves: list[list[float]]) -> torch.Tensor: + lc_height = len(learning_curves) + lc_width = max( + max(len(lc) for lc in learning_curves), self.min_learning_curve_length + ) + lc_buffer = torch.full( + (lc_width, lc_height), + self.learning_curve_pad_value, + device=self.device, + dtype=torch.float32, + ) + + for i, lc in enumerate(learning_curves): + lc_buffer[: len(lc), i] = torch.tensor( + lc, device=self.device, dtype=torch.float32 + ) + + return lc_buffer + + def encode_y( + self, y: list[float] + ) -> tuple[torch.Tensor, None | tuple[int | float, int | float]]: + tensor = torch.tensor(y, device=self.device, dtype=torch.float32) + if self.fidelity_bounds: + _min, _max = tensor.min(), tensor.max() + tensor.sub_(_min).div_(_max - _min) + bounds = (_min.detach().item(), _max.detach().item()) + else: + bounds = None + + return tensor, bounds + + +def _train_model( + x_train: torch.Tensor, + train_budgets: torch.Tensor, + learning_curves: torch.Tensor, + model: GPRegressionModel, + likelihood: gpytorch.likelihoods.GaussianLikelihood, + device: torch.device, + nn: NeuralFeatureExtractor, + y_train: torch.Tensor, + n_epochs: int = 1000, + batch_size: int = 64, + optimizer_args: dict | None = None, + early_stopping: bool = True, + patience: int = 10, +): + if optimizer_args is None: + optimizer_args = {"lr": 0.001} + + mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model).to(device) + + # Set to training mode + mll.train() + model.train() + likelihood.train() + nn.train() + + optimizer = torch.optim.Adam( + [ + dict({"params": model.parameters()}, **optimizer_args), + dict({"params": nn.parameters()}, **optimizer_args), + ] + ) + + count_down = patience + min_avg_loss_val = np.inf + average_loss: float = 0.0 + + for epoch_nr in range(0, n_epochs): + if early_stopping and count_down == 0: + logger.info( + f"Epoch: {epoch_nr - 1} surrogate training stops due to early " + f"stopping with the patience: {patience} and " + f"the minimum average loss of {min_avg_loss_val} and " + f"the final average loss of {average_loss}" + ) + break + + n_examples_batch = x_train.size(dim=0) + + # get a random permutation for mini-batches + permutation = torch.randperm(n_examples_batch) + + # optimize over mini-batches + total_scaled_loss = 0.0 + for batch_idx, start_index in enumerate(range(0, n_examples_batch, batch_size)): + end_index = min(start_index + batch_size, n_examples_batch) + minibatch_size = end_index - start_index + 1 + + # if only one example in the batch, skip the batch. 
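+            # (A single sample gives BatchNorm1d nothing to estimate
+            # batch statistics from.)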
+ # Otherwise, the code will fail because of batchnorm + if minibatch_size <= 1: + continue + + indices = permutation[start_index:end_index] + + batch_x, batch_budget, batch_lc, batch_y = ( + x_train[indices], + train_budgets[indices], + learning_curves[indices], + y_train[indices], + ) + + # Zero backprop gradients + optimizer.zero_grad() + + projected_x = nn(batch_x, batch_budget, batch_lc) + model.set_train_data(projected_x, batch_y, strict=False) + output = model(projected_x) + + # Calc loss and backprop derivatives + loss = -mll(output, model.train_targets) # type: ignore + episodic_loss_value: float = loss.detach().to("cpu").item() + # weighted sum over losses in the batch + total_scaled_loss = total_scaled_loss + episodic_loss_value * minibatch_size + + mse = gpytorch.metrics.mean_squared_error(output, model.train_targets) + logger.debug( + f"Epoch {epoch_nr} Batch {batch_idx} - MSE {mse:.5f}, " + f"Loss: {episodic_loss_value:.3f}, " + f"lengthscale: {model.covar_module.base_kernel.lengthscale.item():.3f}, " + f"noise: {model.likelihood.noise.item():.3f}, " # type: ignore + ) + + loss.backward() + optimizer.step() + + # Get average weighted loss over every batch + average_loss = total_scaled_loss / n_examples_batch + if average_loss < min_avg_loss_val: + min_avg_loss_val = average_loss + count_down = patience + elif early_stopping: + logger.debug( + f"No improvement over the minimum loss value of {min_avg_loss_val} " + f"for the past {patience - count_down} epochs " + f"the training will stop in {count_down} epochs" + ) + count_down -= 1 + + +@dataclass +class DeepGP: + """Gaussian process with a deep kernel.""" + + # Required + pipeline_space: SearchSpace + + # Optional + learning_curve_pad_value: float = 0.0 + root_directory: Path | None = None + # IMPORTANT: Checkpointing does not use file locking + # IMPORTANT: hence it is not suitable for multiprocessing settings + checkpoint_file: Path | str = "surrogate_checkpoint.pth" + checkpointing: bool = False + early_stopping: bool = True + batch_size: int = 64 + n_epochs: int = 1000 + patience: int = 10 + refine_epochs: int = 50 + perf_patience: int = 10 + device: torch.device = field( + default_factory=lambda: torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) + normalize_budget: bool = True + normalize_y: bool = True + neural_network_args: dict = field(default_factory=dict) + surrogate_model_fit_args: dict = field(default_factory=dict) + optimizer_args: dict = field(default_factory=dict) + + # Created from the above arguments + # TODO: Lift this out of DeepGP and let the optimizer worry about pre-processing + preprocessor: DeepGPDataTransformer = field(init=False) + + # Post fit parameters, following scikit-learn convention of appending an underscore + model_: GPRegressionModel | None = field(init=False) + likelihood_: gpytorch.likelihoods.GaussianLikelihood | None = field(init=False) + nn_: NeuralFeatureExtractor | None = field(init=False) + projected_x_train_: torch.Tensor | None = field(init=False) + y_train_: torch.Tensor | None = field(init=False) + y_bounds_: tuple[float, float] | None = field(init=False) + + def __post_init__(self): + if any(isinstance(h, GraphParameter) for h in self.pipeline_space.values()): + raise ValueError("Graph parameters are not supported for DeepGP") + + if self.normalize_budget: + budget_bounds = (pipeline_space.fidelity.lower, pipeline_space.fidelity.upper) # type: ignore + else: + budget_bounds = None + + if self.checkpointing: + assert ( + self.root_directory is 
not None + ), "neps root_directory must be provided for the checkpointing" + self.checkpoint_path = self.root_directory / self.checkpoint_file + + self.preprocessor = DeepGPDataTransformer( + space=self.pipeline_space, + fidelity_bounds=budget_bounds, + normalize_y=self.normalize_y, + min_learning_curve_length=self.neural_network_args.get("cnn_kernel_size", 3), + learning_curve_pad_value=self.learning_curve_pad_value, + device=self.device, + ) + self.model_ = None + self.likelihood_ = None + self.nn_ = None + + def fit( + self, + x_train: list[SearchSpace], + y_train: list[float], + learning_curves: list[list[float]], + ): + x_, train_budget = self.preprocessor.encode_configs(x_train) + curves = self.preprocessor.encode_learning_curves(learning_curves) + y_, y_bounds = self.preprocessor.encode_y(y_train) + + # Required for predictions later + self.y_train_ = y_ + self.y_bounds_ = y_bounds + + input_dim = x_.shape[1] + + # Initial state + likelihood = gpytorch.likelihoods.GaussianLikelihood().to(self.device) + model = GPRegressionModel(train_x=x_, train_y=y_, likelihood=likelihood).to( + self.device + ) + nn = NeuralFeatureExtractor(input_dim, **self.neural_network_args).to(self.device) + + # If checkpointing and we are improving, load existing model + if self.checkpointing and self.checkpoint_path.exists(): + assert self.root_directory is not None + + non_improvement_steps = count_non_improvement_steps(self.root_directory) + if non_improvement_steps < self.perf_patience: + n_epochs = self.refine_epochs + + checkpoint = torch.load(self.checkpoint_path) + model.load_state_dict(checkpoint["gp_state_dict"]) + nn.load_state_dict(checkpoint["nn_state_dict"]) + likelihood.load_state_dict(checkpoint["likelihood_state_dict"]) + else: + n_epochs = self.n_epochs + logger.debug(f"No improvement for: {non_improvement_steps} evaulations") + else: + # Starting from scratch + n_epochs = self.n_epochs + + logger.debug(f"N Epochs for the full training: {self.n_epochs}") + + try: + _train_model( + x_train=x_, + train_budgets=train_budget, + learning_curves=curves, + y_train=y_, + model=model, + likelihood=likelihood, + nn=nn, + n_epochs=n_epochs, + device=self.device, + batch_size=self.batch_size, + optimizer_args=self.optimizer_args, + early_stopping=self.early_stopping, + patience=self.patience, + ) + self.model_ = model + self.likelihood_ = likelihood + self.nn_ = nn + + nn.eval() + # Cheaper to do this once during fit, rather than on each call to predict + self.projected_x_train_ = nn(x_, train_budget, curves) + + if self.checkpointing: + torch.save( + { + "gp_state_dict": deepcopy(model).state_dict(), + "nn_state_dict": deepcopy(nn).state_dict(), + "likelihood_state_dict": deepcopy(likelihood.state_dict()), + }, + self.checkpoint_path, + ) + except gpytorch.utils.errors.NotPSDError as e: + logger.error( + "Model training failed loading the untrained model", exc_info=True + ) + # Delete checkpoint to restart training + self.checkpoint_path.unlink(missing_ok=True) + raise SurrogateFailedToFit("DeepGP Failed to fit the training data!") from e + + def predict( + self, x: list[SearchSpace], learning_curves: list[list[float]] + ) -> tuple[torch.Tensor, torch.Tensor]: + assert self.model_ is not None, "Please fit the model first" + assert self.nn_ is not None, "Please fit the model first" + assert self.likelihood_ is not None, "Please fit the model first" + assert self.projected_x_train_ is not None, "Please fit the model first" + assert self.y_train_ is not None, "Please fit the model first" + assert 
self.y_bounds_ is not None, "Please fit the model first" + + self.model_.eval() + self.nn_.eval() + self.likelihood_.eval() + + x_test, test_budgets = self.preprocessor.encode_configs(x) + _curves = self.preprocessor.encode_learning_curves(learning_curves) + + with torch.no_grad(): + # Set GP prior + self.model_.set_train_data( + inputs=self.projected_x_train_, + targets=self.y_train_, + strict=False, + ) + + projected_test_x = self.nn_(x_test, test_budgets, _curves) + preds = self.likelihood_(self.model_(projected_test_x)) + + means = preds.mean.detach().cpu() + + if self.normalize_y: + _min, _max = self.y_bounds_ + means = (means + _min) * (_max - _min) + + cov = torch.diag(torch.pow(preds.stddev.detach(), 2)).cpu() + + return means, cov From 3aba212c233fe73fc37db18142adae67dcaef20a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 14 Aug 2024 14:56:47 +0200 Subject: [PATCH 02/63] fix: Address comements from @karibbov --- neps/exceptions.py | 4 +++ neps/optimizers/models/deepGP.py | 44 ++++++++++++++------------------ 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/neps/exceptions.py b/neps/exceptions.py index 7054d7c6..edf0232c 100644 --- a/neps/exceptions.py +++ b/neps/exceptions.py @@ -52,3 +52,7 @@ class WorkerRaiseError(NePSError): Includes additional information on how to recover """ + + +class SurrogateFailedToFitError(NePSError): + """Raised when a surrogate model fails to fit.""" diff --git a/neps/optimizers/models/deepGP.py b/neps/optimizers/models/deepGP.py index e0c225e6..a94988ab 100644 --- a/neps/optimizers/models/deepGP.py +++ b/neps/optimizers/models/deepGP.py @@ -176,7 +176,7 @@ def __post_init__(self) -> None: self.numericals = { name: h for name, h in self.space.items() - if isinstance(h, (FloatParameter, IntegerParameter)) + if isinstance(h, (FloatParameter, IntegerParameter)) and not h.is_fidelity } self.categoricals = { name: h @@ -254,18 +254,8 @@ def encode_learning_curves(self, learning_curves: list[list[float]]) -> torch.Te return lc_buffer - def encode_y( - self, y: list[float] - ) -> tuple[torch.Tensor, None | tuple[int | float, int | float]]: - tensor = torch.tensor(y, device=self.device, dtype=torch.float32) - if self.fidelity_bounds: - _min, _max = tensor.min(), tensor.max() - tensor.sub_(_min).div_(_max - _min) - bounds = (_min.detach().item(), _max.detach().item()) - else: - bounds = None - - return tensor, bounds + def encode_y(self, y: list[float]) -> torch.Tensor: + return torch.tensor(y, device=self.device, dtype=torch.float32) def _train_model( @@ -397,7 +387,8 @@ class DeepGP: n_epochs: int = 1000 patience: int = 10 refine_epochs: int = 50 - perf_patience: int = 10 + perf_patience_factor: float = 1.2 # X * max_fidelity + n_initial_full_trainings: int = 10 device: torch.device = field( default_factory=lambda: torch.device("cuda") if torch.cuda.is_available() @@ -412,6 +403,7 @@ class DeepGP: # Created from the above arguments # TODO: Lift this out of DeepGP and let the optimizer worry about pre-processing preprocessor: DeepGPDataTransformer = field(init=False) + max_fidelity: int | float = field(init=False) # Post fit parameters, following scikit-learn convention of appending an underscore model_: GPRegressionModel | None = field(init=False) @@ -419,7 +411,6 @@ class DeepGP: nn_: NeuralFeatureExtractor | None = field(init=False) projected_x_train_: torch.Tensor | None = field(init=False) y_train_: torch.Tensor | None = field(init=False) - y_bounds_: tuple[float, float] | None = field(init=False) def __post_init__(self): if 
any(isinstance(h, GraphParameter) for h in self.pipeline_space.values()): @@ -436,6 +427,7 @@ def __post_init__(self): ), "neps root_directory must be provided for the checkpointing" self.checkpoint_path = self.root_directory / self.checkpoint_file + self.max_fidelity = self.pipeline_space.fidelity.upper # type: ignore self.preprocessor = DeepGPDataTransformer( space=self.pipeline_space, fidelity_bounds=budget_bounds, @@ -447,6 +439,8 @@ def __post_init__(self): self.model_ = None self.likelihood_ = None self.nn_ = None + self.projected_x_train_ = None + self.y_train_ = None def fit( self, @@ -456,11 +450,10 @@ def fit( ): x_, train_budget = self.preprocessor.encode_configs(x_train) curves = self.preprocessor.encode_learning_curves(learning_curves) - y_, y_bounds = self.preprocessor.encode_y(y_train) + y_ = self.preprocessor.encode_y(y_train) # Required for predictions later self.y_train_ = y_ - self.y_bounds_ = y_bounds input_dim = x_.shape[1] @@ -476,7 +469,12 @@ def fit( assert self.root_directory is not None non_improvement_steps = count_non_improvement_steps(self.root_directory) - if non_improvement_steps < self.perf_patience: + + patience_steps = self.perf_patience_factor * self.max_fidelity + if ( + len(y_train) >= self.n_initial_full_trainings + and non_improvement_steps < patience_steps + ): n_epochs = self.refine_epochs checkpoint = torch.load(self.checkpoint_path) @@ -534,14 +532,15 @@ def fit( raise SurrogateFailedToFit("DeepGP Failed to fit the training data!") from e def predict( - self, x: list[SearchSpace], learning_curves: list[list[float]] + self, + x: list[SearchSpace], + learning_curves: list[list[float]], ) -> tuple[torch.Tensor, torch.Tensor]: assert self.model_ is not None, "Please fit the model first" assert self.nn_ is not None, "Please fit the model first" assert self.likelihood_ is not None, "Please fit the model first" assert self.projected_x_train_ is not None, "Please fit the model first" assert self.y_train_ is not None, "Please fit the model first" - assert self.y_bounds_ is not None, "Please fit the model first" self.model_.eval() self.nn_.eval() @@ -562,11 +561,6 @@ def predict( preds = self.likelihood_(self.model_(projected_test_x)) means = preds.mean.detach().cpu() - - if self.normalize_y: - _min, _max = self.y_bounds_ - means = (means + _min) * (_max - _min) - cov = torch.diag(torch.pow(preds.stddev.detach(), 2)).cpu() return means, cov From 40b6830d6914e988abf6b5fa7e96881a4bad5443 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 14 Aug 2024 15:03:46 +0200 Subject: [PATCH 03/63] fix: Import from the moved file --- neps/optimizers/bayesian_optimization/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index c76bedfd..6279e973 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -4,7 +4,7 @@ from .gp_hierarchy import ComprehensiveGPHierarchy try: - from .deepGP import DeepGP + from neps.optimizers.models.deepGP import DeepGP except ImportError as e: DeepGP = MissingDependencyError("gpytorch", e) From df4e089ce537c52cf96da379771686c5e9c89aa1 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 14 Aug 2024 18:40:59 +0200 Subject: [PATCH 04/63] refactor: Cleanup vectorial kernels --- .../kernels/combine_kernels.py | 20 +- .../kernels/combine_kernels_hierarchy.py | 3 - .../kernels/vectorial_kernels.py | 339 ++++---- 
.../models/deepGP.py | 1 - .../bayesian_optimization/models/gp.py | 749 ++++++------------ neps/optimizers/models/__init__.py | 0 neps/search_spaces/search_space.py | 21 +- 7 files changed, 434 insertions(+), 699 deletions(-) rename neps/optimizers/{ => bayesian_optimization}/models/deepGP.py (99%) delete mode 100644 neps/optimizers/models/__init__.py diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py index 0e464713..3aa320b5 100644 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py @@ -46,7 +46,7 @@ def fit_transform( rebuild_model: bool = True, save_gram_matrix: bool = True, gp_fit: bool = True, - feature_lengthscale: list = None, + feature_lengthscale: dict[str, torch.Tensor] | None = None, **kwargs, ): N = len(configs) @@ -147,16 +147,13 @@ def transform( return K.t() - def clamp_theta_vector(self, theta_vector): - if theta_vector is None: - return None + def clamp_theta_vector( + self, theta_vector: dict[str, torch.Tensor] + ) -> dict[str, torch.Tensor]: + for t_ in theta_vector.values(): + if t_.is_leaf: + t_.clamp_(self.lengthscale_bounds[0], self.lengthscale_bounds[1]) - [ - t_.clamp_(self.lengthscale_bounds[0], self.lengthscale_bounds[1]) - if t_ is not None and t_.is_leaf - else None - for t_ in theta_vector.values() - ] return theta_vector @@ -210,6 +207,3 @@ def forward_t( class ProductKernel(CombineKernel): def __init__(self, *kernels, **kwargs): super().__init__("product", *kernels, **kwargs) - - def dk_dphi(self, weights, gr: list = None, x=None, feature_lengthscale=None): - raise NotImplementedError diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py index b35b9d91..2f3d2bf6 100644 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py +++ b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py @@ -243,6 +243,3 @@ def forward_t( class ProductKernel(CombineKernel): def __init__(self, *kernels, **kwargs): super().__init__("product", *kernels, **kwargs) - - def dk_dphi(self, weights, gr: list = None, x=None, feature_lengthscale=None): - raise NotImplementedError diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index 6e0b2052..bd7a1661 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,92 +1,96 @@ +from __future__ import annotations from copy import deepcopy from math import sqrt -from typing import Tuple, Union +from dataclasses import dataclass +from typing import Iterable +from typing_extensions import override import numpy as np import torch +LENGTHSCALE_BOUNDS_DEFAULT = ( + np.exp(-6.754111155189306), + np.exp(0.0858637988771976), +) + +@dataclass class Stationary: """Here we follow the structure of GPy to build a sub class of stationary kernel. All the classes (i.e. 
the class of stationary kernel_operators) derived from this class use the scaled distance to compute the Gram matrix.""" - def __init__( + lengthscale: float | torch.Tensor = 1.0 + lengthscale_bounds: tuple[float, float] = LENGTHSCALE_BOUNDS_DEFAULT + outputscale: float = 1.0 + + gram_: torch.Tensor | None = None + train_: torch.Tensor | None = None + + def forward( self, - lengthscale: Union[float, Tuple[float, ...]] = 1.0, - lengthscale_bounds: Tuple[float, float] = ( - np.exp(-6.754111155189306), - np.exp(0.0858637988771976), - ), - outputscale=1.0, - **kwargs, - ): - super().__init__(**kwargs) - self.lengthscale = lengthscale - self.lengthscale_bounds = lengthscale_bounds - self.outputscale = outputscale - - self._gram = None - self._train = None - - def forward(self, x1, x2=None, l=None, **params): - if l is not None: - return _scaled_distance(l, x1, x2) - return _scaled_distance(self.lengthscale, x1, x2) + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + return _scaled_distance(lengthscale, x1, x2) def fit_transform( self, x1, - l=None, - rebuild_model=True, - save_gram_matrix=True, - ): - if not rebuild_model and self._gram is not None: - return self._gram + l: float | torch.Tensor | None = None, + rebuild_model: bool = True, + save_gram_matrix: bool = True, + ) -> torch.Tensor: + if not rebuild_model and self.gram_ is not None: + return self.gram_ K = self.forward(x1, l=l) if save_gram_matrix: - self._train = deepcopy(x1) + self.train_ = deepcopy(x1) assert isinstance(K, torch.Tensor), "it doesnt work with np arrays.." - self._gram = K.clone() + self.gram_ = K.clone() return K - def transform( - self, - x1, - l=None, - ): - if self._gram is None: + def transform(self, x1, l: float | torch.Tensor | None = None) -> torch.Tensor: + if self.gram_ is None or self.train_ is None: raise ValueError("The kernel has not been fitted. 
Run fit_transform first") - return self.forward(self._train, x1, l=l) - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) + return self.forward(self.train_, x1, l=l) - def forward_t(self, x2, x1=None, l=None): + def forward_t( + self, + x2: torch.Tensor, + x1: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: if x1 is None: - x1 = torch.tensor(self._train) - x2 = torch.tensor(x2).requires_grad_(True) + x1 = torch.tensor(self.train_) + x2 = torch.tensor(x2, requires_grad=True) K = self.forward(x1, x2, l) return K, x2 - def update_hyperparameters(self, lengthscale): + def update_hyperparameters(self, lengthscale: Iterable[torch.Tensor]) -> None: self.lengthscale = [ l_.clamp(self.lengthscale_bounds[0], self.lengthscale_bounds[1]).item() for l_ in lengthscale ] +@dataclass class RBFKernel(Stationary): - def forward(self, x1, x2=None, l=None, **kwargs): - if l is None: - dist_sq = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) - else: - dist_sq = _scaled_distance(l, x1, x2, sq_dist=True) - if isinstance(dist_sq, torch.Tensor): - return self.outputscale * torch.exp(-0.5 * dist_sq) - return self.outputscale * np.exp(-0.5 * dist_sq) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) + return self.outputscale * torch.exp(-0.5 * dist_sq) +@dataclass class LayeredRBFKernel(RBFKernel): """ Same as the conventional RBF kernel, but adapted in a way as a midway between @@ -94,178 +98,167 @@ class LayeredRBFKernel(RBFKernel): Weisfiler-Lehman iteration only (e.g. one weight for h=0, another for h=1 and etc.) 
""" - def forward(self, ard_dims, x1, x2=None, l=None, **kwargs): - l = l if l is not None else self.lengthscale - assert l.shape[0] == ard_dims.shape[0], ( + @override + def forward( + self, + ard_dims: torch.Tensor, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: torch.Tensor | None = None, + ) -> torch.Tensor: + _l = l if l is not None else self.lengthscale + assert isinstance(_l, torch.Tensor), "Lengthscale must be a torch tensor" + assert _l.shape[0] == ard_dims.shape[0], ( "LayeredRBF expects the lengthscale vector to have the same " "dimensionality as the " "number of WL iterations, but got lengthscale vector of shape" - + str(l.shape[0]) + + str(_l.shape[0]) + "and WL iteration of shape " + str(ard_dims.shape[0]) ) - if not isinstance(ard_dims, torch.Tensor): - ard_dims = torch.tensor(ard_dims) + M = torch.cat( - [torch.ones(int(ard_dims[i])) * l[i] for i in range(ard_dims.shape[0])] + [torch.ones(int(ard_dims[i])) * _l[i] for i in range(len(ard_dims))] ) - return super().forward(x1, x2, M, **kwargs) + return super().forward(x1, x2, M) +@dataclass class Matern32Kernel(Stationary): - def forward(self, x1, x2=None, l=None, **kwargs): - if l is None: - l = self.lengthscale - dist = _scaled_distance(l, x1, x2) - if isinstance(dist, torch.Tensor): - return ( - self.outputscale * (1 + sqrt(3.0) * dist) * torch.exp(-sqrt(3.0) * dist) - ) - return self.outputscale * (1 + sqrt(3.0) * dist) * np.exp(-sqrt(3.0) * dist) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist = _scaled_distance(lengthscale, x1, x2) + return self.outputscale * (1 + sqrt(3.0) * dist) * torch.exp(-sqrt(3.0) * dist) class Matern52Kernel(Stationary): - def forward(self, x1, x2=None, l=None, **kwargs): - if l is None: - l = self.lengthscale - dist = _scaled_distance(l, x1, x2) - sq_dist = dist**2 - if isinstance(dist, torch.Tensor): - return ( - self.outputscale - * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * sq_dist) - * torch.exp(-sqrt(5.0) * dist) - ) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist = _scaled_distance(lengthscale, x1, x2, sq_dist=True) return ( self.outputscale - * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * sq_dist) - * np.exp(-sqrt(5.0) * dist) + * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * dist) + * torch.exp(-sqrt(5.0) * dist) ) def update_hyperparameters(self, lengthscale): if lengthscale is None or "continuous" not in lengthscale.keys(): - return + raise ValueError("wtf") lengthscale = lengthscale["continuous"] super().update_hyperparameters(lengthscale=lengthscale) +@dataclass class HammingKernel(Stationary): - def forward(self, x1, x2=None, l=None, **kwargs): - if l is None: - dist = _hamming_distance( - self.lengthscale, - x1, - x2, - ) - else: - dist = _hamming_distance( - l, - x1, - x2, - ) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist = _hamming_distance(lengthscale, x1, x2) return self.outputscale * dist def update_hyperparameters(self, lengthscale): if lengthscale is None or "categorical" not in lengthscale.keys(): - return + raise ValueError("wtf") lengthscale = lengthscale["categorical"] 
super().update_hyperparameters(lengthscale=lengthscale) +@dataclass class RationalQuadraticKernel(Stationary): - def __init__(self, lengthscale, outputscale=1.0, power=2.0, **kwargs): - super().__init__(lengthscale, outputscale, **kwargs) - self.power = power + power: float = 2.0 - def forward(self, x1, x2=None, **kwargs): - dist_sq = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) return self.outputscale * (1 + dist_sq / 2.0) ** (-self.power) -def _unscaled_distance(X, X2=None, sq_dist=False): - """The unscaled distance between X and X2. if x2 is not supplied, then the squared Euclidean distance is - computed within X""" - if isinstance(X, torch.Tensor): - assert X.ndimension() == 2 - if X2 is not None: - assert isinstance(X2, torch.Tensor) - assert X2.ndimension() == 2 - if X2 is None: - Xsq = torch.sum(X**2, 1) - r2 = -2 * X @ X.t() + Xsq[:, None] + Xsq[None, :] - else: - X1sq = torch.sum(X**2, 1) - X2sq = torch.sum(X2**2, 1) - r2 = -2 * X @ X2.t() + X1sq[:, None] + X2sq[None, :] - r2 += 1e-8 - r2 = torch.maximum(r2, torch.tensor(0)) - if not sq_dist: - r2 = torch.sqrt(r2) - else: - assert X.ndim == 2 - if X2 is not None: - assert X2.ndim == 2 - if X2 is None: - Xsq = np.sum(X**2, 1) - r2 = -2 * X @ X.transpose() + Xsq[:, None] + Xsq[None, :] - else: - X1sq = np.sum(X**2, 1) - X2sq = np.sum(X2**2, 1) - r2 = -2 * X @ X2.transpose() + X1sq[:, None] + X2sq[None, :] - if not sq_dist: - r2 = np.sqrt(r2) - return r2 - - -def _scaled_distance(lengthscale, X, X2=None, sq_dist=False): +def _unscaled_square_distance( + X: torch.Tensor, + X2: torch.Tensor | None = None, +) -> torch.Tensor: + """The unscaled distance between X and X2.""" + assert X.ndim == 2 + X1sq = torch.sum(X**2, 1) + X2sq = X1sq if X is X2 else torch.sum(X**2, 1) + X2 = X if X2 is None else X2 + + r2 = -2 * X @ X2.T + X1sq[:, None] + X2sq[None, :] + r2 += 1e-15 + return torch.clamp_min(r2, 0.0) + + +def _scaled_distance( + lengthscale: float | torch.Tensor, + X: torch.Tensor, + X2: torch.Tensor | None = None, + *, + sq_dist: bool = False, +) -> torch.Tensor: """Compute the *scaled* distance between X and x2 (or, if X2 is not supplied, the distance between X and itself) by the lengthscale. if a scalar (float) or a dim=1 lengthscale vector is supplied, then it is assumed that we use one lengthscale for all dimensions. Otherwise, we have an ARD kernel and in which case the length of the lengthscale vector must be the same as the dimensionality of the problem.""" - X = torch.tensor(X, dtype=torch.float64) + if isinstance(lengthscale, float): + if sq_dist is False: + return torch.sqrt(_unscaled_square_distance(X, X2)) / (lengthscale**2) + + return _unscaled_square_distance(X, X2) / lengthscale + + # ARD kernel - one lengthscale per dimension + assert len(lengthscale) == X.shape[1], ( + f"Lengthscale must have the same dimensionality as the input data." 
+ f"Got {len(lengthscale)} and {X.shape[1]}" + ) + rescaled_X = X / lengthscale if X2 is None: - X2 = X - if isinstance(lengthscale, float) or len(lengthscale) == 1: - return ( - _unscaled_distance(X, X2) / lengthscale - if sq_dist is False - else _unscaled_distance(X, X2, sq_dist=True) / (lengthscale**2) - ) + dist = _unscaled_square_distance(rescaled_X) else: - # ARD kernel - one lengthscale per dimension - _check_lengthscale(lengthscale, X) - dist = _unscaled_distance(X / lengthscale, X2 / lengthscale) - return dist if not sq_dist else dist**2 + rescaled_X2 = X2 / lengthscale + dist = _unscaled_square_distance(rescaled_X, rescaled_X2) + + return dist if sq_dist else torch.sqrt(dist) -def _hamming_distance(lengthscale, X, X2=None): +def _hamming_distance( + lengthscale: float | torch.Tensor, + X: torch.Tensor, + X2: torch.Tensor | None = None, +) -> torch.Tensor: if X2 is None: X2 = X - def _distance(X, X2, lengthscale=1.0): - if isinstance(lengthscale, torch.Tensor): - lengthscale = lengthscale.detach().numpy() - indicator = np.expand_dims(X, axis=1) != X2 - K = (-1 / (2 * lengthscale**2) * indicator).sum(axis=2) - K = np.exp(K) - return torch.from_numpy(K) + indicator = X.unsqueeze(1) != X2 + C = -1 / (2 * lengthscale**2) + scaled_indicator = C * indicator + diffs = scaled_indicator.sum(dim=2) if isinstance(lengthscale, float) or len(lengthscale) == 1: - return _distance(X, X2) / lengthscale + return torch.exp(diffs) / lengthscale else: - _check_lengthscale(lengthscale, X) - return _distance(X, X2, lengthscale) - - -def _check_lengthscale(lengthscale, X): - x_shape = len(X[0]) if isinstance(X, list) else X.shape[1] - assert len(lengthscale) == x_shape, ( - "For a non-scaler theta, it needs to be of the same length as the dim" - "of the " - "input data, but got input dim of " - + str(x_shape) - + " and lengthscale dimension of " - + str(lengthscale.shape[0]) - ) + return torch.exp(diffs) diff --git a/neps/optimizers/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py similarity index 99% rename from neps/optimizers/models/deepGP.py rename to neps/optimizers/bayesian_optimization/models/deepGP.py index a94988ab..82355ec5 100644 --- a/neps/optimizers/models/deepGP.py +++ b/neps/optimizers/bayesian_optimization/models/deepGP.py @@ -2,7 +2,6 @@ from dataclasses import dataclass, field import logging -import os from copy import deepcopy from pathlib import Path diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 73ecf019..6a878748 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,276 +1,279 @@ +from __future__ import annotations + import logging from copy import deepcopy -from typing import Iterable, Union +from typing import Iterable, Literal, Sequence, Any import numpy as np +import contextlib import torch -from ..kernels.combine_kernels import ProductKernel, SumKernel +from neps.optimizers.bayesian_optimization.kernels.combine_kernels import ( + ProductKernel, + SumKernel, +) + +from neps.optimizers.bayesian_optimization.kernels.graph_kernel import GraphKernels +from neps.optimizers.bayesian_optimization.kernels.utils import extract_configs +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary +from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import WeisfilerLehman +from neps.search_spaces.search_space import SearchSpace -# GP model as a weighted average between the 
vanilla vectorial GP and the graph GP -from ..kernels.graph_kernel import GraphKernels -from ..kernels.utils import extract_configs -from ..kernels.vectorial_kernels import Stationary -from ..kernels.weisfilerlehman import WeisfilerLehman +logger = logging.getLogger(__name__) class ComprehensiveGP: def __init__( self, + space: SearchSpace, graph_kernels: Iterable, hp_kernels: Iterable, - likelihood: float = 1e-3, - weights=None, - vectorial_features: list = None, - combined_kernel: str = "sum", - logger=None, - surrogate_model_fit_args: dict = None, + initial_likelihood: float = 1e-3, + weights: Sequence[float] | torch.Tensor | None = None, + combined_kernel: Literal["sum", "product"] = "sum", + surrogate_model_fit_args: dict | None = None, + optimizer_kwargs: dict[str, Any] | None = None, + wl_subtree_candidates: Sequence[int] = (1, 2, 3, 4, 5), + wl_lengthscales: Sequence[float] = tuple(np.e**i for i in range(-2, 3)), + optimize_likelihood: bool = True, + optimizer: Literal["adam", "sgd"] = "adam", + optimizer_iters: int = 20, + max_likelihood: float = 0.01, + optimize_wl_layer_weights: bool = False, ): - self.likelihood = likelihood - self.surrogate_model_fit_args = surrogate_model_fit_args or {} - - self.domain_kernels: list = [] - if bool(graph_kernels): - self.domain_kernels += list(graph_kernels) - if bool(hp_kernels): - self.domain_kernels += list(hp_kernels) - - self.n_kernels: int = len(self.domain_kernels) - self.n_graph_kernels: int = len( - [i for i in self.domain_kernels if isinstance(i, GraphKernels)] - ) - self.n_vector_kernels: int = self.n_kernels - self.n_graph_kernels - - self.vectorial_features = vectorial_features - + graph_kernels = list(graph_kernels) + hp_kernels = list(hp_kernels) + n_graph_kernels = len(graph_kernels) + n_vector_kernels = len(hp_kernels) + n_kernels = n_graph_kernels + n_vector_kernels + domain_kernels = [*graph_kernels, *hp_kernels] + + fixed_weights = weights is not None if weights is not None: - self.fixed_weights = True if weights is not None: - assert len(weights) == len(self.n_kernels), ( + assert len(weights) == n_kernels, ( "the weights vector, if supplied, needs to have the same length as " "the number of kernel_operators!" ) - self.init_weights = ( - weights - if isinstance(weights, torch.Tensor) - else torch.tensor(weights).flatten() - ) + init_weights = torch.as_tensor(weights).flatten() else: - self.fixed_weights = False - # Initialise the domain kernel weights to uniform - self.init_weights = torch.tensor( - [1.0 / self.n_kernels] * self.n_kernels, - ) - self.weights = self.init_weights.clone() + uniform_weight = 1.0 / self.n_kernels + init_weights = torch.full((n_kernels,), uniform_weight, dtype=torch.float64) if combined_kernel == "product": - self.combined_kernel = ProductKernel( - *self.domain_kernels, weights=self.weights - ) + _combined_kernel = ProductKernel(*domain_kernels, weights=weights) elif combined_kernel == "sum": - self.combined_kernel = SumKernel(*self.domain_kernels, weights=self.weights) + _combined_kernel = SumKernel(*domain_kernels, weights=weights) else: raise NotImplementedError( f'Combining kernel {combined_kernel} is not yet implemented! Only "sum" ' f'or "product" are currently supported. 
' ) - self.logger = logger or logging.getLogger("neps") + # TODO: Clone only needed while it can act like configurations + self.space = space.clone() + self.init_weights = init_weights + self.fixed_weights = fixed_weights + self.combined_kernel = _combined_kernel + self.initial_likelihood = initial_likelihood + self.surrogate_model_fit_args = surrogate_model_fit_args or {} + self.domain_kernels: list = [*graph_kernels, *hp_kernels] + self.n_kernels: int = len(self.domain_kernels) + self.n_graph_kernels: int = len(graph_kernels) + self.n_vector_kernels: int = len(hp_kernels) + self.optimizer_kwargs = optimizer_kwargs or {"lr": 0.1} + self.optimize_likelihood = optimize_likelihood + self.optimize_wl_layer_weights = optimize_wl_layer_weights + self.optimizer = optimizer + self.optimizer_iters = optimizer_iters + self.max_likelihood = max_likelihood + self.wl_subtree_candidates = wl_subtree_candidates + self.wl_lengthscales = wl_lengthscales + # Cache the Gram matrix inverse and its log-determinant - self.K, self.K_i, self.logDetK = [None] * 3 - self.theta_vector = None - self.layer_weights = None - self.nlml = None - - self.x_configs: list = None - self.y: torch.Tensor = None - self.y_: torch.Tensor = None - self.y_mean: torch.Tensor = None - self.y_std: torch.Tensor = None - self.n: int = None - - def _optimize_graph_kernels(self, h_: int, lengthscale_): - graphs, _ = extract_configs(self.x_configs) - for i, k in enumerate(self.combined_kernel.kernels): - if not isinstance(k, GraphKernels): - continue - elif isinstance(k, WeisfilerLehman): - _grid_search_wl_kernel( - k, - h_, + self.K_ = None + self.K_i_ = None + self.logDetK_ = None + self.theta_vector_ = None + self.layer_weights_ = None + self.nlml_ = None + self.likelihood_: float | None = None + self.weights_: torch.Tensor | None = None + self.x_configs_: list[SearchSpace] | None = None + self.y_: torch.Tensor | None = None + self.y_normalized_: torch.Tensor | None = None + self.y_mean_: float | None = None + self.y_std_: float | None = None + self.n_: int | None = None + + def fit(self, train_x: list[SearchSpace], train_y: list[float]) -> None: + """Called by self.fit""" + self.x_configs = train_x + self.n_ = len(train_x) + self.y_ = torch.as_tensor(train_y, dtype=torch.float64) + + # TODO: Dunno if I like this silent hack, setting std to 1 if no std + self.y_std_ = s if (s := torch.std(self.y_).item()) != 0 else 1 + self.y_mean_ = torch.mean(self.y_).item() + self.y_normalized_ = (self.y_ - self.y_mean_) / self.y_std_ + + # The Gram matrix of the training data + self.K_i_, self.logDetK_ = None, None + + if len(self.wl_subtree_candidates) > 0: + graphs, _ = extract_configs(self.x_configs) + graph_kernels = [ + k for k in self.domain_kernels if isinstance(k, GraphKernels) + ] + for i, kernel in enumerate(graph_kernels): + if not isinstance(kernel, WeisfilerLehman): + logger.warning(f"No kernel opt. for {type(kernel).__name__}.") + continue + + _xs = ( [x[i] for x in graphs] if isinstance(graphs[0], list) - else [c for c in graphs], - self.y, - self.likelihood, - lengthscales=lengthscale_, + else [x for x in graphs] ) - else: - self.logger.warning( - "(Graph) kernel optimisation for " - + type(k).__name__ - + " not implemented yet." 
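+                # Pre-fit the WL kernel's discrete hyperparameters: try every
+                # candidate subtree depth h (and SE lengthscale, if the kernel
+                # wraps one) and keep the pair with the lowest negative log
+                # marginal likelihood, before the gradient-based fit below.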
+ _grid_search_wl_kernel( + kernel=kernel, + subtree_candidates=self.wl_subtree_candidates, + train_x=_xs, + train_y=self.y_, + likelihood=self.initial_likelihood, + lengthscales=self.wl_lengthscales, ) - def fit(self, train_x, train_y): - self._fit(train_x, train_y, **self.surrogate_model_fit_args) - - def _fit( - self, - train_x, - train_y, - iters: int = 20, - optimizer: str = "adam", - wl_subtree_candidates: tuple = tuple(range(5)), - wl_lengthscales: tuple = tuple(np.e**i for i in range(-2, 3)), - optimize_lik: bool = True, - max_lik: float = 0.01, - optimize_wl_layer_weights: bool = False, - optimizer_kwargs: dict = None, - ): - """Called by self.fit""" - self._reset_XY(train_x, train_y) - - # Get the node weights, if needed - - if optimizer_kwargs is None: - optimizer_kwargs = {"lr": 0.1} - if len(wl_subtree_candidates) > 0: - self._optimize_graph_kernels( - wl_subtree_candidates, - wl_lengthscales, - ) - weights = self.init_weights.clone() - if (not self.fixed_weights) and len(self.domain_kernels) > 1: + if not self.fixed_weights and self.n_kernels > 1: weights.requires_grad_(True) - theta_vector = get_theta_vector(vectorial_features=self.vectorial_features) + n_cat = len(self.space.categoricals) + n_num = len(self.space.numerical) + theta_categorical = torch.ones( + n_cat, requires_grad=n_cat > 1, dtype=torch.float64 + ) + theta_numerical = torch.ones(n_num, requires_grad=n_num > 1, dtype=torch.float64) - # Whether to include the likelihood (jitter or noise variance) as a hyperparameter + theta_vectors = { + "categorical": theta_categorical, + "continuous": theta_numerical, # NOTE: This actually includes integers too -_- + } likelihood = torch.tensor( - self.likelihood, + self.initial_likelihood, requires_grad=self.optimize_likelihood ) - if optimize_lik: - likelihood.requires_grad_(True) layer_weights = None - if optimize_wl_layer_weights: - for k in self.domain_kernels: - if isinstance(k, WeisfilerLehman): - layer_weights = torch.ones(k.h + 1).requires_grad_(True) - if layer_weights.shape[0] <= 1: - layer_weights = None - else: - break + if self.optimize_wl_layer_weights: + for kernel in self.domain_kernels: + if isinstance(kernel, WeisfilerLehman) and kernel.h != 0: + layer_weights = torch.ones(kernel.h + 1, requires_grad=True) + break # Linking the optimizer variables to the sum kernel - optim_vars = [] - for a in [weights, likelihood, layer_weights]: - if a is not None and a.is_leaf and a.requires_grad: - optim_vars.append(a) - - if theta_vector is not None: - for a in theta_vector.values(): - if a is not None and a.requires_grad: - optim_vars.append(a) + optim_vars = [ + a + for a in ( + weights, + likelihood, + layer_weights, + theta_categorical, + theta_numerical, + ) + if a is not None and a.is_leaf and a.requires_grad + ] + nlml = None if len(optim_vars) == 0: # Skip optimisation K = self.combined_kernel.fit_transform( weights, self.x_configs, - feature_lengthscale=theta_vector, + feature_lengthscale=theta_vectors, layer_weights=layer_weights, rebuild_model=True, ) - K_i, logDetK = compute_pd_inverse(K, likelihood) + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) else: # Select the optimizer - assert optimizer.lower() in ["adam", "sgd"] - if optimizer.lower() == "adam": - optim = torch.optim.Adam(optim_vars, **optimizer_kwargs) + if self.optimizer == "adam": + optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore + elif self.optimizer == "sgd": + optim = torch.optim.SGD(optim_vars, **self.optimizer_kwargs) # type: ignore else: - optim = 
torch.optim.SGD(optim_vars, **optimizer_kwargs) + raise ValueError(f"Invalid optimizer {self.optimizer}") - K = None - for i in range(iters): + K: torch.Tensor | None = None + for i in range(self.optimizer_iters): optim.zero_grad() K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - feature_lengthscale=theta_vector, + weights=weights, + configs=train_x, # TODO + feature_lengthscale=theta_vectors, layer_weights=layer_weights, rebuild_model=True, save_gram_matrix=True, ) - K_i, logDetK = compute_pd_inverse(K, likelihood) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, self.y) + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + nlml = -compute_log_marginal_likelihood( + K_i, logDetK, y=self.y_normalized_ + ) nlml.backward() if i % 10 == 0: - self.logger.debug( - f"Iteration: {i}/{iters} " + logger.debug( + f"Iteration: {i}/{self.optimizer_iters} " f"Negative log-marginal likelihood:" - f"{nlml.item()} {theta_vector} {weights} {likelihood}" + f"{nlml.item()} {theta_vectors} {weights} {likelihood}" ) + optim.step() # TODO + with torch.no_grad(): + if weights.is_leaf: + weights.clamp_(0.0, 1.0) - weights.clamp_( - 0.0, 1.0 - ) if weights is not None and weights.is_leaf else None - theta_vector = self.combined_kernel.clamp_theta_vector(theta_vector) - likelihood.clamp_( - 1e-5, max_lik - ) if likelihood is not None and likelihood.is_leaf else None - layer_weights.clamp_( - 0.0, 1.0 - ) if layer_weights is not None and layer_weights.is_leaf else None + theta_vectors = self.combined_kernel.clamp_theta_vector(theta_vectors) + if likelihood.is_leaf: + likelihood.clamp_(1e-5, self.max_likelihood) + + if layer_weights is not None and layer_weights.is_leaf: + layer_weights.clamp_(0.0, 1.0) optim.zero_grad(set_to_none=True) - K_i, logDetK = compute_pd_inverse(K, likelihood) + assert K is not None + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) # Apply the optimal hyperparameters - self.weights = weights.clone() / torch.sum(weights) - self.K_i = K_i.clone() - self.K = K.clone() - self.logDetK = logDetK.clone() - self.likelihood = likelihood.item() - self.theta_vector = theta_vector - self.layer_weights = layer_weights - self.nlml = nlml.detach().cpu() if nlml is not None else None - - for k in self.combined_kernel.kernels: - if isinstance(k, Stationary): - k.update_hyperparameters(lengthscale=theta_vector) - - self.combined_kernel.weights = weights.clone() - - self.logger.debug("Optimisation summary: ") - self.logger.debug( - f"Optimal NLML: {nlml}", - ) - self.logger.debug(f"Lengthscales: {theta_vector}") - try: - self.logger.debug( - f"Optimal h: {self.domain_kernels[0]._h}", - ) - except AttributeError: - pass - self.logger.debug(f"Weights: {self.weights}") - self.logger.debug(f"Lik: {self.likelihood}") - self.logger.debug(f"Optimal layer weights {layer_weights}") - - def predict(self, x_configs, preserve_comp_graph: bool = False): + self.weights_ = weights.clone() / torch.sum(weights) + self.K_i_ = K_i.clone() + self.K_ = K.clone() + self.logDetK_ = logDetK.clone() + self.likelihood_ = likelihood.item() + self.theta_vector_ = theta_vectors + self.layer_weights_ = layer_weights + self.nlml_ = nlml.detach().cpu() if nlml is not None else None + + for kernel in self.combined_kernel.kernels: + if isinstance(kernel, Stationary): + kernel.update_hyperparameters(lengthscale=self.theta_vector_) + + logger.debug("Optimisation summary: ") + logger.debug(f"Optimal NLML: {nlml}") + logger.debug(f"Lengthscales: {theta_vectors}") + with 
contextlib.suppress(AttributeError): + logger.debug(f"Optimal h: {self.domain_kernels[0]._h}") + logger.debug(f"Weights: {self.weights_}") + logger.debug(f"Lik: {self.likelihood_}") + logger.debug(f"Optimal layer weights {layer_weights}") + + def predict(self, x_configs: list[SearchSpace]) -> tuple[torch.Tensor, torch.Tensor]: """Kriging predictions""" - if not isinstance(x_configs, list): - # Convert a single input X_s to a singleton list x_configs = [x_configs] - if self.K_i is None or self.logDetK is None: + if self.K_i_ is None or self.logDetK_ is None or self.weights_ is None: raise ValueError( "Inverse of Gram matrix is not instantiated. Please call the optimize " "function to fit on the training data first!" @@ -278,246 +281,39 @@ def predict(self, x_configs, preserve_comp_graph: bool = False): # Concatenate the full list X_configs_all = self.x_configs + x_configs - - # Make a copy of the sum_kernels for this step, to avoid breaking the autodiff - # if grad guided mutation is used - if preserve_comp_graph: - combined_kernel_copy = deepcopy(self.combined_kernel) - else: - combined_kernel_copy = self.combined_kernel - - K_full = combined_kernel_copy.fit_transform( - self.weights, - X_configs_all, - layer_weights=self.layer_weights, - feature_lengthscale=self.theta_vector, + n_train = len(self.x_configs) + n_test = len(x_configs) + + K_full = self.combined_kernel.fit_transform( + weights=self.weights_, + configs=X_configs_all, + layer_weights=self.layer_weights_, + feature_lengthscale=self.theta_vector_, rebuild_model=True, save_gram_matrix=False, gp_fit=False, ) - K_s = K_full[: self.n :, self.n :] - - K_ss = K_full[self.n :, self.n :] + self.likelihood * torch.eye( - len(x_configs), - ) - - mu_s = K_s.t() @ self.K_i @ self.y - cov_s = K_ss - K_s.t() @ self.K_i @ K_s - cov_s = torch.clamp(cov_s, self.likelihood, np.inf) - mu_s = unnormalize_y(mu_s, self.y_mean, self.y_std) - std_s = torch.sqrt(cov_s) - std_s = unnormalize_y(std_s, None, self.y_std, True) - cov_s = std_s**2 - if preserve_comp_graph: - del combined_kernel_copy - return mu_s, cov_s + K_s = K_full[:n_train:, n_train:] + K_ss = K_full[n_train:, n_train:] + self.likelihood_ * torch.eye(n_test) - @property - def x(self): - return self.x_configs + mu_s = K_s.t() @ self.K_i_ @ self.y_ + mu_s = mu_s * self.y_std_ + self.y_mean_ - def _reset_XY(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): - self.x_configs = train_x - self.n = len(self.x_configs) - train_y_tensor = ( - train_y - if isinstance(train_y, torch.Tensor) - else torch.tensor(train_y, dtype=torch.get_default_dtype()) - ) - self.y_ = train_y_tensor - self.y, self.y_mean, self.y_std = normalize_y(train_y_tensor) - # The Gram matrix of the training data - self.K_i, self.logDetK = None, None + cov_s = K_ss - K_s.t() @ self.K_i_ @ K_s + cov_s = torch.clamp(cov_s, self.likelihood_, np.inf) + cov_s = (torch.sqrt(cov_s) * self.y_std_) ** 2 - def dmu_dphi( - self, - X_s=None, - # compute_grad_var=False, - average_across_features=True, - average_across_occurrences=False, - ): - r""" - Compute the derivative of the GP posterior mean at the specified input location with respect to the - *vector embedding* of the graph (e.g., if using WL-subtree, this function computes the gradient wrt - each subtree pattern) - - The derivative is given by - $ - \frac{\partial \mu^*}{\partial \phi ^*} = \frac{\partial K(\phi, \phi^*)}{\partial \phi ^ *}K(\phi, \phi)^{-1} - \mathbf{y} - $ - - which derives directly from the GP posterior mean formula, and since the term $K(\phi, 
\phi)^{-1} and \mathbf{y} - are both independent of the testing points (X_s, or \phi^*}, the posterior gradient is simply the matrix - produce of the kernel gradient with the inverse Gram and the training label vector. - - Parameters - ---------- - X_s: The locations on which the GP posterior mean derivatives should be evaluated. If left blank, the - derivatives will be evaluated at the training points. - - compute_grad_var: bool. If true, also compute the gradient variance. - - The derivative of GP is also a GP, and thus the predictive distribution of the posterior gradient is Gaussian. - The posterior mean is given above, and the posterior variance is: - $ - \mathbb{V}[\frac{\partial f^*}{\partial \phi^*}]= \frac{\partial^2k(\phi^*, \phi^*)}{\partial \phi^*^2} - - \frac{\partial k(\phi^*, \Phi)}{\partial \phi^*}K(X, X)^{-1}\frac{\partial k{(\Phi, \phi^*)}}{\partial \phi^*} - $ - - Returns - ------- - list of K torch.Tensor of the shape N x2 D, where N is the length of the X_s list (each element of which is a - networkx graph), K is the number of kernel_operators in the combined kernel and D is the dimensionality of the - feature vector (this is determined by the specific graph kernel. - - OR - - list of K torch.Tensor of shape D, if averaged_over_samples flag is enabled. - """ - if self.K_i is None or self.logDetK is None: - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize " - "function to fit on the training data first!" - ) - if self.n_vector_kernels: - if X_s is not None: - V_s = self._get_vectorial_features(X_s, self.vectorial_feactures) - V_s, _, _ = standardize_x(V_s, self.x_features_min, self.x_features_max) - else: - V_s = self.x_features - X_s = self.x[:] - else: - V_s = None - X_s = X_s if X_s is not None else self.x[:] - - alpha = (self.K_i @ self.y).double().reshape(1, -1) - dmu_dphi = [] - # dmu_dphi_var = [] if compute_grad_var else None - - Ks_handles = [] - feature_matrix = [] - for j, x_s in enumerate(X_s): - jacob_vecs = [] - if V_s is None: - handles = self.combined_kernel.forward_t( - self.weights, - [x_s], - ) - else: - handles = self.combined_kernel.forward_t(self.weights, [x_s], V_s[j]) - Ks_handles.append(handles) - # Each handle is a 2-tuple. first element is the Gram matrix, second element is the leaf variable - feature_vectors = [] - for handle in handles: - k_s, y, _ = handle - # k_s is output, leaf is input, alpha is the K_i @ y term which is constant. - # When compute_grad_var is not required, computational graphs do not need to be saved. - jacob_vecs.append( - torch.autograd.grad( - outputs=k_s, inputs=y, grad_outputs=alpha, retain_graph=False - )[0] - ) - feature_vectors.append(y) - feature_matrix.append(feature_vectors) - jacob_vecs = torch.cat(jacob_vecs) - dmu_dphi.append(jacob_vecs) - - feature_matrix = torch.cat([f[0] for f in feature_matrix]) - if average_across_features: - dmu_dphi = torch.cat(dmu_dphi) - # compute the weighted average of the gradient across N_t. - # feature matrix is of shape N_t x K x D - avg_mu, avg_var, incidences = get_grad( - dmu_dphi, feature_matrix, average_across_occurrences - ) - return avg_mu, avg_var, incidences - return ( - dmu_dphi, - None, - feature_matrix.sum(dim=0) if average_across_occurrences else feature_matrix, - ) - - -def get_grad(grad_matrix, feature_matrix, average_occurrences=False): - r""" - Average across the samples via a Monte Carlo sampling scheme. Also estimates the - empirical variance. 
:param average_occurrences: if True, do a weighted summation - based on the frequency distribution of the occurrence to compute a gradient *per - each feature*. Otherwise, each different occurrence (\phi_i = k) will get a - different gradient estimate. - """ - assert grad_matrix.shape == feature_matrix.shape - # Prune out the all-zero columns that pop up sometimes - valid_cols = [] - for col_idx in range(feature_matrix.size(1)): - if not torch.all(feature_matrix[:, col_idx] == 0): - valid_cols.append(col_idx) - feature_matrix = feature_matrix[:, valid_cols] - grad_matrix = grad_matrix[:, valid_cols] - - _, D = feature_matrix.shape - if average_occurrences: - avg_grad = torch.zeros(D) - avg_grad_var = torch.zeros(D) - for d in range(D): - current_feature = feature_matrix[:, d].clone().detach() - instances, indices, counts = torch.unique( - current_feature, return_inverse=True, return_counts=True - ) - weight_vector = torch.tensor([counts[i] for i in indices]).type(torch.float) - weight_vector /= weight_vector.sum() - mean = torch.sum(weight_vector * grad_matrix[:, d]) - # Compute the empirical variance of gradients - variance = torch.sum(weight_vector * grad_matrix[:, d] ** 2) - mean**2 - avg_grad[d] = mean - avg_grad_var[d] = variance - return avg_grad, avg_grad_var, feature_matrix.sum(dim=0) - else: - # The maximum number possible occurrences -- 7 is an example, if problem occurs, maybe we can increase this - # number. But for now, for both NAS-Bench datasets, this should be more than enough! - max_occur = 7 - avg_grad = torch.zeros(D, max_occur) - avg_grad_var = torch.zeros(D, max_occur) - incidences = torch.zeros(D, max_occur) - for d in range(D): - current_feature = feature_matrix[:, d].clone().detach() - instances, indices, counts = torch.unique( - current_feature, return_inverse=True, return_counts=True - ) - for i, val in enumerate(instances): - # Find index of all feature counts that are equal to the current val - feature_at_val = grad_matrix[current_feature == val] - avg_grad[d, int(val)] = torch.mean(feature_at_val) - avg_grad_var[d, int(val)] = torch.var(feature_at_val) - incidences[d, int(val)] = counts[i] - return avg_grad, avg_grad_var, incidences - - -# Optimize Graph kernel -def getBack(var_grad_fn, logger): - logger.debug(var_grad_fn) - for n in var_grad_fn.next_functions: - if n[0]: - try: - tensor = getattr(n[0], "variable") - logger.debug(n[0]) - logger.debug(f"Tensor with grad found: {tensor}") - logger.debug(f" - gradient: {tensor.grad}") - except AttributeError: - getBack(n[0], logger) + return mu_s, cov_s def _grid_search_wl_kernel( - k: WeisfilerLehman, + kernel: WeisfilerLehman, subtree_candidates, train_x: list, train_y: torch.Tensor, - lik: float, - subtree_prior=None, + likelihood: float, lengthscales=None, - lengthscales_prior=None, ): """Optimize the *discrete hyperparameters* of Weisfeiler Lehman kernel. 
k: a Weisfeiler-Lehman kernel instance @@ -533,136 +329,73 @@ def _grid_search_wl_kernel( best_subtree_depth = None best_lengthscale = None best_K = None - if lengthscales is not None and k.se is not None: + if lengthscales is not None and kernel.se is not None: candidates = [(h_, l_) for h_ in subtree_candidates for l_ in lengthscales] else: candidates = [(h_, None) for h_ in subtree_candidates] for i in candidates: - if k.se is not None: - k.change_se_params({"lengthscale": i[1]}) - k.change_kernel_params({"h": i[0]}) - K = k.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) - # self.logger.debug(K) - K_i, logDetK = compute_pd_inverse(K, lik) - # self.logger.debug(train_y) + if kernel.se is not None: + kernel.change_se_params({"lengthscale": i[1]}) + + kernel.change_kernel_params({"h": i[0]}) + K = kernel.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - # self.logger.debug(f"{i} {nlml}") if nlml < best_nlml: best_nlml = nlml best_subtree_depth, best_lengthscale = i best_K = torch.clone(K) - # self.logger.debug(f"h: {best_subtree_depth} theta: {best_lengthscale}") - # self.logger.debug(best_subtree_depth) - k.change_kernel_params({"h": best_subtree_depth}) - if k.se is not None: - k.change_se_params({"lengthscale": best_lengthscale}) - k._gram = best_K - - -def get_theta_vector(vectorial_features): - if vectorial_features is None: - return None - theta_vector = {} - for key, dim in vectorial_features.items(): - t = torch.ones(dim) - if t.shape[0] > 1: - t.requires_grad_(True) - theta_vector[key] = t - return theta_vector - - -def normalize_y(y: torch.Tensor): - y_mean = torch.mean(y) if isinstance(y, torch.Tensor) else np.mean(y) - y_std = torch.std(y) if isinstance(y, torch.Tensor) else np.std(y) - if y_std == 0: - y_std = 1 - y = (y - y_mean) / y_std - return y, y_mean, y_std - - -def unnormalize_y(y, y_mean, y_std, scale_std=False): - """Similar to the undoing of the pre-processing step above, but on the output predictions""" - if not scale_std: - return y * y_std + y_mean - else: - return y * y_std - -def standardize_x( - x: torch.Tensor, x_min: torch.Tensor = None, x_max: torch.Tensor = None -): - """Standardize the vectorial input into a d-dimensional hypercube [0, 1]^d, where d is the number of features. - if x_min ond x_max are supplied, x2 will be standardised using these instead. This is used when standardising the - validation/test inputs. - """ - if (x_min is not None and x_max is None) or (x_min is None and x_max is not None): - raise ValueError( - "Either *both* or *neither* of x_min, x_max need to be supplied!" - ) - if x_min is None: - x_min = torch.min(x, 0)[0] - x_max = torch.max(x, 0)[0] - x = (x - x_min) / (x_max - x_min) - return x, x_min, x_max + kernel.change_kernel_params({"h": best_subtree_depth}) + if kernel.se is not None: + kernel.change_se_params({"lengthscale": best_lengthscale}) + kernel._gram = best_K def compute_log_marginal_likelihood( K_i: torch.Tensor, logDetK: torch.Tensor, y: torch.Tensor, + *, normalize: bool = True, - log_prior_dist=None, -): +) -> torch.Tensor: """Compute the zero mean Gaussian process log marginal likelihood given the inverse of Gram matrix K(x2,x2), its log determinant, and the training label vector y. Option: normalize: normalize the log marginal likelihood by the length of the label vector, as per the gpytorch routine. - - prior: A pytorch distribution object. 
If specified, the hyperparameter prior will be taken into consideration and - we use Type-II MAP instead of Type-II MLE (compute log_posterior instead of log_evidence) """ lml = ( - -0.5 * y.t() @ K_i @ y + -0.5 * (y.t() @ K_i @ y) + 0.5 * logDetK - - y.shape[0] - / 2.0 - * torch.log( - 2 - * torch.tensor( - np.pi, - ) - ) + - y.shape[0] / 2.0 * torch.log(2 * torch.tensor(np.pi)) ) - if log_prior_dist is not None: - lml -= log_prior_dist return lml / y.shape[0] if normalize else lml -def compute_pd_inverse(K: torch.tensor, jitter: float = 1e-6): +def compute_pd_inverse( + K: torch.Tensor, + *, + jitter: float | torch.Tensor = 1e-6, + attempts: int = 3, +) -> tuple[torch.Tensor, torch.Tensor]: """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" n = K.shape[0] assert ( isinstance(jitter, float) or jitter.ndim == 0 ), "only homoscedastic noise variance is allowed here!" - is_successful = False - fail_count = 0 - max_fail = 3 - while fail_count < max_fail and not is_successful: + for i in range(attempts): try: - jitter_diag = jitter * torch.eye(n, device=K.device) * 10**fail_count - K_ = K + jitter_diag - try: - Kc = torch.linalg.cholesky(K_) - except AttributeError: # For torch < 1.8.0 - Kc = torch.cholesky(K_) - is_successful = True + jitter_diag = jitter * torch.eye(n, device=K.device) * 10**i + Kc = torch.linalg.cholesky(K + jitter_diag) + break except RuntimeError: - fail_count += 1 - if not is_successful: + pass + else: raise RuntimeError(f"Gram matrix not positive definite despite of jitter:\n{K}") + logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) K_i = torch.cholesky_inverse(Kc) - return K_i.to(torch.get_default_dtype()), logDetK.to(torch.get_default_dtype()) + return K_i.to(dtype=torch.float64), logDetK.to(dtype=torch.float64) diff --git a/neps/optimizers/models/__init__.py b/neps/optimizers/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/neps/search_spaces/search_space.py b/neps/search_spaces/search_space.py index 3f0d6703..40ecd0cf 100644 --- a/neps/search_spaces/search_space.py +++ b/neps/search_spaces/search_space.py @@ -211,7 +211,7 @@ def __init__(self, **hyperparameters: Parameter): if not isinstance(hp, NumericalParameter): raise ValueError( - "neps only suport float and integer fidelity parameters" + f"Only float and integer fidelities supported, got {hp}" ) _fidelity_param = hp @@ -232,6 +232,25 @@ def __init__(self, **hyperparameters: Parameter): self.raw_tabular_space: SearchSpace | None = None self.has_tabular: bool = False + self.categoricals: Mapping[str, CategoricalParameter] = { + k: hp for k, hp in _hyperparameters if isinstance(hp, CategoricalParameter) + } + self.numerical: Mapping[str, NumericalParameter] = { + k: hp + for k, hp in _hyperparameters + if isinstance(hp, NumericalParameter) and not hp.is_fidelity + } + self.graphs: Mapping[str, GraphParameter] = { + k: hp for k, hp in _hyperparameters if isinstance(hp, GraphParameter) + } + self.constants: Mapping[str, Any] = { + k: hp.value for k, hp in _hyperparameters if isinstance(hp, ConstantParameter) + } + # NOTE: For future of multiple fidelities + self.fidelities: Mapping[str, NumericalParameter] = {} + if _fidelity_param is not None and _fidelity_name is None: + self.fidelities = {_fidelity_name: _fidelity_param} + def set_custom_grid_space( self, grid_table: pd.Series | pd.DataFrame, From dc3ae030b3d38f08f13f88d734dce70303e4cdf9 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 14 Aug 2024 18:44:54 +0200 Subject: [PATCH 05/63] 
refactor: Lengthscale is always tensor --- .../kernels/vectorial_kernels.py | 55 +++++++------------ 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index bd7a1661..9d0d4df2 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,8 +1,7 @@ from __future__ import annotations from copy import deepcopy from math import sqrt -from dataclasses import dataclass -from typing import Iterable +from dataclasses import dataclass, field from typing_extensions import override import numpy as np @@ -20,7 +19,8 @@ class Stationary: All the classes (i.e. the class of stationary kernel_operators) derived from this class use the scaled distance to compute the Gram matrix.""" - lengthscale: float | torch.Tensor = 1.0 + # A single value applies to all dimensions, a vector applies to each dimension + lengthscale: torch.Tensor = field(default_factory=lambda: torch.tensor(1.0)) lengthscale_bounds: tuple[float, float] = LENGTHSCALE_BOUNDS_DEFAULT outputscale: float = 1.0 @@ -31,7 +31,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale return _scaled_distance(lengthscale, x1, x2) @@ -39,7 +39,7 @@ def forward( def fit_transform( self, x1, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, rebuild_model: bool = True, save_gram_matrix: bool = True, ) -> torch.Tensor: @@ -52,7 +52,7 @@ def fit_transform( self.gram_ = K.clone() return K - def transform(self, x1, l: float | torch.Tensor | None = None) -> torch.Tensor: + def transform(self, x1, l: torch.Tensor | None = None) -> torch.Tensor: if self.gram_ is None or self.train_ is None: raise ValueError("The kernel has not been fitted. 
Run fit_transform first") return self.forward(self.train_, x1, l=l) @@ -61,7 +61,7 @@ def forward_t( self, x2: torch.Tensor, x1: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: if x1 is None: x1 = torch.tensor(self.train_) @@ -69,11 +69,8 @@ def forward_t( K = self.forward(x1, x2, l) return K, x2 - def update_hyperparameters(self, lengthscale: Iterable[torch.Tensor]) -> None: - self.lengthscale = [ - l_.clamp(self.lengthscale_bounds[0], self.lengthscale_bounds[1]).item() - for l_ in lengthscale - ] + def update_hyperparameters(self, lengthscale: torch.Tensor) -> None: + self.lengthscale = torch.clamp(lengthscale, *self.lengthscale_bounds) @dataclass @@ -83,7 +80,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) @@ -130,7 +127,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist = _scaled_distance(lengthscale, x1, x2) @@ -143,7 +140,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist = _scaled_distance(lengthscale, x1, x2, sq_dist=True) @@ -153,12 +150,6 @@ def forward( * torch.exp(-sqrt(5.0) * dist) ) - def update_hyperparameters(self, lengthscale): - if lengthscale is None or "continuous" not in lengthscale.keys(): - raise ValueError("wtf") - lengthscale = lengthscale["continuous"] - super().update_hyperparameters(lengthscale=lengthscale) - @dataclass class HammingKernel(Stationary): @@ -167,18 +158,12 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist = _hamming_distance(lengthscale, x1, x2) return self.outputscale * dist - def update_hyperparameters(self, lengthscale): - if lengthscale is None or "categorical" not in lengthscale.keys(): - raise ValueError("wtf") - lengthscale = lengthscale["categorical"] - super().update_hyperparameters(lengthscale=lengthscale) - @dataclass class RationalQuadraticKernel(Stationary): @@ -189,7 +174,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) @@ -212,7 +197,7 @@ def _unscaled_square_distance( def _scaled_distance( - lengthscale: float | torch.Tensor, + lengthscale: torch.Tensor, X: torch.Tensor, X2: torch.Tensor | None = None, *, @@ -224,7 +209,7 @@ def _scaled_distance( lengthscale for all dimensions. 
Otherwise, we have an ARD kernel and in which case the length of the lengthscale vector must be the same as the dimensionality of the problem.""" - if isinstance(lengthscale, float): + if len(lengthscale) == 1: if sq_dist is False: return torch.sqrt(_unscaled_square_distance(X, X2)) / (lengthscale**2) @@ -246,7 +231,7 @@ def _scaled_distance( def _hamming_distance( - lengthscale: float | torch.Tensor, + lengthscale: torch.Tensor, X: torch.Tensor, X2: torch.Tensor | None = None, ) -> torch.Tensor: @@ -258,7 +243,7 @@ def _hamming_distance( scaled_indicator = C * indicator diffs = scaled_indicator.sum(dim=2) - if isinstance(lengthscale, float) or len(lengthscale) == 1: + if len(lengthscale) == 1: return torch.exp(diffs) / lengthscale - else: - return torch.exp(diffs) + + return torch.exp(diffs) From 3b8e549ffde1fcd5490e915cf3843221c490cf8c Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 15 Aug 2024 18:55:24 +0200 Subject: [PATCH 06/63] refactor: Cleanup and reduce duplicate preprocessing GP --- .../bayesian_optimization/cost_cooling.py | 4 +- .../bayesian_optimization/kernels/__init__.py | 4 + .../kernels/combine_kernels.py | 209 ------- .../kernels/combine_kernels_hierarchy.py | 44 -- .../bayesian_optimization/kernels/encoding.py | 277 --------- .../kernels/get_kernels.py | 92 +-- .../kernels/grakel_replace/__init__.py | 8 + .../grakel_replace/vertex_histogram.py | 15 +- .../grakel_replace/weisfeiler_lehman.py | 19 +- .../kernels/graph_kernel.py | 35 -- .../bayesian_optimization/kernels/kernel.py | 110 ++++ .../bayesian_optimization/kernels/utils.py | 14 +- .../kernels/vectorial_kernels.py | 167 ++--- .../kernels/weisfilerlehman.py | 360 ++--------- .../bayesian_optimization/models/deepGP.py | 4 +- .../bayesian_optimization/models/gp.py | 578 +++++++----------- .../models/gp_hierarchy.py | 162 +---- .../bayesian_optimization/optimizer.py | 4 +- neps/optimizers/multi_fidelity/dyhpo.py | 4 +- .../multi_fidelity/sampling_policy.py | 27 +- neps/search_spaces/__init__.py | 2 + neps/search_spaces/encoding.py | 132 ++++ 22 files changed, 671 insertions(+), 1600 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/combine_kernels.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/encoding.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/graph_kernel.py create mode 100644 neps/optimizers/bayesian_optimization/kernels/kernel.py create mode 100644 neps/search_spaces/encoding.py diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py index f2878fe9..0d77fbc6 100644 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/cost_cooling.py @@ -23,7 +23,7 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization @@ -124,7 +124,7 @@ def __init__( surrogate_model_args = surrogate_model_args or {} cost_model_args = cost_model_args or {} - graph_kernels, hp_kernels = get_kernels( + graph_kernels, hp_kernels = get_default_kernels( self.pipeline_space, domain_se_kernel, graph_kernels, diff --git 
a/neps/optimizers/bayesian_optimization/kernels/__init__.py b/neps/optimizers/bayesian_optimization/kernels/__init__.py index 8d11ea81..44c8e0ac 100644 --- a/neps/optimizers/bayesian_optimization/kernels/__init__.py +++ b/neps/optimizers/bayesian_optimization/kernels/__init__.py @@ -1,8 +1,12 @@ from __future__ import annotations +from dataclasses import dataclass from functools import partial from typing import Callable +from typing_extensions import TypeAlias +from neps.optimizers.bayesian_optimization.kernels.graph_kernel import GraphKernels +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary from .encoding import NASBOTDistance from .vectorial_kernels import HammingKernel, Matern32Kernel, Matern52Kernel, RBFKernel from .weisfilerlehman import WeisfilerLehman diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py deleted file mode 100644 index 3aa320b5..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py +++ /dev/null @@ -1,209 +0,0 @@ -import logging - -import torch - -from .utils import extract_configs -from .vectorial_kernels import HammingKernel, Stationary -from .weisfilerlehman import GraphKernels - - -def _select_dimensions(k): - if isinstance(k, HammingKernel): - return "categorical" - return "continuous" - - -class CombineKernel: - def __init__( - self, - combined_by="sum", - *kernels: list, - **kwargs, - ): - if combined_by not in ["sum", "product"]: - raise ValueError(f"Invalid value for combined_by ({combined_by})") - - self.has_graph_kernels = False - self.has_vector_kernels = False - self.lengthscale_bounds = (None, None) - for k in kernels: - if isinstance(k, GraphKernels): - self.has_graph_kernels = True - if not isinstance(k, GraphKernels): - self.has_vector_kernels = True - self.lengthscale_bounds = k.lengthscale_bounds - self.kernels = kernels - # Store the training graphs and vector features.. - self._gram = None - self.gr, self.x = None, None - self.combined_by = combined_by - - def fit_transform( - self, - weights: torch.Tensor, - configs: list, - normalize: bool = True, - rebuild_model: bool = True, - save_gram_matrix: bool = True, - gp_fit: bool = True, - feature_lengthscale: dict[str, torch.Tensor] | None = None, - **kwargs, - ): - N = len(configs) - K = torch.zeros(N, N) if self.combined_by == "sum" else torch.ones(N, N) - - gr1, x1 = extract_configs(configs) - - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels) and None not in gr1: - update_val = weights[i] * k.fit_transform( - [g[i] for g in gr1] if isinstance(gr1[0], (list, tuple)) else gr1, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - gp_fit=gp_fit, - **kwargs, - ) - - elif isinstance(k, Stationary) and None not in x1: - key = _select_dimensions(k) - update_val = ( - weights[i] - * k.fit_transform( - [x_[key] for x_ in x1], - l=feature_lengthscale[key] - if isinstance(feature_lengthscale, dict) - else None, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - ) - ).double() - - else: - raise NotImplementedError( - "For now, only the Stationary custom built kernel_operators are " - "supported! 
" - ) - - if self.combined_by == "sum": - K += update_val - elif self.combined_by == "product": - K *= update_val - - if normalize: - K_diag = torch.sqrt(torch.diag(K)) - K /= torch.ger(K_diag, K_diag) - if save_gram_matrix: - self._gram = K.clone() - - return K - - def transform( - self, - weights: torch.Tensor, - configs: list, - x=None, - feature_lengthscale=None, - ): - if self._gram is None: - raise ValueError( - "The kernel has not been fitted. Call fit_transform first to generate " - "the training Gram matrix." - ) - gr, x = extract_configs(configs) - # K is in shape of len(Y), len(X) - size = len(configs) - K = ( - torch.zeros(size, self._gram.shape[0]) - if self.combined_by == "sum" - else torch.ones(size, self._gram.shape[0]) - ) - - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels) and None not in gr: - update_val = weights[i] * k.transform( - [g[i] for g in gr] if isinstance(gr, list) else gr - ) - elif isinstance(k, Stationary) and None not in x: - key = _select_dimensions(k) - update_val = ( - weights[i] - * k.transform( - [x_[key] for x_ in x], - l=feature_lengthscale[key] - if isinstance(feature_lengthscale, dict) - else None, - ).double() - ) - else: - raise NotImplementedError( - "For now, only the Stationary custom built kernel_operators are " - "supported! " - ) - - if self.combined_by == "sum": - K += update_val - elif self.combined_by == "product": - K *= update_val - - return K.t() - - def clamp_theta_vector( - self, theta_vector: dict[str, torch.Tensor] - ) -> dict[str, torch.Tensor]: - for t_ in theta_vector.values(): - if t_.is_leaf: - t_.clamp_(self.lengthscale_bounds[0], self.lengthscale_bounds[1]) - - return theta_vector - - -class SumKernel(CombineKernel): - def __init__(self, *kernels, **kwargs): - super().__init__("sum", *kernels, **kwargs) - - def forward_t( - self, - weights: torch.Tensor, - gr2: list, - x2=None, - gr1: list = None, - x1=None, - feature_lengthscale=None, - ): - """ - Compute the kernel gradient w.r.t the feature vector - Parameters - ---------- - feature_lengthscale - x2 - x1 - gr1 - weights - gr2 - - Returns ------- grads: k list of 2-tuple. (K, x2) where K is the weighted Gram - matrix of that matrix, x2 is the leaf variable on which Jacobian-vector product - to be computed. 
- - """ - grads = [] - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels): - handle = k.forward_t(gr2, gr1=gr1) - grads.append((weights[i] * handle[0], handle[1], handle[2])) - elif isinstance(k, Stationary): - key = _select_dimensions(k) - handle = k.forward_t(x2=x2[key], x1=x1[key], l=feature_lengthscale[i]) - grads.append((weights[i] * handle[0], handle[1], handle[2])) - else: - logging.warning( - "Gradient not implemented for kernel type" + str(k.__name__) - ) - grads.append((None, None)) - assert len(grads) == len(self.kernels) - return grads - - -class ProductKernel(CombineKernel): - def __init__(self, *kernels, **kwargs): - super().__init__("product", *kernels, **kwargs) diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py index 2f3d2bf6..086cfc03 100644 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py +++ b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py @@ -195,50 +195,6 @@ class SumKernel(CombineKernel): def __init__(self, *kernels, **kwargs): super().__init__("sum", *kernels, **kwargs) - def forward_t( - self, - weights: torch.Tensor, - gr2: list, - x2=None, - gr1: list = None, - x1=None, - feature_lengthscale=None, - ): - """ - Compute the kernel gradient w.r.t the feature vector - Parameters - ---------- - feature_lengthscale - x2 - x1 - gr1 - weights - gr2 - - Returns - ------- - grads: k list of 2-tuple. - (K, x2) where K is the weighted Gram matrix of that matrix, x2 is the leaf variable on which Jacobian-vector - product to be computed. - - """ - weights = transform_weights(weights.clone()) - grads = [] - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels): - handle = k.forward_t(gr2, gr1=gr1) - grads.append((weights[i] * handle[0], handle[1], handle[2])) - elif isinstance(k, Stationary): - handle = k.forward_t(x2=x2, x1=x1, l=feature_lengthscale) - grads.append((weights[i] * handle[0], handle[1], handle[2])) - else: - logging.warning( - "Gradient not implemented for kernel type" + str(k.__name__) - ) - grads.append((None, None)) - assert len(grads) == len(self.kernels) - return grads - class ProductKernel(CombineKernel): def __init__(self, *kernels, **kwargs): diff --git a/neps/optimizers/bayesian_optimization/kernels/encoding.py b/neps/optimizers/bayesian_optimization/kernels/encoding.py deleted file mode 100644 index 419b6926..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/encoding.py +++ /dev/null @@ -1,277 +0,0 @@ -# Code from https://github.com/xingchenwan/nasbowl - -import networkx as nx -import numpy as np -import torch - -from .graph_kernel import GraphKernels - -INPUT = "input" -OUTPUT = "output" -CONV3X3 = "conv3x3-bn-relu" -CONV1X1 = "conv1x1-bn-relu" -MAXPOOL3X3 = "maxpool3x3" -OPS = [INPUT, CONV3X3, CONV1X1, MAXPOOL3X3, OUTPUT] -OPS_EX = [ - CONV3X3, - CONV1X1, - MAXPOOL3X3, -] - -OPS_201 = ["avg_pool_3x3", "nor_conv_1x1", "nor_conv_3x3", "none", "skip_connect"] - -NUM_VERTICES = 7 -OP_SPOTS = NUM_VERTICES - 2 -MAX_EDGES = 9 - - -def get_op_list(string): - # given a string, get the list of operations - tokens = string.split("|") - ops = [t.split("~")[0] for i, t in enumerate(tokens) if i not in [0, 2, 5, 9]] - return ops - - -def edit_distance(g1, g2): - g1_ops = get_op_list(g1.name) - g2_ops = get_op_list(g2.name) - return np.sum([1 for i in range(len(g1_ops)) if g1_ops[i] != g2_ops[i]]) - - -class NASBOTDistance(GraphKernels): - """NASBOT 
OATMANN distance according to BANANAS paper""" - - def __init__( - self, - node_name="op_name", - include_op_list=None, - exclude_op_list=None, - lengthscale=3.0, - normalize=True, - max_size=None, - **kwargs, - ): - super().__init__(**kwargs) - self.node_name = node_name - self.include_op_list = include_op_list if include_op_list is not None else OPS - self.exclude_op_list = exclude_op_list if exclude_op_list is not None else [] - self.normalize = normalize - self.lengthscale = lengthscale - self.max_size = max_size - self._gram = None - - def _compute_kernel(self, dist, l=None): - if dist is None: - return 0.0 - if l is None: - l = self.lengthscale - return np.exp(-dist / (l**2)) - - def _compute_dist( - self, - g1: nx.Graph, - g2: nx.Graph, - ): - # if cell-based nasbench201 - if "~" in g1.name: - g1_ops = get_op_list(g1.name) - g2_ops = get_op_list(g2.name) - - g1_counts = [g1_ops.count(op) for op in OPS_201] - g2_counts = [g2_ops.count(op) for op in OPS_201] - ops_dist = np.sum(np.abs(np.subtract(g1_counts, g2_counts))) - edit_dist = edit_distance(g1, g2) - return ops_dist + edit_dist - else: - # adjacency matrices - a1 = nx.to_numpy_array(g1) - a2 = nx.to_numpy_array(g2) - row_sums = sorted(np.array(a1).sum(axis=0)) - col_sums = sorted(np.array(a1).sum(axis=1)) - - other_row_sums = sorted(np.array(a2).sum(axis=0)) - other_col_sums = sorted(np.array(a2).sum(axis=1)) - - row_sums_arr = np.atleast_2d(row_sums) - col_sums_arr = np.atleast_2d(col_sums) - - other_row_sums_arr = np.atleast_2d(other_row_sums) - other_col_sums_arr = np.atleast_2d(other_col_sums) - row_dist = np.sum( - np.abs(np.diag(np.subtract(row_sums_arr, other_row_sums_arr.T))) - ) - col_dist = np.sum( - np.abs(np.diag(np.subtract(col_sums_arr, other_col_sums_arr.T))) - ) - counts = [0] * len(self.include_op_list) - other_counts = [0] * len(self.include_op_list) - for _, attrs in g1.nodes(data=True): - op_name = attrs[self.node_name] - if op_name not in self.exclude_op_list: - idx = self.include_op_list.index(op_name) - counts[idx] += 1 - for _, attrs in g2.nodes(data=True): - op_name = attrs[self.node_name] - if op_name not in self.exclude_op_list: - idx = self.include_op_list.index(op_name) - other_counts[idx] += 1 - - ops_dist = np.sum(np.abs(np.subtract(counts, other_counts))) - return (row_dist + col_dist + ops_dist) + 0.0 - - def forward(self, *graphs: nx.Graph, l: float = None): - n = len(graphs) - K = torch.zeros((n, n)) - for i in range(n): - for j in range(i, n): - K[i, j] = self._compute_kernel( - self._compute_dist(graphs[i], graphs[j]), l - ) - K[j, i] = K[i, j] - if self.normalize: - K = self.normalize_gram(K) - return K - - def fit_transform( - self, - gr: list, - l: float = None, - rebuild_model: bool = False, - save_gram_matrix: bool = False, - **kwargs, - ): - if not rebuild_model and self._gram is not None: - return self._gram - K = self.forward(*gr, l=l) - if save_gram_matrix: - self._gram = K.clone() - self._train_x = gr[:] - return K - - def transform(self, gr: list, l: float = None, **kwargs): - if self._gram is None: - raise ValueError("The kernel has not been fitted. 
Run fit_transform first") - n = len(gr) - K = torch.zeros((len(self._train_x), n)) - for i, _ in enumerate(self._train_x): - for j in range(n): - K[i, j] = self._compute_kernel( - self._compute_dist(self._train_x[i], gr[j]), l - ) - return K - - -class AdjacencyDistance( - NASBOTDistance, -): - def _compute_dist(self, g1: nx.Graph, g2: nx.Graph): - # adjacency matrices - a1 = nx.to_numpy_array(g1) - a2 = nx.to_numpy_array(g2) - x1 = np.array([attrs[self.node_name] for node, attrs in g1.nodes(data=True)]) - x2 = np.array([attrs[self.node_name] for node, attrs in g2.nodes(data=True)]) - graph_dist = np.sum(a1 != a2) - ops_dist = np.sum(x1 != x2) - return (graph_dist + ops_dist) + 0.0 - - -class PathDistance(NASBOTDistance): - def get_paths(self, g: nx.Graph): - """ - return all paths from input to output - """ - paths: list = [] - matrix = nx.to_numpy_array(g) - ops: list = [] - for _, attr in g.nodes(data=True): - ops.append(attr[self.node_name]) - for j in range(0, NUM_VERTICES): - if matrix[0][j]: - paths.append([[]]) - else: - paths.append([]) - - # create paths sequentially - for i in range(1, NUM_VERTICES - 1): - for j in range(1, NUM_VERTICES): - if matrix[i][j]: - for path in paths[i]: - paths[j].append([*path, ops[i]]) - return paths[-1] - - def get_path_indices(self, g: nx.Graph): - """ - compute the index of each path - There are 3^0 + ... + 3^5 paths total. - (Paths can be length 0 to 5, and for each path, for each node, there - are three choices for the operation.) - """ - paths = self.get_paths(g) - mapping = {CONV3X3: 0, CONV1X1: 1, MAXPOOL3X3: 2} - path_indices = [] - - for path in paths: - index = 0 - for i in range(NUM_VERTICES - 1): - if i == len(path): - path_indices.append(index) - break - else: - index += len(OPS_EX) ** i * (mapping[path[i]] + 1) - - return tuple(path_indices) - - @staticmethod - def get_paths_201(g: nx.Graph): - """ - return all paths from input to output - """ - path_blueprints = [[3], [0, 4], [1, 5], [0, 2, 5]] - ops = get_op_list(g.name) - paths = [] - for blueprint in path_blueprints: - paths.append([ops[node] for node in blueprint]) - - return paths - - def get_path_indices_201(self, g: nx.Graph): - """ - compute the index of each path - """ - paths = self.get_paths_201(g) - path_indices = [] - NUM_OPS = len(OPS_201) - for i, path in enumerate(paths): - if i == 0: - index = 0 - elif i in [1, 2]: - index = NUM_OPS - else: - index = NUM_OPS + NUM_OPS**2 - for j, op in enumerate(path): - index += OPS_201.index(op) * NUM_OPS**j - path_indices.append(index) - - return tuple(path_indices) - - def encode_paths(self, g: nx.Graph): - """output one-hot encoding of paths""" - if "~" in g.name: - LONGEST_PATH_LENGTH = 3 - num_paths = sum(len(OPS_201) ** i for i in range(1, LONGEST_PATH_LENGTH + 1)) - path_indices = self.get_path_indices_201(g) - elif "101" in g.name: - num_paths = sum(len(OPS_EX) ** i for i in range(OP_SPOTS + 1)) - path_indices = self.get_path_indices(g) - else: - num_paths = sum(len(self.op_list) ** i for i in range(self.max_size - 1)) - path_indices = self.get_paths(g) - path_encoding = np.zeros(num_paths) - for index in path_indices: - path_encoding[index] = 1 - return path_encoding - - def _compute_dist(self, g1: nx.Graph, g2: nx.Graph): - encode1 = self.encode_paths(g1) - encode2 = self.encode_paths(g2) - return np.sum(np.array(encode1 != np.array(encode2))) diff --git a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py index f606f442..3ed9b5b9 100644 --- 
a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py @@ -1,40 +1,58 @@ from __future__ import annotations -from neps.utils.common import instance_from_map -from ....search_spaces.architecture.core_graph_grammar import CoreGraphGrammar -from ....search_spaces.hyperparameters.categorical import CategoricalParameter -from ....search_spaces.hyperparameters.float import FloatParameter -from ....search_spaces.hyperparameters.integer import IntegerParameter -from ....utils.common import has_instance -from . import GraphKernelMapping, StationaryKernelMapping - - -def get_kernels( - pipeline_space, domain_se_kernel, graph_kernels, hp_kernels, optimal_assignment -): - if not graph_kernels: - graph_kernels = [] - if has_instance(pipeline_space.values(), CoreGraphGrammar): - graph_kernels.append("wl") - if not hp_kernels: - hp_kernels = [] - if has_instance(pipeline_space.values(), FloatParameter, IntegerParameter): - hp_kernels.append("m52") - if has_instance(pipeline_space.values(), CategoricalParameter): - hp_kernels.append("hm") - graph_kernels = [ - instance_from_map(GraphKernelMapping, kernel, "kernel", as_class=True)( - oa=optimal_assignment, - se_kernel=instance_from_map( - StationaryKernelMapping, domain_se_kernel, "se kernel" - ), +from neps.optimizers.bayesian_optimization.kernels import Kernel +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( + HammingKernel, + Matern52Kernel, +) +import torch +from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import WeisfilerLehman + +from neps.search_spaces import SearchSpace + + +# TODO: Option to combine numerical and categorical into one. +def get_default_kernels( + *, + space: SearchSpace, + optimizable: bool = True, +) -> list[tuple[Kernel, list[str]],]: + kernels: list[tuple[Kernel, list[str]]] = [] + if any(space.graphs): + h = 2 + if optimizable: + layer_weights = torch.nn.Parameter(torch.ones(h + 1)) + else: + layer_weights = None + + kernels.append( + ( + WeisfilerLehman(h=2, layer_weights=layer_weights, oa=True), + list(space.graphs.keys()), + ) ) - for kernel in graph_kernels - ] - hp_kernels = [ - instance_from_map(StationaryKernelMapping, kernel, "kernel") - for kernel in hp_kernels - ] - if not graph_kernels and not hp_kernels: - raise ValueError("No kernels are provided!") - return graph_kernels, hp_kernels + + if any(space.categoricals): + if optimizable: + lengthscales = torch.nn.Parameter(torch.ones(len(space.categoricals))) + else: + lengthscales = torch.ones(len(space.categoricals)) + + kernels.append( + ( + HammingKernel(lengthscale=lengthscales), + list(space.categoricals.keys()), + ) + ) + + if any(space.numerical): + if optimizable: + lengthscales = torch.nn.Parameter(torch.ones(len(space.numerical))) + else: + lengthscales = torch.ones(len(space.numerical)) + + kernels.append( + (Matern52Kernel(lengthscale=lengthscales), list(space.numerical.keys())) + ) + + return kernels diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py index e69de29b..ac1c60ad 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py @@ -0,0 +1,8 @@ +from neps.optimizers.bayesian_optimization.kernels.grakel_replace.vertex_histogram import ( + VertexHistogram, +) +from 
neps.optimizers.bayesian_optimization.kernels.grakel_replace.weisfeiler_lehman import (
+    WeisfeilerLehman,
+)
+
+__all__ = ["VertexHistogram", "WeisfeilerLehman"]
diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py
index 285b067c..103818ae 100644
--- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py
+++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py
@@ -1,4 +1,5 @@
 """The vertex kernel as defined in :cite:`sugiyama2015halting`."""
+
 import logging
 from collections import Counter
 from collections.abc import Iterable
@@ -54,7 +55,7 @@ def __init__(
         sparse="auto",
         oa=False,
         mahalanobis_precision=None,
-        se_kernel: Stationary = None,
+        se_kernel: Stationary | None = None,
         requires_ordered_features: bool = False,
         as_tensor: bool = True,
     ):
@@ -75,6 +76,7 @@ def __init__(
                 self.sparse = False
             else:
                 self.sparse = sparse
+        self.oa = oa
         self.se_kernel = se_kernel
         self._initialized.update({"sparse": True})

@@ -220,8 +222,11 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None):
                 except MemoryError:
                     warn("memory-error: switching to sparse")
-                    self.sparse_, features = True, csr_matrix(
-                        (data, (rows, cols)), shape=(ni, label_length), copy=False
+                    self.sparse_, features = (
+                        True,
+                        csr_matrix(
+                            (data, (rows, cols)), shape=(ni, label_length), copy=False
+                        ),
                     )

         if ni == 0:
@@ -257,7 +262,7 @@ def _calculate_kernel_matrix(self, Y=None):
                         K[j, i] = K[i, j]
             else:
                 if self.se_kernel is not None:
-                    K = self.se_kernel.forward(self.X, self.X)
+                    K = self.se_kernel._forward(self.X, self.X)
                 else:
                     K = self.X @ self.X.T
         else:
@@ -270,7 +275,7 @@ def _calculate_kernel_matrix(self, Y=None):
                     )
             else:
                 if self.se_kernel is not None:
-                    K = self.se_kernel.forward(self.X, Y)
+                    K = self.se_kernel._forward(self.X, Y)
                 else:
                     K = Y[:, : self.X.shape[1]] @ self.X.T

diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py
index 890c2c8d..dd5dd829 100644
--- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py
+++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py
@@ -18,7 +18,6 @@
 from .vertex_histogram import VertexHistogram

-warnings.filterwarnings("ignore", message="Importing from numpy.matlib is deprecated ")

 class WeisfeilerLehman(Kernel):
     """Compute the Weisfeiler Lehman Kernel.
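# Editor's illustrative sketch, not part of this patch: how a vertex-histogram
# style base kernel turns per-graph node-label counts into a Gram matrix, with
# the plain linear product as the default and a histogram-intersection style
# score standing in for the optimal-assignment ("oa") variant. The function
# below is hypothetical; the real implementation is grakel_replace/vertex_histogram.py.
from collections import Counter

import torch


def vertex_histogram_gram(graph_labels: list[list[str]], oa: bool = False) -> torch.Tensor:
    """graph_labels[i] is the multiset of node labels of graph i."""
    vocab = sorted({lab for labels in graph_labels for lab in labels})
    index = {lab: j for j, lab in enumerate(vocab)}

    # Feature matrix: one row per graph, one column per observed label.
    X = torch.zeros(len(graph_labels), len(vocab), dtype=torch.float64)
    for i, labels in enumerate(graph_labels):
        for lab, count in Counter(labels).items():
            X[i, index[lab]] = count

    if oa:
        # Optimal-assignment flavour: sum of element-wise minima of the histograms.
        return torch.minimum(X[:, None, :], X[None, :, :]).sum(dim=-1)
    return X @ X.T  # linear kernel on the count features


if __name__ == "__main__":
    K = vertex_histogram_gram([["conv", "conv", "pool"], ["conv", "id"]], oa=True)
    print(K)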
@@ -71,7 +70,7 @@ def __init__( h: int = 5, base_graph_kernel=VertexHistogram, node_weights=None, - layer_weights=None, + layer_weights: torch.Tensor | None = None, as_tensor: bool = True, ): """Initialise a `weisfeiler_lehman` kernel.""" @@ -114,7 +113,7 @@ def initialize(self): if base_graph_kernel is None: base_graph_kernel, params = VertexHistogram, dict() # TODO: make sure we're always passing like this - elif type(base_graph_kernel) is type and issubclass( + elif type(base_graph_kernel) is type and issubclass( # pylint: disable=C0123 base_graph_kernel, Kernel ): params = dict() @@ -129,7 +128,7 @@ def initialize(self): ) from _error if not ( - type(base_graph_kernel) is type + type(base_graph_kernel) is type # pylint: disable=C0123 and issubclass(base_graph_kernel, Kernel) ): raise TypeError( @@ -159,10 +158,10 @@ def initialize(self): self._h = self.h + 1 self._initialized["h"] = True - if self.layer_weights is None or self.layer_weights.shape[0] != self._h: - self.layer_weights = np.ones((self._h,)) - if self.as_tensor and not isinstance(self.layer_weights, torch.Tensor): - self.layer_weights = torch.tensor(self.layer_weights) + if self.layer_weights is None: + self.layer_weights = torch.ones((self._h,)) + else: + assert len(self.layer_weights) == self._h self._initialized["h"] = True self._initialized["layer_weights"] = True @@ -424,9 +423,7 @@ def generate_graphs(label_count: int, WL_labels_inverse): return K, base_graph_kernel return np.sum(K, axis=0), base_graph_kernel - def fit_transform( - self, X: Iterable, y=None, gp_fit: bool = True - ): + def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: disable=unused-argument """Fit and transform, on the same dataset. Parameters diff --git a/neps/optimizers/bayesian_optimization/kernels/graph_kernel.py b/neps/optimizers/bayesian_optimization/kernels/graph_kernel.py deleted file mode 100644 index b9d10102..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/graph_kernel.py +++ /dev/null @@ -1,35 +0,0 @@ -import torch - - -class GraphKernels: - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.n_hyperparameters = 0 - self.rbf_lengthscale = False - self.kern = None - self.__name__ = "GraphKernelBase" - - @staticmethod - def normalize_gram(K: torch.Tensor): - K_diag = torch.sqrt(torch.diag(K)) - K_diag_outer = torch.ger(K_diag, K_diag) - return K / K_diag_outer - - def fit_transform( - self, gr: list, rebuild_model=False, save_gram_matrix=False, **kwargs - ): - raise NotImplementedError - - def transform( - self, - gr: list, - ): - raise NotImplementedError - - def forward_t(self, gr2, gr1: list = None): - """ - Compute the derivative of the kernel function k(phi, phi*) with respect to phi* (the training point) - """ - raise NotImplementedError( - "The kernel gradient is not implemented for the graph kernel called!" 
- ) diff --git a/neps/optimizers/bayesian_optimization/kernels/kernel.py b/neps/optimizers/bayesian_optimization/kernels/kernel.py new file mode 100644 index 00000000..52db2751 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/kernels/kernel.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import math +import inspect +import copy +from typing import TypeVar, Generic, Any, Sequence, Mapping, Callable +from typing_extensions import Self +import torch +import torch.nn as nn + +from neps.utils.types import NotSet + +T = TypeVar("T") + + +class Kernel(nn.Module, Generic[T]): + def fit_transform(self, x: T) -> torch.Tensor: + raise NotImplementedError + + def transform(self, x: T) -> torch.Tensor: + raise NotImplementedError + + def clone(self) -> Self: + return self.clone_with() + + def clone_with(self, **params: dict[str, Any]) -> Self: + # h ttps://github.com/scikit-learn/scikit-learn/blob/70fdc843a4b8182d97a3508c1a426acc5e87e980/sklearn/base.py#L197 + sig = inspect.signature(self.__init__) + + self_values = {} + for p in sig.parameters.values(): + if p.name == "self": + continue + + attr = getattr(self, p.name, NotSet) + if attr is NotSet: + raise ValueError( + f"Could not clone as the variable {p.name} was not set in" + f" the constructor on the object: {self}" + ) + self_values[p.name] = params.get(p.name, attr) + + new_self_values = copy.deepcopy(self_values) + return self.__class__(**new_self_values) + + def grid_search( + self, + x: T, + *, + grid: Sequence[Mapping[str, Any]], + to_minimize: Callable[[torch.Tensor], float], + ) -> tuple[Self, float]: + if len(grid) == 0: + raise ValueError("Grid must have at least one element.") + + def _fit_and_eval(_params: Mapping[str, Any]) -> tuple[Kernel[T], float]: + cloned_kernel = self.clone_with(**_params) + K = cloned_kernel.fit_transform(x) + metric = to_minimize(K) + return cloned_kernel, metric + + return min( + (_fit_and_eval(params) for params in grid), + key=lambda x: x[1], + ) + + +class NumericKernel(Kernel[torch.Tensor]): ... + + +PI = torch.tensor(math.pi) + + +def compute_normalized_log_marginal_likelihood( + K_i: torch.Tensor, + logDetK: torch.Tensor, + y: torch.Tensor, +) -> torch.Tensor: + """Compute the zero mean Gaussian process log marginal likelihood + given the inverse of Gram matrix K(x2,x2), its log determinant, + and the training label vector y. + """ + lml = -0.5 * (y.t() @ K_i @ y) + 0.5 * logDetK - y.shape[0] / 2.0 * torch.log(2 * PI) + return lml / y.shape[0] + + +def compute_pd_inverse( + K: torch.Tensor, + *, + jitter: float | torch.Tensor = 1e-9, + attempts: int = 3, +) -> tuple[torch.Tensor, torch.Tensor]: + """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" + n = K.shape[0] + assert ( + isinstance(jitter, float) or jitter.ndim == 0 + ), "only homoscedastic noise variance is allowed here!" 
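# Editor's illustrative usage sketch, not part of this patch, for the
# Kernel.clone_with / grid_search API introduced above: each grid entry is a
# mapping of constructor arguments, the kernel is cloned with those values,
# fit on the data, and scored by a user-supplied `to_minimize` callable.
# The toy kernel and the scoring criterion below are made up to mimic the interface.
import torch


class ToyKernel:
    def __init__(self, lengthscale: float = 1.0):
        self.lengthscale = lengthscale

    def clone_with(self, **params):
        return ToyKernel(lengthscale=params.get("lengthscale", self.lengthscale))

    def fit_transform(self, x: torch.Tensor) -> torch.Tensor:
        d = torch.cdist(x, x) / self.lengthscale
        return torch.exp(-0.5 * d**2)

    def grid_search(self, x, *, grid, to_minimize):
        scored = [
            (cloned, to_minimize(cloned.fit_transform(x)))
            for cloned in (self.clone_with(**params) for params in grid)
        ]
        return min(scored, key=lambda pair: pair[1])


x = torch.randn(5, 2, dtype=torch.float64)
grid = [{"lengthscale": l} for l in (0.1, 1.0, 10.0)]
# Deviation from identity stands in here for the negative log marginal
# likelihood that the real code minimises.
best, score = ToyKernel().grid_search(
    x,
    grid=grid,
    to_minimize=lambda K: torch.norm(K - torch.eye(5, dtype=torch.float64)).item(),
)
print(best.lengthscale, score)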
+ for i in range(attempts): + try: + jitter_diag = jitter * torch.eye(n, device=K.device) * 10**i + Kc = torch.linalg.cholesky(K + jitter_diag) + break + except RuntimeError: + pass + else: + raise RuntimeError(f"Gram matrix not positive definite despite of jitter:\n{K}") + + logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) + K_i = torch.cholesky_inverse(Kc) + return K_i.to(dtype=torch.float64), logDetK.to(dtype=torch.float64) diff --git a/neps/optimizers/bayesian_optimization/kernels/utils.py b/neps/optimizers/bayesian_optimization/kernels/utils.py index 92ee1817..e134bfd0 100644 --- a/neps/optimizers/bayesian_optimization/kernels/utils.py +++ b/neps/optimizers/bayesian_optimization/kernels/utils.py @@ -33,16 +33,22 @@ def extract_configs(configs: list[SearchSpace]) -> Tuple[list, list]: """ config_hps = [conf.get_normalized_hp_categories() for conf in configs] graphs = [hps["graphs"] for hps in config_hps] + # Don't call np.array on structured objects # https://github.com/numpy/numpy/issues/24546#issuecomment-1693913119 # _nested_graphs = np.array(graphs, dtype=object) # if _nested_graphs.ndim == 3 # graphs = _nested_graphs[:, :, 0].reshape(-1).tolist() # Long hand way of doing the above - if (len(graphs) > 0 and isinstance(graphs[0], list) - and len(graphs[0]) > 0 and isinstance(graphs[0][0], list)): - res = [_list for list_of_list in graphs for _list in list_of_list] - graphs = res + # I guess this is just flattening... + if ( + len(graphs) > 0 + and isinstance(graphs[0], list) + and len(graphs[0]) > 0 + and isinstance(graphs[0][0], list) + ): + graphs = [_list for list_of_list in graphs for _list in list_of_list] + return graphs, config_hps diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index 9d0d4df2..8e7a1074 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,149 +1,80 @@ from __future__ import annotations -from copy import deepcopy + from math import sqrt -from dataclasses import dataclass, field from typing_extensions import override +from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel import numpy as np import torch -LENGTHSCALE_BOUNDS_DEFAULT = ( - np.exp(-6.754111155189306), - np.exp(0.0858637988771976), -) +DEFAULT_LENGTHSCALE_BOUNDS = np.exp(-6.754111155189306), np.exp(0.0858637988771976) -@dataclass -class Stationary: +class Stationary(Kernel[torch.Tensor]): """Here we follow the structure of GPy to build a sub class of stationary kernel. - All the classes (i.e. the class of stationary kernel_operators) derived from this - class use the scaled distance to compute the Gram matrix.""" - - # A single value applies to all dimensions, a vector applies to each dimension - lengthscale: torch.Tensor = field(default_factory=lambda: torch.tensor(1.0)) - lengthscale_bounds: tuple[float, float] = LENGTHSCALE_BOUNDS_DEFAULT - outputscale: float = 1.0 - gram_: torch.Tensor | None = None - train_: torch.Tensor | None = None - - def forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - return _scaled_distance(lengthscale, x1, x2) + All the classes (i.e. the class of stationary kernel_operators) derived from this + class use the scaled distance to compute the Gram matrix. 
+ """ - def fit_transform( + def __init__( self, - x1, - l: torch.Tensor | None = None, - rebuild_model: bool = True, - save_gram_matrix: bool = True, - ) -> torch.Tensor: - if not rebuild_model and self.gram_ is not None: - return self.gram_ - K = self.forward(x1, l=l) - if save_gram_matrix: - self.train_ = deepcopy(x1) - assert isinstance(K, torch.Tensor), "it doesnt work with np arrays.." - self.gram_ = K.clone() + *, + lengthscale: torch.Tensor, + outputscale: float | torch.Tensor = 1.0, + lengthscale_bounds: tuple[float, float] = DEFAULT_LENGTHSCALE_BOUNDS, + ): + self.lengthscale = lengthscale + self.outputscale = outputscale + self.lengthscale_bounds = lengthscale_bounds + + self.gram_: torch.Tensor | None = None + self.train_: torch.Tensor | None = None + + def fit_transform(self, x: torch.Tensor) -> torch.Tensor: + K = self._forward(x) + self.train_ = x.clone().detach() return K - def transform(self, x1, l: torch.Tensor | None = None) -> torch.Tensor: - if self.gram_ is None or self.train_ is None: + def transform(self, x: torch.Tensor) -> torch.Tensor: + if self.train_ is None: raise ValueError("The kernel has not been fitted. Run fit_transform first") - return self.forward(self.train_, x1, l=l) + return self._forward(self.train_, x) - def forward_t( - self, - x2: torch.Tensor, - x1: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor]: - if x1 is None: - x1 = torch.tensor(self.train_) - x2 = torch.tensor(x2, requires_grad=True) - K = self.forward(x1, x2, l) - return K, x2 - - def update_hyperparameters(self, lengthscale: torch.Tensor) -> None: - self.lengthscale = torch.clamp(lengthscale, *self.lengthscale_bounds) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: + return _scaled_distance(self.lengthscale, x1, x2) -@dataclass class RBFKernel(Stationary): @override - def forward( + def _forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) + dist_sq = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) return self.outputscale * torch.exp(-0.5 * dist_sq) -@dataclass -class LayeredRBFKernel(RBFKernel): - """ - Same as the conventional RBF kernel, but adapted in a way as a midway between - spherical RBF and ARD RBF. In this case, one weight is assigned to each - Weisfiler-Lehman iteration only (e.g. one weight for h=0, another for h=1 and etc.) 
- """ - - @override - def forward( - self, - ard_dims: torch.Tensor, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> torch.Tensor: - _l = l if l is not None else self.lengthscale - assert isinstance(_l, torch.Tensor), "Lengthscale must be a torch tensor" - assert _l.shape[0] == ard_dims.shape[0], ( - "LayeredRBF expects the lengthscale vector to have the same " - "dimensionality as the " - "number of WL iterations, but got lengthscale vector of shape" - + str(_l.shape[0]) - + "and WL iteration of shape " - + str(ard_dims.shape[0]) - ) - - M = torch.cat( - [torch.ones(int(ard_dims[i])) * _l[i] for i in range(len(ard_dims))] - ) - return super().forward(x1, x2, M) - - -@dataclass class Matern32Kernel(Stationary): @override - def forward( + def _forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist = _scaled_distance(lengthscale, x1, x2) + dist = _scaled_distance(self.lengthscale, x1, x2) return self.outputscale * (1 + sqrt(3.0) * dist) * torch.exp(-sqrt(3.0) * dist) class Matern52Kernel(Stationary): @override - def forward( + def _forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist = _scaled_distance(lengthscale, x1, x2, sq_dist=True) + dist = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) return ( self.outputscale * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * dist) @@ -151,36 +82,6 @@ def forward( ) -@dataclass -class HammingKernel(Stationary): - @override - def forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist = _hamming_distance(lengthscale, x1, x2) - return self.outputscale * dist - - -@dataclass -class RationalQuadraticKernel(Stationary): - power: float = 2.0 - - @override - def forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) - return self.outputscale * (1 + dist_sq / 2.0) ** (-self.power) - - def _unscaled_square_distance( X: torch.Tensor, X2: torch.Tensor | None = None, diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py index e6550d65..b1d4cd7e 100644 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py @@ -1,335 +1,81 @@ -import logging +from __future__ import annotations -import numpy as np import torch -from grakel.kernels import ShortestPathAttr -from grakel.utils import graph_from_networkx -from .grakel_replace.edge_histogram import EdgeHistogram -from .grakel_replace.utils import calculate_kernel_matrix_as_tensor -from .grakel_replace.vertex_histogram import VertexHistogram -from .grakel_replace.weisfeiler_lehman import WeisfeilerLehman as _WL -from .graph_kernel import GraphKernels -from .utils import transform_to_undirected -from .vectorial_kernels import Stationary +from typing import Sequence +from neps.optimizers.bayesian_optimization.kernels.grakel_replace import ( + VertexHistogram, + WeisfeilerLehman as _WL, +) +from 
neps.optimizers.bayesian_optimization.kernels.kernel import Kernel +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary +from neps.search_spaces.encoding import WLInput -class WeisfilerLehman(GraphKernels): +class WeisfilerLehman(Kernel[Sequence[WLInput]]): """Weisfiler Lehman kernel using grakel functions""" def __init__( self, h: int = 0, - base_type: str = "subtree", - se_kernel: Stationary = None, - layer_weights=None, - node_weights=None, + se_kernel: Stationary | None = None, + layer_weights: torch.Tensor | None = None, oa: bool = False, node_label: str = "op_name", - edge_label: tuple = "op_name", - n_jobs: int = None, - return_tensor: bool = True, - requires_grad: bool = False, - undirected: bool = False, - **kwargs, ): + """Initializes the Weisfeiler-Lehman kernel. + + Args: + h: The number of Weisfeiler-Lehman iterations + se_kernel: defines a stationary vector kernel to be used for + successive embedding (i.e. the kernel function on which the + vector embedding inner products are computed). + If None, uses the default linear kernel + layer_weights: The weights for each layer of the Weisfeiler-Lehman kernel. + If None, uses uniform + oa: whether the optimal assignment variant of the Weisfiler-Lehman + kernel should be used + node_label: the node_label defining the key node attribute. """ - - Parameters - ---------- - h: int: The number of Weisfeiler-Lehman iterations - base_type: str: defines the base kernel of WL iteration. Possible types are 'subtree' (default), 'sp': shortest path - and 'edge' (The latter two are untested) - se_kernel: Stationary. defines a stationary vector kernel to be used for successive embedding (i.e. the kernel - function on which the vector embedding inner products are computed). if None, use the default linear kernel - node_weights - oa: whether the optimal assignment variant of the Weisfiler-Lehman kernel should be used - node_label: the node_label defining the key node attribute. - edge_label: the edge label defining the key edge attribute. only relevant when base_type == 'edge' - n_jobs: Parallisation to be used. *current version does not support parallel computing' - return_tensor: whether return a torch tensor. If False, a numpy array will be returned. - kwargs - """ - super().__init__(**kwargs) if se_kernel is not None and oa: raise ValueError( "Only one or none of se (successive embedding) and oa (optimal assignment) may be true!" 
) + self.h = h + self.se_kernel = se_kernel + self.layer_weights = layer_weights self.oa = oa self.node_label = node_label - self.edge_label = edge_label - self.layer_weights = layer_weights - self.se = se_kernel - self.requires_grad = requires_grad - self.undirected = undirected + if node_label != "op_name": + raise NotImplementedError("Only 'op_name' is supported for node_label") - if base_type not in ["subtree", "sp", "edge"]: - raise ValueError(f"Invalid value for base_type ({base_type})") - if base_type == "subtree": - base_kernel = VertexHistogram, { - "sparse": False, - "requires_ordered_features": requires_grad, - } - if oa: - base_kernel = VertexHistogram, { - "oa": True, - "sparse": False, - "requires_ordered_features": requires_grad, - } - elif se_kernel is not None: - base_kernel = VertexHistogram, { - "se_kernel": se_kernel, - "sparse": False, - "requires_ordered_features": requires_grad, - } - elif base_type == "edge": - base_kernel = EdgeHistogram, {"sparse": False} - if oa: - base_kernel = EdgeHistogram, { - "oa": True, - "sparse": False, - "requires_ordered_features": requires_grad, - } - elif se_kernel is not None: - base_kernel = EdgeHistogram, { - "se_kernel": se_kernel, - "sparse": False, - "requires_ordered_features": requires_grad, - } + self.wl_kernel_: _WL | None = None - elif base_type == "sp": - base_kernel = ShortestPathAttr, {} - else: - raise NotImplementedError( - "The selected WL base kernel type" - + str(base_type) - + " is not implemented." - ) - self.base_type = base_type - self.kern = _WL( - n_jobs, - h=h, - base_graph_kernel=base_kernel, - normalize=True, + def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + self.wl_kernel_ = _WL( + h=self.h, + base_graph_kernel=( # type: ignore + VertexHistogram, + { + "sparse": False, + "se_kernel": self.se_kernel, + "oa": self.oa, + "requires_ordered_features": True, + }, + ), layer_weights=self.layer_weights, - node_weights=node_weights, - ) - self.return_tensor = return_tensor - self._gram = None - self._train, self._train_transformed = None, None - self.__name__ = "WeisfeilerLehman" - - def change_se_params(self, params: dict): - """Change the kernel parameter of the successive embedding kernel.""" - if self.se is None: - logging.warning("SE kernel is None. change_se_params action voided.") - return - for k, v in params.items(): - try: - setattr(self.se, k, v) - except AttributeError: - logging.warning( - str(k) + " is not a valid attribute name of the SE kernel." 
- ) - continue - self.kern.change_se_kernel(self.se) - - def get_info_se_kernel(self): - return self.se.lengthscale, self.kern.X[0].X.shape[1] - - def change_kernel_params(self, params: dict): - for k, v in params.items(): - try: - getattr(self.kern, k) - setattr(self.kern, k, v) - except AttributeError: - logging.warning(str(k) + " is not a valid attribute name of this kernel.") - continue - try: - setattr(self, k, v) - except AttributeError: - pass - for k in self.kern._initialized.keys(): - self.kern._initialized[k] = False - - self.kern.initialize() - - def fit_transform( - self, - gr: list, - rebuild_model: bool = False, - save_gram_matrix: bool = True, - layer_weights=None, - gp_fit: bool = True, - **kwargs, - ): - # Transform into GraKeL graph format - if rebuild_model is False and self._gram is not None: - return self._gram - if self.undirected: - gr = transform_to_undirected(gr) - if self.base_type == "edge": - if not all([g.graph_type == "edge_attr" for g in gr]): - raise ValueError( - "One or more graphs passed are not edge-attributed graphs. You need all graphs to be" - "in edge format to use 'edge' type Weisfiler-Lehman kernel." - ) - - gr_ = list(graph_from_networkx(gr, self.node_label, self.edge_label)) - else: - gr_ = list( - graph_from_networkx( - gr, - self.node_label, - ) - ) - - if rebuild_model or self._gram is None: - self._train = gr[:] - self._train_transformed = gr_[:] - - if layer_weights is not None and layer_weights is not self.layer_weights: - self.change_kernel_params({"layer_weights": layer_weights}) - self.layer_weights = layer_weights - - K = self.kern.fit_transform(gr_, gp_fit=gp_fit) - if self.return_tensor and not isinstance(K, torch.Tensor): - K = torch.tensor(K) - if save_gram_matrix: - self._gram = K.clone() - self.layer_weights = self.kern.layer_weights - return K - - def transform( - self, - gr: list, - ): - """transpose: by default, the grakel produces output in shape of len(y) * len(x2). Use transpose to - reshape that to a more conventional shape..""" - if self.undirected: - gr = transform_to_undirected(gr) - if self.base_type == "edge": - if not all([g.graph_type == "edge_attr" for g in gr]): - raise ValueError( - "One or more graphs passed are not edge-attributed graphs. You need all graphs to be" - "in edge format to use 'edge' type Weisfiler-Lehman kernel." - ) - gr_ = graph_from_networkx(gr, self.node_label, self.edge_label) - else: - gr_ = graph_from_networkx( - gr, - self.node_label, - ) - - K = self.kern.transform(gr_) - if self.return_tensor and not isinstance(K, torch.Tensor): - K = torch.tensor(K) - return K - - def forward_t(self, gr2, gr1=None): - """ - Forward pass, but in tensor format. - - Parameters - ---------- - gr1: single networkx graph - - Returns - ------- - K: the kernel matrix - x2 or y: the leaf variable(s) with requires_grad enabled. - This allows future Jacobian-vector product to be efficiently computed. 
- """ - if self.undirected: - gr2 = transform_to_undirected(gr2) - - # Convert into GraKel compatible graph format - if self.base_type == "edge": - gr2 = graph_from_networkx(gr2, self.node_label, self.edge_label) - else: - gr2 = graph_from_networkx(gr2, self.node_label) - - if gr1 is None: - gr1 = self._train_transformed - else: - if self.undirected: - gr1 = transform_to_undirected(gr1) - if self.base_type == "edge": - gr1 = graph_from_networkx(gr1, self.node_label, self.edge_label) - else: - gr1 = graph_from_networkx(gr1, self.node_label) - - x_ = torch.tensor( - np.concatenate(self.kern.transform(gr1, return_embedding_only=True), axis=1) - ) - y_ = torch.tensor( - np.concatenate(self.kern.transform(gr2, return_embedding_only=True), axis=1) + normalize=True, ) - # Note that the vector length of the WL procedure is indeterminate, and thus dim(Y) != dim(X) in general. - # However, since the newly observed features in the test data is always concatenated at the end of the feature - # matrix, these features will not matter for the inference, and as such we can safely truncate the feature - # matrix for the test data so that only those appearing in both the training and testing datasets are included. - - x_.requires_grad_() - y_ = y_[:, : x_.shape[1]].requires_grad_() - K = calculate_kernel_matrix_as_tensor(x_, y_, oa=self.oa, se_kernel=self.se) - return K, y_, x_ - - def feature_map(self, flatten=True): - """ - Get the feature map in term of encoding (position in the feature index): the feature string. - Parameters - ---------- - flatten: whether flatten the dict (originally, the result is layered in term of h (the number of WL iterations). - - Returns - ------- - - """ - if not self.requires_grad: - logging.warning( - "Requires_grad flag is off -- in this case, there is risk that the element order in the " - "feature map DOES NOT correspond to the order in the feature matrix. To suppress this warning," - "when initialising the WL kernel, do WeisfilerLehman(requires_grad=True)" - ) - if self._gram is None: - return None - if not flatten: - return self.kern._label_node_attr - else: - res = {} - for _, map_ in self.kern._label_node_attr.items(): - for k, v in map_.items(): - res.update({k: v}) - return res - - def feature_value(self, X_s): - """Given a list of architectures X_s, compute their WL embedding of size N_s x D, where N_s is the length - of the list and D is the number of training set features. + # TODO: This could probably be lifted to the caller + K = self.wl_kernel_.fit_transform(gr) + K = torch.as_tensor(K, dtype=torch.float64) + self.layer_weights_ = self.wl_kernel_.layer_weights + return torch.as_tensor(K, dtype=torch.float64) - Returns: - embedding: torch.Tensor of shape N_s x D, described above - names: list of shape D, which has 1-to-1 correspondence to each element of the embedding matrix above - """ - if not self.requires_grad: - logging.warning( - "Requires_grad flag is off -- in this case, there is risk that the element order in the " - "feature map DOES NOT correspond to the order in the feature matrix. 
To suppress this warning," - "when initialising the WL kernel, do WeisfilerLehman(requires_grad=True)" - ) - feat_map = self.feature_map(flatten=False) - len_feat_map = [len(f) for f in feat_map.values()] - X_s = graph_from_networkx( - X_s, - self.node_label, - ) - embedding = self.kern.transform(X_s, return_embedding_only=True) - for j, em in enumerate(embedding): - # Remove some of the spurious features that pop up sometimes - embedding[j] = em[:, : len_feat_map[j]] + def transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + assert self.wl_kernel_ is not None - # Generate the final embedding - embedding = torch.tensor(np.concatenate(embedding, axis=1)) - return embedding, list(self.feature_map(flatten=True).values()) + K = self.wl_kernel_.transform(gr) + return torch.as_tensor(K, dtype=torch.float64) diff --git a/neps/optimizers/bayesian_optimization/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py index 82355ec5..ffc3606f 100644 --- a/neps/optimizers/bayesian_optimization/models/deepGP.py +++ b/neps/optimizers/bayesian_optimization/models/deepGP.py @@ -211,7 +211,7 @@ def encode_configs( # before inserting the one-hot encoding. offset = len(self.numericals) for hp_name, hp in self.categoricals.items(): - budget_tensor = torch.tensor( + cat_tensor = torch.tensor( [config[hp_name]._value_index for config in configs], # type: ignore device=self.device, dtype=torch.float64, @@ -219,7 +219,7 @@ def encode_configs( # .. and insert one-hot encoding (ChatGPT solution, verified locally) portion = x_buffer[:, offset : offset + len(hp.choices)] - portion.scatter_(1, budget_tensor.unsqueeze(1), 1) + portion.scatter_(1, cat_tensor.unsqueeze(1), 1) offset += len(hp.choices) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 6a878748..eef6b771 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,303 +1,267 @@ from __future__ import annotations import logging -from copy import deepcopy -from typing import Iterable, Literal, Sequence, Any - -import numpy as np -import contextlib import torch - -from neps.optimizers.bayesian_optimization.kernels.combine_kernels import ( - ProductKernel, - SumKernel, +import numpy as np +from typing import Literal, Sequence, Any, Mapping +from typing_extensions import Literal +from dataclasses import dataclass, field +from itertools import product + +from neps.optimizers.bayesian_optimization.kernels.kernel import ( + Kernel, + NumericKernel, + compute_normalized_log_marginal_likelihood, + compute_pd_inverse, ) -from neps.optimizers.bayesian_optimization.kernels.graph_kernel import GraphKernels -from neps.optimizers.bayesian_optimization.kernels.utils import extract_configs from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary -from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import WeisfilerLehman +from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import ( + WeisfilerLehman, +) +from neps.search_spaces.encoding import TensorEncodedConfigs from neps.search_spaces.search_space import SearchSpace logger = logging.getLogger(__name__) +f64 = torch.float64 -class ComprehensiveGP: - def __init__( - self, - space: SearchSpace, - graph_kernels: Iterable, - hp_kernels: Iterable, - initial_likelihood: float = 1e-3, - weights: Sequence[float] | torch.Tensor | None = None, - combined_kernel: Literal["sum", "product"] = "sum", - 
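# Editor's illustrative sketch, not part of this patch: the scatter_ trick used
# in encode_configs above to write a one-hot encoding of categorical value
# indices directly into a pre-allocated slice of the feature buffer.
# All sizes and indices below are made up.
import torch

n_configs, n_choices, offset = 4, 3, 2
x_buffer = torch.zeros(n_configs, offset + n_choices, dtype=torch.float64)

# Index of the chosen category for each config, e.g. taken from `_value_index`.
cat_tensor = torch.tensor([0, 2, 1, 2])

portion = x_buffer[:, offset : offset + n_choices]  # a view into the buffer
portion.scatter_(1, cat_tensor.unsqueeze(1), 1)     # in-place one-hot write

print(x_buffer)
# Each row now has a single 1 in the categorical block, at the chosen index.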
surrogate_model_fit_args: dict | None = None, - optimizer_kwargs: dict[str, Any] | None = None, - wl_subtree_candidates: Sequence[int] = (1, 2, 3, 4, 5), - wl_lengthscales: Sequence[float] = tuple(np.e**i for i in range(-2, 3)), - optimize_likelihood: bool = True, - optimizer: Literal["adam", "sgd"] = "adam", - optimizer_iters: int = 20, - max_likelihood: float = 0.01, - optimize_wl_layer_weights: bool = False, - ): - graph_kernels = list(graph_kernels) - hp_kernels = list(hp_kernels) - n_graph_kernels = len(graph_kernels) - n_vector_kernels = len(hp_kernels) - n_kernels = n_graph_kernels + n_vector_kernels - domain_kernels = [*graph_kernels, *hp_kernels] - - fixed_weights = weights is not None - if weights is not None: - if weights is not None: - assert len(weights) == n_kernels, ( - "the weights vector, if supplied, needs to have the same length as " - "the number of kernel_operators!" - ) - init_weights = torch.as_tensor(weights).flatten() - else: - uniform_weight = 1.0 / self.n_kernels - init_weights = torch.full((n_kernels,), uniform_weight, dtype=torch.float64) - if combined_kernel == "product": - _combined_kernel = ProductKernel(*domain_kernels, weights=weights) - elif combined_kernel == "sum": - _combined_kernel = SumKernel(*domain_kernels, weights=weights) - else: - raise NotImplementedError( - f'Combining kernel {combined_kernel} is not yet implemented! Only "sum" ' - f'or "product" are currently supported. ' - ) +GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)], dtype=f64) +GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) + + +def _default_param_grid() -> dict[type[Kernel], list[dict[str, Any]]]: + return { + WeisfilerLehman: [ + {"h": h, "se_kernel": Stationary(lengthscale=l)} + for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) + ] + } - # TODO: Clone only needed while it can act like configurations - self.space = space.clone() - self.init_weights = init_weights - self.fixed_weights = fixed_weights - self.combined_kernel = _combined_kernel - self.initial_likelihood = initial_likelihood - self.surrogate_model_fit_args = surrogate_model_fit_args or {} - self.domain_kernels: list = [*graph_kernels, *hp_kernels] - self.n_kernels: int = len(self.domain_kernels) - self.n_graph_kernels: int = len(graph_kernels) - self.n_vector_kernels: int = len(hp_kernels) - self.optimizer_kwargs = optimizer_kwargs or {"lr": 0.1} - self.optimize_likelihood = optimize_likelihood - self.optimize_wl_layer_weights = optimize_wl_layer_weights - self.optimizer = optimizer - self.optimizer_iters = optimizer_iters - self.max_likelihood = max_likelihood - self.wl_subtree_candidates = wl_subtree_candidates - self.wl_lengthscales = wl_lengthscales - - # Cache the Gram matrix inverse and its log-determinant - self.K_ = None - self.K_i_ = None - self.logDetK_ = None - self.theta_vector_ = None - self.layer_weights_ = None - self.nlml_ = None - self.likelihood_: float | None = None - self.weights_: torch.Tensor | None = None - self.x_configs_: list[SearchSpace] | None = None - self.y_: torch.Tensor | None = None - self.y_normalized_: torch.Tensor | None = None - self.y_mean_: float | None = None - self.y_std_: float | None = None - self.n_: int | None = None - - def fit(self, train_x: list[SearchSpace], train_y: list[float]) -> None: - """Called by self.fit""" - self.x_configs = train_x - self.n_ = len(train_x) - self.y_ = torch.as_tensor(train_y, dtype=torch.float64) + +@dataclass +class ComprehensiveGP: + space: SearchSpace + kernels: Sequence[tuple[Kernel, Sequence[str]]] + 
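# Editor's sketch, not part of this patch: how the default Weisfeiler-Lehman
# hyperparameter grid above expands into concrete candidates, pairing every
# subtree depth h with every lengthscale of the stationary embedding kernel.
# The grid entries are simplified here (plain floats instead of Stationary kernels).
from itertools import product

import numpy as np

subtree_candidates = (1, 2, 3, 4, 5)
lengthscales = [np.e**i for i in range(-2, 3)]

grid = [{"h": h, "lengthscale": l} for h, l in product(subtree_candidates, lengthscales)]
print(len(grid))  # 25 candidates: 5 depths x 5 lengthscales
print(grid[0])    # {'h': 1, 'lengthscale': 0.1353...}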
combined_kernel: Literal["sum", "product"] = "sum" + initial_likelihood: float = 1e-3 + optimize_likelihood: bool = True + max_likelihood: float = 0.01 + optimizer: Literal["adam", "sgd"] = "adam" + optimizer_iters: int = 20 + optimize_wl_layer_weights: bool = False + surrogate_model_fit_args: Mapping[str, Any] = field(default_factory=dict) + optimizer_kwargs: Mapping[str, Any] = field(default_factory=lambda: {"lr": 0.1}) + kernel_hp_grids: Mapping[type[Kernel], Sequence[Mapping[str, Any]]] = field( + default_factory=_default_param_grid + ) + + # Post fit attributes + K_i_: torch.Tensor | None = None + n_train_: int | None = None + likelihood_: float | None = None + y_: torch.Tensor | None = None + y_normalized_: torch.Tensor | None = None + y_mean_: float | None = None + y_std_: float | None = None + optimized_kernels_: ( + list[tuple[NumericKernel | WeisfilerLehman, Sequence[str]]] | None + ) = None + kernel_weights_: torch.Tensor | None = None + + def __post_init__(self): + # TODO: Remove when search space is just definition and does not hold values. + self.space = self.space.clone() + + def fit(self, x: TensorEncodedConfigs, train_y: torch.Tensor) -> None: + # Preprocessing + y_ = torch.as_tensor(train_y, dtype=f64) # TODO: Dunno if I like this silent hack, setting std to 1 if no std - self.y_std_ = s if (s := torch.std(self.y_).item()) != 0 else 1 - self.y_mean_ = torch.mean(self.y_).item() - self.y_normalized_ = (self.y_ - self.y_mean_) / self.y_std_ - - # The Gram matrix of the training data - self.K_i_, self.logDetK_ = None, None - - if len(self.wl_subtree_candidates) > 0: - graphs, _ = extract_configs(self.x_configs) - graph_kernels = [ - k for k in self.domain_kernels if isinstance(k, GraphKernels) - ] - for i, kernel in enumerate(graph_kernels): - if not isinstance(kernel, WeisfilerLehman): - logger.warning(f"No kernel opt. for {type(kernel).__name__}.") - continue - - _xs = ( - [x[i] for x in graphs] - if isinstance(graphs[0], list) - else [x for x in graphs] - ) - _grid_search_wl_kernel( - kernel=kernel, - subtree_candidates=self.wl_subtree_candidates, - train_x=_xs, - train_y=self.y_, - likelihood=self.initial_likelihood, - lengthscales=self.wl_lengthscales, - ) - - weights = self.init_weights.clone() - - if not self.fixed_weights and self.n_kernels > 1: - weights.requires_grad_(True) - - n_cat = len(self.space.categoricals) - n_num = len(self.space.numerical) - theta_categorical = torch.ones( - n_cat, requires_grad=n_cat > 1, dtype=torch.float64 - ) - theta_numerical = torch.ones(n_num, requires_grad=n_num > 1, dtype=torch.float64) + self.y_std_ = s if (s := torch.std(y_).item()) != 0 else 1 + self.y_mean_ = torch.mean(y_).item() + self.y_normalized_ = (y_ - self.y_mean_) / self.y_std_ + + optimized_kernels: list[ + tuple[NumericKernel | WeisfilerLehman, Sequence[str]] + ] = [] + _grids = self.kernel_hp_grids + + def _eval_kernel(_K: torch.Tensor) -> float: + assert y_ is not None + K_i, logDetK = compute_pd_inverse(_K) + nlml = -compute_normalized_log_marginal_likelihood(K_i, logDetK, y_) + return float(nlml) + + for kernel, hps in self.kernels: + if isinstance(kernel, WeisfilerLehman): + assert len(hps) == 1, "Only support single kernel per graph." 
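# Editor's sketch, not part of this patch: combining per-kernel Gram matrices
# the way the fit loop below does -- start from zeros for a "sum" combination
# (ones for "product"), accumulate the weighted Grams in place, then normalise
# so the diagonal becomes 1. The Gram matrices here are simple placeholders.
import torch

f64 = torch.float64
N = 4
grams = [torch.eye(N, dtype=f64) + 0.1 * i for i in range(2)]  # stand-ins for kernel outputs
weights = torch.tensor([0.7, 0.3], dtype=f64)
combined = "sum"

K = torch.zeros(N, N, dtype=f64) if combined == "sum" else torch.ones(N, N, dtype=f64)
for gram, w in zip(grams, weights):
    if combined == "sum":
        K.add_(w * gram)
    else:
        K.mul_(w * gram)

K_diag = torch.sqrt(torch.diag(K))
K = K / torch.ger(K_diag, K_diag)  # unit diagonal, as in the normalisation step
print(torch.diag(K))               # all ones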
+ _xs = x.wl_graph_input(hps[0]) + elif isinstance(kernel, NumericKernel): + _xs = x.tensor(hps) + else: + raise ValueError(f"Unsupported kernel type {type(kernel)}") + + grid = next((g for t, g in _grids.items() if isinstance(kernel, t)), None) + if grid is None: + optimized_kernel = kernel.clone() + _ = optimized_kernel.fit_transform(_xs) # type: ignore + optimized_kernels.append((kernel, hps)) + continue + + optimized_kernel, _ = kernel.grid_search( + x=_xs, # type: ignore + grid=grid, + to_minimize=_eval_kernel, + ) + optimized_kernels.append((optimized_kernel, hps)) - theta_vectors = { - "categorical": theta_categorical, - "continuous": theta_numerical, # NOTE: This actually includes integers too -_- - } + # Optimization weights likelihood = torch.tensor( - self.initial_likelihood, requires_grad=self.optimize_likelihood + self.initial_likelihood, + requires_grad=self.optimize_likelihood, ) - layer_weights = None - if self.optimize_wl_layer_weights: - for kernel in self.domain_kernels: - if isinstance(kernel, WeisfilerLehman) and kernel.h != 0: - layer_weights = torch.ones(kernel.h + 1, requires_grad=True) - break + kernel_weights = torch.ones( + len(optimized_kernels), + requires_grad=len(optimized_kernels) > 1, + dtype=f64, + ) + should_optimize = lambda p: p.is_leaf and p.requires_grad # Linking the optimizer variables to the sum kernel - optim_vars = [ + optim_vars: list[torch.Tensor] = [ a - for a in ( - weights, - likelihood, - layer_weights, - theta_categorical, - theta_numerical, - ) - if a is not None and a.is_leaf and a.requires_grad + for a in (kernel_weights, likelihood) + if a is not None and should_optimize(a) + ] + layer_weights = [ + kernel.layer_weights_ + for kernel, _ in optimized_kernels + if isinstance(kernel, WeisfilerLehman) + and kernel.layer_weights_ is not None + and should_optimize(kernel.layer_weights_) + ] + lengthscales = [ + kernel.layer_weights_ + for kernel, _ in optimized_kernels + if isinstance(kernel, NumericKernel) and should_optimize(kernel.lengthscale) + ] + lengthscalebounds = [ + kernel.lengthscale_bounds + for kernel, _ in optimized_kernels + if isinstance(kernel, NumericKernel) and should_optimize(kernel.lengthscale) ] - nlml = None - if len(optim_vars) == 0: # Skip optimisation - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - feature_lengthscale=theta_vectors, - layer_weights=layer_weights, - rebuild_model=True, - ) - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + # Select the optimizer + if self.optimizer == "adam": + optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore + elif self.optimizer == "sgd": + optim = torch.optim.SGD(optim_vars, **self.optimizer_kwargs) # type: ignore else: - # Select the optimizer - if self.optimizer == "adam": - optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore - elif self.optimizer == "sgd": - optim = torch.optim.SGD(optim_vars, **self.optimizer_kwargs) # type: ignore - else: - raise ValueError(f"Invalid optimizer {self.optimizer}") - - K: torch.Tensor | None = None - for i in range(self.optimizer_iters): - optim.zero_grad() - K = self.combined_kernel.fit_transform( - weights=weights, - configs=train_x, # TODO - feature_lengthscale=theta_vectors, - layer_weights=layer_weights, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) - nlml = -compute_log_marginal_likelihood( - K_i, logDetK, y=self.y_normalized_ - ) - nlml.backward() - if i % 10 == 0: - logger.debug( - 
f"Iteration: {i}/{self.optimizer_iters} " - f"Negative log-marginal likelihood:" - f"{nlml.item()} {theta_vectors} {weights} {likelihood}" - ) - - optim.step() # TODO - - with torch.no_grad(): - if weights.is_leaf: - weights.clamp_(0.0, 1.0) - - theta_vectors = self.combined_kernel.clamp_theta_vector(theta_vectors) - - if likelihood.is_leaf: - likelihood.clamp_(1e-5, self.max_likelihood) - - if layer_weights is not None and layer_weights.is_leaf: - layer_weights.clamp_(0.0, 1.0) - - optim.zero_grad(set_to_none=True) - - assert K is not None + raise ValueError(f"Invalid optimizer {self.optimizer}") + + K: torch.Tensor | None = None + N = len(x) + for _ in range(self.optimizer_iters): + optim.zero_grad() + + # Now we iterate over kernels to build up K + _init = torch.zeros if self.combined_kernel == "sum" else torch.ones + K = _init(N, N, dtype=f64) + for (kernel, hps), weight in zip(self.kernels, kernel_weights): + if isinstance(kernel, WeisfilerLehman): + assert len(hps) == 1, "Only support single kernel per graph." + _xs = x.wl_graph_input(hps[0]) + gram = kernel.fit_transform(_xs) + elif isinstance(kernel, NumericKernel): + _xs = x.tensor(hps) + gram = kernel.fit_transform(_xs) + else: + raise ValueError(f"Unsupported kernel type {type(kernel)}") + + if self.combined_kernel == "sum": + K.add_(weight * gram) + elif self.combined_kernel == "product": + K.mul_(weight * gram) + else: + raise ValueError(f"Invalid combined_kernel {self.combined_kernel}") + + # Normalize + K_diag = torch.sqrt(torch.diag(K)) + K /= torch.ger(K_diag, K_diag) K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + # If there's nothing to optimize, break out early + if len(optim_vars) == 0: + break + + nlml = -compute_normalized_log_marginal_likelihood( + K_i, logDetK, y=self.y_normalized_ + ) + nlml.backward() + optim.step() + + with torch.no_grad(): + kernel_weights.clamp_(0.0, 1.0) + if likelihood.is_leaf: + likelihood.clamp_(1e-9, self.max_likelihood) + + for ls, ls_bounds in zip(lengthscales, lengthscalebounds): + ls.clamp_(*ls_bounds) + + for lw in layer_weights: + lw.clamp_(0.0, 1.0) + + optim.zero_grad() + + assert K is not None + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + # Apply the optimal hyperparameters - self.weights_ = weights.clone() / torch.sum(weights) self.K_i_ = K_i.clone() - self.K_ = K.clone() - self.logDetK_ = logDetK.clone() self.likelihood_ = likelihood.item() - self.theta_vector_ = theta_vectors - self.layer_weights_ = layer_weights - self.nlml_ = nlml.detach().cpu() if nlml is not None else None - - for kernel in self.combined_kernel.kernels: - if isinstance(kernel, Stationary): - kernel.update_hyperparameters(lengthscale=self.theta_vector_) - - logger.debug("Optimisation summary: ") - logger.debug(f"Optimal NLML: {nlml}") - logger.debug(f"Lengthscales: {theta_vectors}") - with contextlib.suppress(AttributeError): - logger.debug(f"Optimal h: {self.domain_kernels[0]._h}") - logger.debug(f"Weights: {self.weights_}") - logger.debug(f"Lik: {self.likelihood_}") - logger.debug(f"Optimal layer weights {layer_weights}") - - def predict(self, x_configs: list[SearchSpace]) -> tuple[torch.Tensor, torch.Tensor]: - """Kriging predictions""" - if not isinstance(x_configs, list): - x_configs = [x_configs] + self.optimized_kernels_ = optimized_kernels + self.kernel_weights_ = kernel_weights + self.n_train_ = N - if self.K_i_ is None or self.logDetK_ is None or self.weights_ is None: + def predict(self, x: TensorEncodedConfigs) -> tuple[torch.Tensor, torch.Tensor]: + """Kriging 
predictions""" + if self.K_i_ is None or self.n_train_ is None or self.kernel_weights_ is None: raise ValueError( "Inverse of Gram matrix is not instantiated. Please call the optimize " "function to fit on the training data first!" ) + _init = torch.zeros if self.combined_kernel == "sum" else torch.ones + N = self.n_train_ + len(x) + K = _init(N, N, dtype=f64) + for (kernel, hps), weight in zip(self.kernels, self.kernel_weights_): + if isinstance(kernel, WeisfilerLehman): + assert len(hps) == 1, "Only support single kernel per graph." + _x_test = x.wl_graph_input(hps[0]) + gram = kernel.transform(_x_test) + elif isinstance(kernel, NumericKernel): + _x_test = x.tensor(hps) + gram = kernel.fit_transform(_x_test) + else: + raise ValueError(f"Unsupported kernel type {type(kernel)}") + + if self.combined_kernel == "sum": + K.add_(weight * gram) + elif self.combined_kernel == "product": + K.mul_(weight * gram) + else: + raise ValueError(f"Invalid combined_kernel {self.combined_kernel}") + # Concatenate the full list - X_configs_all = self.x_configs + x_configs - n_train = len(self.x_configs) - n_test = len(x_configs) - - K_full = self.combined_kernel.fit_transform( - weights=self.weights_, - configs=X_configs_all, - layer_weights=self.layer_weights_, - feature_lengthscale=self.theta_vector_, - rebuild_model=True, - save_gram_matrix=False, - gp_fit=False, - ) + n_test = len(x) - K_s = K_full[:n_train:, n_train:] - K_ss = K_full[n_train:, n_train:] + self.likelihood_ * torch.eye(n_test) + K_s = K[: self.n_train_ :, self.n_train_ :] + K_ss = K[self.n_train_ :, self.n_train_ :] + self.likelihood_ * torch.eye(n_test) - mu_s = K_s.t() @ self.K_i_ @ self.y_ + mu_s = K_s.t() @ self.K_i_ @ self.y_normalized_ mu_s = mu_s * self.y_std_ + self.y_mean_ cov_s = K_ss - K_s.t() @ self.K_i_ @ K_s @@ -305,97 +269,3 @@ def predict(self, x_configs: list[SearchSpace]) -> tuple[torch.Tensor, torch.Ten cov_s = (torch.sqrt(cov_s) * self.y_std_) ** 2 return mu_s, cov_s - - -def _grid_search_wl_kernel( - kernel: WeisfilerLehman, - subtree_candidates, - train_x: list, - train_y: torch.Tensor, - likelihood: float, - lengthscales=None, -): - """Optimize the *discrete hyperparameters* of Weisfeiler Lehman kernel. 
- k: a Weisfeiler-Lehman kernel instance - hyperparameter_candidate: list of candidate hyperparameter to try - train_x: the train data - train_y: the train label - lik: likelihood - lengthscale: if using RBF kernel for successive embedding, the list of lengthscale to be grid searched over - """ - # lik = 1e-6 - assert len(train_x) == len(train_y) - best_nlml = torch.tensor(np.inf) - best_subtree_depth = None - best_lengthscale = None - best_K = None - if lengthscales is not None and kernel.se is not None: - candidates = [(h_, l_) for h_ in subtree_candidates for l_ in lengthscales] - else: - candidates = [(h_, None) for h_ in subtree_candidates] - - for i in candidates: - if kernel.se is not None: - kernel.change_se_params({"lengthscale": i[1]}) - - kernel.change_kernel_params({"h": i[0]}) - K = kernel.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - if nlml < best_nlml: - best_nlml = nlml - best_subtree_depth, best_lengthscale = i - best_K = torch.clone(K) - - kernel.change_kernel_params({"h": best_subtree_depth}) - if kernel.se is not None: - kernel.change_se_params({"lengthscale": best_lengthscale}) - kernel._gram = best_K - - -def compute_log_marginal_likelihood( - K_i: torch.Tensor, - logDetK: torch.Tensor, - y: torch.Tensor, - *, - normalize: bool = True, -) -> torch.Tensor: - """Compute the zero mean Gaussian process log marginal likelihood given the inverse of Gram matrix K(x2,x2), its - log determinant, and the training label vector y. - Option: - - normalize: normalize the log marginal likelihood by the length of the label vector, as per the gpytorch - routine. - """ - lml = ( - -0.5 * (y.t() @ K_i @ y) - + 0.5 * logDetK - - y.shape[0] / 2.0 * torch.log(2 * torch.tensor(np.pi)) - ) - return lml / y.shape[0] if normalize else lml - - -def compute_pd_inverse( - K: torch.Tensor, - *, - jitter: float | torch.Tensor = 1e-6, - attempts: int = 3, -) -> tuple[torch.Tensor, torch.Tensor]: - """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" - n = K.shape[0] - assert ( - isinstance(jitter, float) or jitter.ndim == 0 - ), "only homoscedastic noise variance is allowed here!" 
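# Editor's sketch, not part of this patch: the zero-mean GP log marginal
# likelihood used above,
#   lml = -1/2 y^T K^{-1} y + 1/2 log|K^{-1}| - n/2 log(2*pi),
# cross-checked against torch.distributions for a small random PD matrix.
import math

import torch

f64 = torch.float64
torch.manual_seed(0)
n = 4
A = torch.randn(n, n, dtype=f64)
K = A @ A.T + 1e-3 * torch.eye(n, dtype=f64)  # a positive definite Gram matrix
y = torch.randn(n, dtype=f64)

K_i = torch.linalg.inv(K)
logDetK = -torch.logdet(K)  # log|K^{-1}| = -log|K|
lml = -0.5 * (y @ K_i @ y) + 0.5 * logDetK - n / 2.0 * math.log(2 * math.pi)

reference = torch.distributions.MultivariateNormal(
    torch.zeros(n, dtype=f64), covariance_matrix=K
).log_prob(y)
print(lml.item(), reference.item())  # the two values agree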
- for i in range(attempts): - try: - jitter_diag = jitter * torch.eye(n, device=K.device) * 10**i - Kc = torch.linalg.cholesky(K + jitter_diag) - break - except RuntimeError: - pass - else: - raise RuntimeError(f"Gram matrix not positive definite despite of jitter:\n{K}") - - logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) - K_i = torch.cholesky_inverse(Kc) - return K_i.to(dtype=torch.float64), logDetK.to(dtype=torch.float64) diff --git a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py b/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py index a359b937..2c9993be 100644 --- a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py +++ b/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py @@ -637,7 +637,7 @@ def _fit( for k in self.combined_kernel.kernels: if isinstance(k, Stationary): - k.update_hyperparameters(lengthscale=torch.exp(theta_vector)) + k.update_lengthscales(lengthscale=torch.exp(theta_vector)) self.combined_kernel.weights = weights.clone() @@ -693,55 +693,6 @@ def predict(self, x_configs, preserve_comp_graph: bool = False): del combined_kernel_copy return mu_s, cov_s - def predict_single_hierarchy( - self, x_configs, hierarchy_id=0, preserve_comp_graph: bool = False - ): - """Kriging predictions""" - - if not isinstance(x_configs, list): - # Convert a single input X_s to a singleton list - x_configs = [x_configs] - - if self.K_i is None or self.logDetK is None: - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize function to " - "fit on the training data first!" - ) - - # Concatenate the full list - X_configs_all = self.x_configs + x_configs - - # Make a copy of the sum_kernels for this step, to avoid breaking the autodiff if grad guided mutation is used - if preserve_comp_graph: - combined_kernel_copy = deepcopy(self.combined_kernel) - else: - combined_kernel_copy = self.combined_kernel - - K_sub_full = combined_kernel_copy.fit_transform_single_hierarchy( - self.weights, - X_configs_all, - normalize=self.normalize_combined_kernel, - hierarchy_id=hierarchy_id, - feature_lengthscale=torch.exp(self.theta_vector), - layer_weights=self.layer_weights, - rebuild_model=True, - save_gram_matrix=False, - gp_fit=False, - ) - - K_s = K_sub_full[: self.n :, self.n :] - K_ss = K_sub_full[self.n :, self.n :] - mu_s = K_s.t() @ self.K_i @ self.y - cov_s_full = K_ss - K_s.t() @ self.K_i @ K_s - cov_s = torch.clamp(cov_s_full, self.likelihood, np.inf) - mu_s = unnormalize_y(mu_s, self.y_mean, self.y_std) - std_s = torch.sqrt(cov_s) - std_s = unnormalize_y(std_s, None, self.y_std, True) - cov_s = std_s**2 - if preserve_comp_graph: - del combined_kernel_copy - return mu_s, cov_s - @property def x(self): return self.x_configs @@ -759,115 +710,6 @@ def _reset_XY(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): # The Gram matrix of the training data self.K_i, self.logDetK = None, None - def dmu_dphi( - self, - X_s=None, - # compute_grad_var=False, - average_across_features=True, - average_across_occurrences=False, - ): - r""" - Compute the derivative of the GP posterior mean at the specified input location with respect to the - *vector embedding* of the graph (e.g., if using WL-subtree, this function computes the gradient wrt - each subtree pattern) - - The derivative is given by - $ - \frac{\partial \mu^*}{\partial \phi ^*} = \frac{\partial K(\phi, \phi^*)}{\partial \phi ^ *}K(\phi, \phi)^{-1} - \mathbf{y} - $ - - which derives directly from the GP posterior mean formula, and since the term $K(\phi, 
\phi)^{-1} and \mathbf{y} - are both independent of the testing points (X_s, or \phi^*}, the posterior gradient is simply the matrix - produce of the kernel gradient with the inverse Gram and the training label vector. - - Parameters - ---------- - X_s: The locations on which the GP posterior mean derivatives should be evaluated. If left blank, the - derivatives will be evaluated at the training points. - - compute_grad_var: bool. If true, also compute the gradient variance. - - The derivative of GP is also a GP, and thus the predictive distribution of the posterior gradient is Gaussian. - The posterior mean is given above, and the posterior variance is: - $ - \mathbb{V}[\frac{\partial f^*}{\partial \phi^*}]= \frac{\partial^2k(\phi^*, \phi^*)}{\partial \phi^*^2} - - \frac{\partial k(\phi^*, \Phi)}{\partial \phi^*}K(X, X)^{-1}\frac{\partial k{(\Phi, \phi^*)}}{\partial \phi^*} - $ - - Returns - ------- - list of K torch.Tensor of the shape N x2 D, where N is the length of the X_s list (each element of which is a - networkx graph), K is the number of kernel_operators in the combined kernel and D is the dimensionality of the - feature vector (this is determined by the specific graph kernel. - - OR - - list of K torch.Tensor of shape D, if averaged_over_samples flag is enabled. - """ - if self.K_i is None or self.logDetK is None: - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize " - "function to fit on the training data first!" - ) - if self.n_vector_kernels: - if X_s is not None: - V_s = self._get_vectorial_features(X_s, self.vectorial_feactures) - V_s, _, _ = standardize_x(V_s, self.x_features_min, self.x_features_max) - else: - V_s = self.x_features - X_s = self.x[:] - else: - V_s = None - X_s = X_s if X_s is not None else self.x[:] - - alpha = (self.K_i @ self.y).double().reshape(1, -1) - dmu_dphi = [] - # dmu_dphi_var = [] if compute_grad_var else None - - Ks_handles = [] - feature_matrix = [] - for j, x_s in enumerate(X_s): - jacob_vecs = [] - if V_s is None: - handles = self.combined_kernel.forward_t( - self.weights, - [x_s], - ) - else: - handles = self.combined_kernel.forward_t(self.weights, [x_s], V_s[j]) - Ks_handles.append(handles) - # Each handle is a 2-tuple. first element is the Gram matrix, second element is the leaf variable - feature_vectors = [] - for handle in handles: - k_s, y, _ = handle - # k_s is output, leaf is input, alpha is the K_i @ y term which is constant. - # When compute_grad_var is not required, computational graphs do not need to be saved. - jacob_vecs.append( - torch.autograd.grad( - outputs=k_s, inputs=y, grad_outputs=alpha, retain_graph=False - )[0] - ) - feature_vectors.append(y) - feature_matrix.append(feature_vectors) - jacob_vecs = torch.cat(jacob_vecs) - dmu_dphi.append(jacob_vecs) - - feature_matrix = torch.cat([f[0] for f in feature_matrix]) - if average_across_features: - dmu_dphi = torch.cat(dmu_dphi) - # compute the weighted average of the gradient across N_t. 
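# Editor's sketch, not part of this patch: the autograd pattern behind the
# removed dmu_dphi above -- since mu* = k(phi*, Phi) @ K^{-1} y, the gradient of
# the posterior mean w.r.t. the test embedding phi* is obtained by
# back-propagating through the cross-kernel with grad_outputs = K^{-1} y (alpha).
# A linear cross-kernel is assumed here so the result can be verified in closed form.
import torch

f64 = torch.float64
torch.manual_seed(0)
Phi = torch.randn(6, 3, dtype=f64)  # training feature embeddings
y = torch.randn(6, dtype=f64)
K = Phi @ Phi.T + 1e-2 * torch.eye(6, dtype=f64)
alpha = torch.linalg.solve(K, y)    # K^{-1} y

phi_star = torch.randn(1, 3, dtype=f64, requires_grad=True)
k_s = phi_star @ Phi.T              # linear cross-kernel, shape (1, 6)

(dmu_dphi,) = torch.autograd.grad(
    outputs=k_s, inputs=phi_star, grad_outputs=alpha.unsqueeze(0)
)
# For this linear kernel the gradient is alpha @ Phi, which we can verify:
print(torch.allclose(dmu_dphi, (alpha @ Phi).unsqueeze(0)))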
- # feature matrix is of shape N_t x K x D - avg_mu, avg_var, incidences = get_grad( - dmu_dphi, feature_matrix, average_across_occurrences - ) - return avg_mu, avg_var, incidences - return ( - dmu_dphi, - None, - feature_matrix.sum(dim=0) if average_across_occurrences else feature_matrix, - ) - def get_grad(grad_matrix, feature_matrix, average_occurrences=False): r""" @@ -982,7 +824,7 @@ def _grid_search_wl_kernel( k.change_kernel_params({"h": best_subtree_depth}) if k.se is not None: k.change_se_params({"lengthscale": best_lengthscale}) - k._gram = best_K + k.gram_ = best_K def get_theta_vector(vectorial_features): diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 9fc3aeae..2002aeab 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -25,7 +25,7 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping if TYPE_CHECKING: @@ -142,7 +142,7 @@ def __init__( self.sample_default_first = sample_default_first surrogate_model_args = surrogate_model_args or {} - graph_kernels, hp_kernels = get_kernels( + graph_kernels, hp_kernels = get_default_kernels( self.pipeline_space, domain_se_kernel, graph_kernels, diff --git a/neps/optimizers/multi_fidelity/dyhpo.py b/neps/optimizers/multi_fidelity/dyhpo.py index 59804637..bb4879b9 100755 --- a/neps/optimizers/multi_fidelity/dyhpo.py +++ b/neps/optimizers/multi_fidelity/dyhpo.py @@ -20,7 +20,7 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.multi_fidelity.mf_bo import FreezeThawModel, PFNSurrogate from neps.optimizers.multi_fidelity.utils import MFObservedData @@ -116,7 +116,7 @@ def __init__( ) # Preparing model - self.graph_kernels, self.hp_kernels = get_kernels( + self.graph_kernels, self.hp_kernels = get_default_kernels( pipeline_space=pipeline_space, domain_se_kernel=domain_se_kernel, graph_kernels=graph_kernels, diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index 9321633c..fceb44e9 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -22,7 +22,7 @@ from ..bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from ..bayesian_optimization.kernels.get_kernels import get_kernels +from ..bayesian_optimization.kernels.get_kernels import get_default_kernels from ..bayesian_optimization.models import SurrogateModelMapping from ..multi_fidelity_prior.utils import ( compute_config_dist, @@ -170,16 +170,13 @@ def sample( policy_idx = np.random.choice(range(len(prob_weights)), p=prob_weights) policy = sorted(self.policy_map.keys())[policy_idx] - self.logger.info( - f"Sampling from {policy} with weights (i, p, r)={prob_weights}" - ) + self.logger.info(f"Sampling from {policy} with weights (i, p, r)={prob_weights}") if policy == "prior": config = self.pipeline_space.sample( 
patience=self.patience, user_priors=True, ignore_fidelity=True ) elif policy == "inc": - if ( hasattr(self.pipeline_space, "has_prior") and self.pipeline_space.has_prior @@ -213,9 +210,7 @@ def sample( # the weight distributed across prior adnd inc _w_priors = 1 - self.policy_map["random"] # re-calculate normalized score ratio for prior-inc - w_prior = np.clip( - self.policy_map["prior"] / _w_priors, a_min=0, a_max=1 - ) + w_prior = np.clip(self.policy_map["prior"] / _w_priors, a_min=0, a_max=1) w_inc = np.clip(self.policy_map["inc"] / _w_priors, a_min=0, a_max=1) # calculating difference of prior and inc score score_diff = np.abs(w_prior - w_inc) @@ -288,7 +283,7 @@ def __init__( surrogate_model_args = surrogate_model_args or {} - graph_kernels, hp_kernels = get_kernels( + graph_kernels, hp_kernels = get_default_kernels( pipeline_space=pipeline_space, domain_se_kernel=domain_se_kernel, graph_kernels=graph_kernels, @@ -302,9 +297,9 @@ def __init__( if not surrogate_model_args["hp_kernels"]: raise ValueError("No kernels are provided!") if "vectorial_features" not in surrogate_model_args: - surrogate_model_args[ - "vectorial_features" - ] = pipeline_space.get_vectorial_dim() + surrogate_model_args["vectorial_features"] = ( + pipeline_space.get_vectorial_dim() + ) self.surrogate_model = instance_from_map( SurrogateModelMapping, @@ -439,7 +434,7 @@ def __init__( surrogate_model_args = surrogate_model_args or {} - graph_kernels, hp_kernels = get_kernels( + graph_kernels, hp_kernels = get_default_kernels( pipeline_space=pipeline_space, domain_se_kernel=domain_se_kernel, graph_kernels=graph_kernels, @@ -453,9 +448,9 @@ def __init__( if not surrogate_model_args["hp_kernels"]: raise ValueError("No kernels are provided!") if "vectorial_features" not in surrogate_model_args: - surrogate_model_args[ - "vectorial_features" - ] = pipeline_space.get_vectorial_dim() + surrogate_model_args["vectorial_features"] = ( + pipeline_space.get_vectorial_dim() + ) self.surrogate_model = instance_from_map( SurrogateModelMapping, diff --git a/neps/search_spaces/__init__.py b/neps/search_spaces/__init__.py index 7eb4332a..8a289100 100644 --- a/neps/search_spaces/__init__.py +++ b/neps/search_spaces/__init__.py @@ -1,5 +1,6 @@ from neps.search_spaces.architecture.api import ArchitectureParameter, FunctionParameter from neps.search_spaces.architecture.graph_grammar import ( + CoreGraphGrammar, GraphGrammar, GraphGrammarCell, GraphGrammarRepetitive, @@ -23,6 +24,7 @@ "ArchitectureParameter", "CategoricalParameter", "ConstantParameter", + "CoreGraphGrammar", "FloatParameter", "FunctionParameter", "GraphGrammar", diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py new file mode 100644 index 00000000..1ab1f92a --- /dev/null +++ b/neps/search_spaces/encoding.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +from collections.abc import Sized + +from dataclasses import dataclass +from grakel.utils import graph_from_networkx + +from typing import Sequence, Iterable, TypeAlias +from typing_extensions import Self +from more_itertools import split_when +from itertools import chain +import torch + +from neps.search_spaces.search_space import SearchSpace + +WLInput: TypeAlias = tuple[dict, dict | None, dict | None] + + +@dataclass +class TensorEncodedConfigs(Sized): + _tensor_pack: torch.Tensor | None + """Layout such that _tensor_pack[0] is the first config. + + In the case that there are no numeric/categorical hyperparameters, + this is None. + + index config_row_id | fidelities... 
| numericals... | one_hot_categoricals... + 0 + 1 + 2 + ... + + NOTE: A slight memory innefficiency here is that we store the one-hot encoded + as a float tensor, rather than a byte tensor. This makes joint numerical/categorical + kernels more efficient, as well as entire config row access at the cost of memory. + This should not be a problem if we do not have a large number of categorical + hyperparameters with a high number of choices. + """ + _graphs: dict[str, Sequence[WLInput]] + _col_lookup: dict[str, tuple[int, int]] # range(inclusive, exclusive) + + def __len__(self) -> int: + return self._tensor_pack.shape[0] if self._tensor_pack is not None else 0 + + def wl_graph_input(self, hp: str) -> Sequence[WLInput]: + return self._graphs[hp] + + def tensor(self, hps: Iterable[str]) -> torch.Tensor: + if self._tensor_pack is None: + raise ValueError("No numerical/categorical hyperparameters were encoded.") + + cols: list[tuple[int, int]] = [] + for hp in hps: + _cols = self._col_lookup.get(hp) + if _cols is None: + raise ValueError(f"Hyperparameter {hp} not found in the lookup table.") + cols.append(_cols) + + # OPTIM: This code with `split_when` and `chunks` makes sure to grab + # consecutive chunks of memory where possible. For example, + # if we want all categoricals, this will just return the entire + # categorical tensor, rather than subselecting each part and then concatenating. + # Also works for numericals. + sorted_indices = sorted(cols) + non_consecutive_tuple = lambda x, y: x[1] != y[0] + chunks = list(split_when(sorted_indices, non_consecutive_tuple)) + slices = [slice(chunk[0][0], chunk[-1][1]) for chunk in chunks] + tensors = [self._tensor_pack[:, s] for s in slices] + + if len(tensors) == 1: + return tensors[0].clone() + + return torch.cat(tensors, dim=1) + + @classmethod + def encode( + cls, + space: SearchSpace, + configs: list[SearchSpace], + *, + node_label: str = "op_name", + device: torch.device, + ) -> Self: + assert node_label == "op_name", "Only 'op_name' is supported for node_label" + + _graphs: dict[str, Sequence[WLInput]] = {} + for hp_name in space.graphs.keys(): + gs = [conf.graphs[hp_name].value for conf in configs] + if ( + len(gs) > 0 + and isinstance(gs[0], list) + and len(gs[0]) > 0 + and isinstance(gs[0][0], list) + ): + gs = [_list for list_of_list in gs for _list in list_of_list] + _graphs[hp_name] = graph_from_networkx(gs) # type: ignore + + _lookup: dict[str, tuple[int, int]] = {} + + n_fids = len(space.fidelities) + n_nums = len(space.numerical) + n_cats = sum(len(hp.choices) for hp in space.categoricals.values()) + + width = n_fids + n_nums + n_cats + if width == 0: + return cls(_graphs=_graphs, _tensor_pack=None, _col_lookup={}) + + _tensor_pack = torch.empty(size=(len(configs), width), dtype=torch.float64) + + offset = 0 + for hp_name in chain(space.fidelities, space.numerical): + _lookup[hp_name] = (offset, offset + 1) + _xs = [config.fidelities[hp_name].normalized_value for config in configs] + values = torch.tensor(_xs, torch.float64, device=device) + + _tensor_pack[:, offset] = values + + offset += 1 + + for hp_name, cat in space.categoricals.items(): + n_choices = len(cat.choices) + _lookup[hp_name] = (offset, offset + n_choices) + + # .. 
and insert one-hot encoding (ChatGPT solution, verified locally) + _xs = [config[hp_name].normalized_value for config in configs] + cat_tensor = torch.tensor(_xs, torch.float64, device=device).unsqueeze(1) + + _tensor_pack[:, offset : offset + n_choices].scatter_(1, cat_tensor, 1) + + offset += n_choices + + return cls(_graphs=_graphs, _tensor_pack=_tensor_pack, _col_lookup=_lookup) From 0f235ee39a47739dbed20e8761e8d4ff2de1020b Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 15 Aug 2024 18:57:23 +0200 Subject: [PATCH 07/63] fix: typo --- .../kernels/grakel_replace/vertex_histogram.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py index 103818ae..e59b5433 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py @@ -262,7 +262,7 @@ def _calculate_kernel_matrix(self, Y=None): K[j, i] = K[i, j] else: if self.se_kernel is not None: - K = self.se_kernel._forwardd(self.X, self.X) + K = self.se_kernel._forward(self.X, self.X) else: K = self.X @ self.X.T else: @@ -275,7 +275,7 @@ def _calculate_kernel_matrix(self, Y=None): ) else: if self.se_kernel is not None: - K = self.se_kernel._forwardd(self.X, Y) + K = self.se_kernel._forward(self.X, Y) else: K = Y[:, : self.X.shape[1]] @ self.X.T From 83331d9db60b313b0f9a5ef23adffea189d11085 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 15 Aug 2024 19:02:55 +0200 Subject: [PATCH 08/63] fix: unscaled distance --- .../bayesian_optimization/kernels/vectorial_kernels.py | 2 +- neps/optimizers/bayesian_optimization/models/gp.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index 8e7a1074..b25446d4 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -89,7 +89,7 @@ def _unscaled_square_distance( """The unscaled distance between X and X2.""" assert X.ndim == 2 X1sq = torch.sum(X**2, 1) - X2sq = X1sq if X is X2 else torch.sum(X**2, 1) + X2sq = X1sq if (X2 is None or X is X2) else torch.sum(X2**2, 1) X2 = X if X2 is None else X2 r2 = -2 * X @ X2.T + X1sq[:, None] + X2sq[None, :] diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index eef6b771..8173aaac 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -253,13 +253,9 @@ def predict(self, x: TensorEncodedConfigs) -> tuple[torch.Tensor, torch.Tensor]: elif self.combined_kernel == "product": K.mul_(weight * gram) else: - raise ValueError(f"Invalid combined_kernel {self.combined_kernel}") - - # Concatenate the full list - n_test = len(x) K_s = K[: self.n_train_ :, self.n_train_ :] - K_ss = K[self.n_train_ :, self.n_train_ :] + self.likelihood_ * torch.eye(n_test) + K_ss = K[self.n_train_ :, self.n_train_ :] + self.likelihood_ * torch.eye(len(x)) mu_s = K_s.t() @ self.K_i_ @ self.y_normalized_ mu_s = mu_s * self.y_std_ + self.y_mean_ From a3a230d6eaaca1b8324364015f2dfd043940b578 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Sun, 18 Aug 2024 17:07:43 +0200 Subject: [PATCH 09/63] fix: GP Fixed up --- 
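
As a standalone check of the `_unscaled_square_distance` fix in PATCH 08/63 above: the second squared-norm term has to come from X2, because the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x^T y only lets the two norm terms coincide when X2 is X itself (or omitted). The snippet below is illustrative only, uses random data, and is not part of the patch:

import torch

X = torch.randn(5, 3, dtype=torch.float64)
X2 = torch.randn(7, 3, dtype=torch.float64)

X1sq = torch.sum(X**2, dim=1)
X2sq = torch.sum(X2**2, dim=1)  # corrected term; torch.sum(X**2, 1) is only valid when X2 is X

# ||x - y||^2 expanded, clamped at zero to guard against negative round-off
r2 = (-2 * X @ X2.T + X1sq[:, None] + X2sq[None, :]).clamp_min(0.0)

assert torch.allclose(r2.sqrt(), torch.cdist(X, X2, p=2))
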
.../bayesian_optimization/cost_cooling.py | 1 - .../bayesian_optimization/kernels/__init__.py | 6 - .../kernels/combine_kernels_hierarchy.py | 201 ---- .../kernels/get_kernels.py | 58 -- .../kernels/grakel_replace/edge_histogram.py | 12 +- .../bayesian_optimization/kernels/kernel.py | 159 ++- .../kernels/vectorial_kernels.py | 187 ++-- .../kernels/weisfilerlehman.py | 29 +- .../bayesian_optimization/models/__init__.py | 4 +- .../bayesian_optimization/models/deepGP.py | 28 +- .../bayesian_optimization/models/gp.py | 379 +++---- .../models/gp_hierarchy.py | 957 ------------------ .../bayesian_optimization/optimizer.py | 60 +- neps/optimizers/multi_fidelity/dyhpo.py | 1 - .../multi_fidelity/sampling_policy.py | 23 - neps/search_spaces/encoding.py | 430 ++++++-- neps_examples/basic_usage/hyperparameters.py | 2 +- pyproject.toml | 1 - 18 files changed, 765 insertions(+), 1773 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/get_kernels.py delete mode 100644 neps/optimizers/bayesian_optimization/models/gp_hierarchy.py diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py index 0d77fbc6..5a8926c7 100644 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/cost_cooling.py @@ -23,7 +23,6 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization diff --git a/neps/optimizers/bayesian_optimization/kernels/__init__.py b/neps/optimizers/bayesian_optimization/kernels/__init__.py index 44c8e0ac..6ab80672 100644 --- a/neps/optimizers/bayesian_optimization/kernels/__init__.py +++ b/neps/optimizers/bayesian_optimization/kernels/__init__.py @@ -1,13 +1,8 @@ from __future__ import annotations -from dataclasses import dataclass from functools import partial from typing import Callable -from typing_extensions import TypeAlias -from neps.optimizers.bayesian_optimization.kernels.graph_kernel import GraphKernels -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary -from .encoding import NASBOTDistance from .vectorial_kernels import HammingKernel, Matern32Kernel, Matern52Kernel, RBFKernel from .weisfilerlehman import WeisfilerLehman @@ -29,5 +24,4 @@ h=0, oa=False, ), - "nasbot": NASBOTDistance, } diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py deleted file mode 100644 index 086cfc03..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py +++ /dev/null @@ -1,201 +0,0 @@ -import logging - -import numpy as np -import torch - -from .utils import extract_configs_hierarchy -from .vectorial_kernels import HammingKernel, Stationary -from .weisfilerlehman import GraphKernels - - -# normalise weights in front of additive kernels -def transform_weights(weights): - return torch.exp(weights) / torch.sum(torch.exp(weights)) - - -def _select_dimensions(k): - if isinstance(k, HammingKernel): - return "categorical" - return "continuous" - - -class CombineKernel: - def __init__( - self, - combined_by="sum", - 
*kernels: list, - **kwargs, - ): - if combined_by not in ["sum", "product"]: - raise ValueError(f"Invalid value for combined_by ({combined_by})") - - self.has_graph_kernels = False - self.has_vector_kernels = False - self.hierarchy_consider = kwargs["hierarchy_consider"] - self.d_graph_features = kwargs["d_graph_features"] - # if use global graph features of the final architecture graph, prepare for normalising - # them based on training data - if self.d_graph_features > 0: - self.train_graph_feature_mean = None - self.train_graph_feature_std = None - - self.lengthscale_bounds = (None, None) - for k in kernels: - if isinstance(k, GraphKernels): - self.has_graph_kernels = True - if not isinstance(k, GraphKernels): - self.has_vector_kernels = True - self.lengthscale_bounds = k.lengthscale_bounds - self.kernels = kernels - # Store the training graphs and vector features.. - self._gram = None - self.gr, self.x = None, None - self.combined_by = combined_by - - def fit_transform( - self, - weights: torch.Tensor, - configs: list, - normalize: bool = True, - rebuild_model: bool = True, - save_gram_matrix: bool = True, - gp_fit: bool = True, - feature_lengthscale: list = None, - **kwargs, - ): - weights = transform_weights(weights.clone()) - N = len(configs) - K = torch.zeros(N, N) if self.combined_by == "sum" else torch.ones(N, N) - - gr1, x1 = extract_configs_hierarchy( - configs, - d_graph_features=self.d_graph_features, - hierarchy_consider=self.hierarchy_consider, - ) - - # normalise the global graph features if we plan to use them - if self.d_graph_features > 0: - if gp_fit: - # compute the mean and std based on training data - self.train_graph_feature_mean = np.mean(x1, 0) - self.train_graph_feature_std = np.std(x1, 0) - x1 = (x1 - self.train_graph_feature_mean) / self.train_graph_feature_std - # k_values = [] # for debug - # k_features = [] # for debug - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels) and None not in gr1: - if len(gr1) == N and self.hierarchy_consider is None: - # only the final graph is used - k_i = k.fit_transform( - [g[i] for g in gr1] if isinstance(gr1[0], (list, tuple)) else gr1, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - gp_fit=gp_fit, - **kwargs, - ) - if normalize: - K_i_diag = torch.sqrt(torch.diag(k_i)) - k_i /= torch.ger(K_i_diag, K_i_diag) - update_val = weights[i] * k_i - - else: - # graphs in the early hierarchies are also used; - # assume the combined kernel list always start with graph kernels i.e. kernels=[graph kernels, hp kernels] - gr1_i = gr1[i] - k_i = k.fit_transform( - [g[i] for g in gr1_i] - if isinstance(gr1_i[0], (list, tuple)) - else gr1_i, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - gp_fit=gp_fit, - **kwargs, - ) - if normalize: - K_i_diag = torch.sqrt(torch.diag(k_i)) - k_i /= torch.ger(K_i_diag, K_i_diag) - - update_val = weights[i] * k_i - # k_features.append([value.X.shape[1] for key, value in k.kern.X.items()]) - - elif isinstance(k, Stationary) and None not in x1: - k_i = k.fit_transform( - x1, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - l=feature_lengthscale, - ) - update_val = (weights[i] * k_i).double() - else: - raise NotImplementedError( - " For now, only the Stationary custom built kernel_operators are supported!" 
- ) - - # k_values.append(k_i) # for debug - - if self.combined_by == "sum": - K += update_val - elif self.combined_by == "product": - K *= update_val - - # self.k_values = k_values # for debug - # self.k_features = k_features # for debug - # self.weights_trans = weights # for debug - # if not normalize: - # K_diag = torch.sqrt(torch.diag(K)) - # K /= torch.ger(K_diag, K_diag) - - if save_gram_matrix: - self._gram = K.clone() - - return K - - def fit_transform_single_hierarchy( - self, - weights: torch.Tensor, - configs: list, - hierarchy_id: int, - normalize: bool = True, - rebuild_model: bool = True, - gp_fit: bool = True, - **kwargs, - ): - weights = transform_weights(weights.clone()) - # N = len(configs) - # K = torch.zeros(N, N) if self.combined_by == "sum" else torch.ones(N, N) - - gr1, _ = extract_configs_hierarchy( - configs, - d_graph_features=self.d_graph_features, - hierarchy_consider=self.hierarchy_consider, - ) - # get the corresponding graph kernel and hierarchy graph data - graph_kernel_list = [k for k in self.kernels if isinstance(k, GraphKernels)] - # first graph kernel is on the final architecture graph - k_single_hierarchy = graph_kernel_list[int(hierarchy_id + 1)] - gr1_single_hierarchy = gr1[int(hierarchy_id + 1)] - weight_single_hierarchy = weights[int(hierarchy_id + 1)] - k_raw = k_single_hierarchy.fit_transform( - gr1_single_hierarchy, - rebuild_model=rebuild_model, - gp_fit=gp_fit, - **kwargs, - ) - k_raw = k_raw.to(torch.float32) - if normalize: - K_diag = torch.sqrt(torch.diag(k_raw)) - k_raw /= torch.ger(K_diag, K_diag) - - K = weight_single_hierarchy * k_raw - - return K - - -class SumKernel(CombineKernel): - def __init__(self, *kernels, **kwargs): - super().__init__("sum", *kernels, **kwargs) - - -class ProductKernel(CombineKernel): - def __init__(self, *kernels, **kwargs): - super().__init__("product", *kernels, **kwargs) diff --git a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py deleted file mode 100644 index 3ed9b5b9..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -from neps.optimizers.bayesian_optimization.kernels import Kernel -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( - HammingKernel, - Matern52Kernel, -) -import torch -from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import WeisfilerLehman - -from neps.search_spaces import SearchSpace - - -# TODO: Option to combine numerical and categorical into one. 
-def get_default_kernels( - *, - space: SearchSpace, - optimizable: bool = True, -) -> list[tuple[Kernel, list[str]],]: - kernels: list[tuple[Kernel, list[str]]] = [] - if any(space.graphs): - h = 2 - if optimizable: - layer_weights = torch.nn.Parameter(torch.ones(h + 1)) - else: - layer_weights = None - - kernels.append( - ( - WeisfilerLehman(h=2, layer_weights=layer_weights, oa=True), - list(space.graphs.keys()), - ) - ) - - if any(space.categoricals): - if optimizable: - lengthscales = torch.nn.Parameter(torch.ones(len(space.categoricals))) - else: - lengthscales = torch.ones(len(space.categoricals)) - - kernels.append( - ( - HammingKernel(lengthscale=lengthscales), - list(space.categoricals.keys()), - ) - ) - - if any(space.numerical): - if optimizable: - lengthscales = torch.nn.Parameter(torch.ones(len(space.numerical))) - else: - lengthscales = torch.ones(len(space.numerical)) - - kernels.append( - (Matern52Kernel(lengthscale=lengthscales), list(space.numerical.keys())) - ) - - return kernels diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py index f643dcc8..1b0b37d6 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py @@ -1,4 +1,5 @@ """The Edge Histogram kernel as defined in :cite:`sugiyama2015halting`.""" + from collections import Counter from collections.abc import Iterable from warnings import warn @@ -102,9 +103,7 @@ def parse_input(self, X: Iterable, **kwargs): # Initialise the feature matrix if self._method_calling in [1, 2]: if self.sparse == "auto": - self.sparse_ = ( - len(cols) / float(ni * len(labels)) <= 0.5 - ) + self.sparse_ = len(cols) / float(ni * len(labels)) <= 0.5 else: self.sparse_ = bool(self.sparse) @@ -119,8 +118,11 @@ def parse_input(self, X: Iterable, **kwargs): features[rows, cols] = data except MemoryError: warn("memory-error: switching to sparse") - self.sparse_, features = True, csr_matrix( - (data, (rows, cols)), shape=(ni, len(labels)), copy=False + self.sparse_, features = ( + True, + csr_matrix( + (data, (rows, cols)), shape=(ni, len(labels)), copy=False + ), ) if ni == 0: diff --git a/neps/optimizers/bayesian_optimization/kernels/kernel.py b/neps/optimizers/bayesian_optimization/kernels/kernel.py index 52db2751..57cd4895 100644 --- a/neps/optimizers/bayesian_optimization/kernels/kernel.py +++ b/neps/optimizers/bayesian_optimization/kernels/kernel.py @@ -1,29 +1,37 @@ from __future__ import annotations -import math -import inspect import copy -from typing import TypeVar, Generic, Any, Sequence, Mapping, Callable +import inspect +from abc import ABC, abstractmethod +import math +from typing import Any, ClassVar, Generic, Mapping, Sequence, TypeVar from typing_extensions import Self + import torch -import torch.nn as nn +from torch import nn from neps.utils.types import NotSet T = TypeVar("T") -class Kernel(nn.Module, Generic[T]): - def fit_transform(self, x: T) -> torch.Tensor: - raise NotImplementedError +class Kernel(ABC, nn.Module, Generic[T]): + suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] + + def __init__(self) -> None: + super().__init__() - def transform(self, x: T) -> torch.Tensor: + @abstractmethod + def as_optimizable(self) -> Self: ... 
+ + @abstractmethod + def forward(self, x: T, x2: T | None = None) -> torch.Tensor: raise NotImplementedError def clone(self) -> Self: return self.clone_with() - def clone_with(self, **params: dict[str, Any]) -> Self: + def clone_with(self, **params: Any) -> Self: # h ttps://github.com/scikit-learn/scikit-learn/blob/70fdc843a4b8182d97a3508c1a426acc5e87e980/sklearn/base.py#L197 sig = inspect.signature(self.__init__) @@ -46,65 +54,114 @@ def clone_with(self, **params: dict[str, Any]) -> Self: def grid_search( self, x: T, + y: torch.Tensor, *, grid: Sequence[Mapping[str, Any]], - to_minimize: Callable[[torch.Tensor], float], - ) -> tuple[Self, float]: + noise_variances: Sequence[float] = (1e-6,), + ) -> tuple[Self, float] | Exception: + # Returns: (Kernel[T], float) | None if failed if len(grid) == 0: raise ValueError("Grid must have at least one element.") - def _fit_and_eval(_params: Mapping[str, Any]) -> tuple[Kernel[T], float]: + def _fit_and_eval( + _params: Mapping[str, Any], + ) -> tuple[Kernel[T], float] | Exception: cloned_kernel = self.clone_with(**_params) - K = cloned_kernel.fit_transform(x) - metric = to_minimize(K) - return cloned_kernel, metric + K = cloned_kernel.forward(x) + + best_lml = -float("inf") + exception: Exception | None = None + for noise_variance in noise_variances: + K.diag().add_(noise_variance) - return min( - (_fit_and_eval(params) for params in grid), - key=lambda x: x[1], - ) + K_inv, logDetK = compute_pd_inverse(K) + lml = log_marginal_likelihood(K_inv, logDetK, y).item() + if lml > best_lml: + best_lml = lml + + K.diag().sub_(noise_variance) + + if exception is None: + return cloned_kernel, best_lml + + return exception + + evals = [_fit_and_eval(params) for params in grid] + evals_with_score = [e for e in evals if not isinstance(e, Exception)] + if not any(evals_with_score): + raise evals[-1] # type: ignore + + best_eval = max(evals_with_score, key=lambda e: e[1]) # type: ignore + return best_eval class NumericKernel(Kernel[torch.Tensor]): ... -PI = torch.tensor(math.pi) +TWO_LOG_2_PI = 2 * torch.log(torch.tensor(2 * math.pi)) -def compute_normalized_log_marginal_likelihood( - K_i: torch.Tensor, +def log_marginal_likelihood( + K_inv: torch.Tensor, logDetK: torch.Tensor, y: torch.Tensor, ) -> torch.Tensor: - """Compute the zero mean Gaussian process log marginal likelihood - given the inverse of Gram matrix K(x2,x2), its log determinant, - and the training label vector y. 
- """ - lml = -0.5 * (y.t() @ K_i @ y) + 0.5 * logDetK - y.shape[0] / 2.0 * torch.log(2 * PI) - return lml / y.shape[0] + # y.T @ K_inv @ y --- Benchmarked to be twice as fast + quad_form = torch.matmul(y, torch.matmul(K_inv, y)) + n = y.shape[0] + # TODO: We can drop the `n / 2 * TWO_LOG_2_PI` term for the grid + # search above as it's constant between the different kernel grids + # as it's purely data dependant with the `n` + return -0.5 * quad_form + 0.5 * logDetK - n / TWO_LOG_2_PI -def compute_pd_inverse( + +class _CholeskyError(RuntimeError): + """Raised when the Cholesky decomposition fails.""" + + +# https://github.com/cornellius-gp/linear_operator/blob/eec70f9e1cd9106c32b05a3e774ea29d00d71cea/linear_operator/utils/cholesky.py#L12 +def _cholesky_routine( K: torch.Tensor, - *, - jitter: float | torch.Tensor = 1e-9, - attempts: int = 3, -) -> tuple[torch.Tensor, torch.Tensor]: - """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" - n = K.shape[0] - assert ( - isinstance(jitter, float) or jitter.ndim == 0 - ), "only homoscedastic noise variance is allowed here!" - for i in range(attempts): - try: - jitter_diag = jitter * torch.eye(n, device=K.device) * 10**i - Kc = torch.linalg.cholesky(K + jitter_diag) - break - except RuntimeError: - pass - else: - raise RuntimeError(f"Gram matrix not positive definite despite of jitter:\n{K}") - - logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) - K_i = torch.cholesky_inverse(Kc) - return K_i.to(dtype=torch.float64), logDetK.to(dtype=torch.float64) + jitter: float | torch.Tensor = 1e-6, + max_tries: int = 4, +) -> torch.Tensor: + L, info = torch.linalg.cholesky_ex(K) + if not torch.any(info): + return L + + # Clone as we will modify in place, still cheaper + # than creating a new full tensor for identity. + K_prime = K.clone() + jitter_prev = 0 + for i in range(max_tries): + jitter_new = jitter * (10**i) + K_prime.diagonal().add_(jitter_new - jitter_prev) + L, info = torch.linalg.cholesky_ex(K_prime) + if not torch.any(info): + return L + + jitter_prev = jitter_new + + raise _CholeskyError("Failed to compute Cholesky decomposition.") + + +def compute_pd_inverse(K: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + # Adding noise to the diagonal of K helps with numerical stability + # when K is singular or near-singular, (i.e. it helps K be more "positive") which + # is required for the decomposition. + + try: + # L @ L.T = K_inv --- solves for L + L = _cholesky_routine(K) + logDetK = 2 * torch.sum(torch.log(torch.diag(L))) + + # K_inv = L_inv @ L_inv.T --- Efficiently solve for K_inv using just L + K_inv = torch.cholesky_inverse(L) + except _CholeskyError: + # If we fail to compute the Cholesky decomposition, + # then just compute the inverse directly. 
+ K_inv = torch.linalg.inv(K) + logDetK = torch.linalg.slogdet(K)[1] + + return K_inv, logDetK diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index b25446d4..8bcbd45b 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,150 +1,103 @@ from __future__ import annotations from math import sqrt -from typing_extensions import override -from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel +from typing import Any, Mapping, Sequence, ClassVar +from typing_extensions import override, Self -import numpy as np +from itertools import product import torch +import torch.nn as nn + +from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel -DEFAULT_LENGTHSCALE_BOUNDS = np.exp(-6.754111155189306), np.exp(0.0858637988771976) +# TODO: +# We should try some variations of singular length scales +# (1 scale shared across all dimensions) +# and individual ARD lengthscales (1 for each dimension) +# ARD can overfit if not properly tuned... +LENGTHSCALE_GRID = (1e-2, 1e-1, 1, 1e1, 1e2) +STD_ENCODED_OUTPUT_SCALE = (1e-2, 1e-1, 1, 1e1, 1e2) class Stationary(Kernel[torch.Tensor]): - """Here we follow the structure of GPy to build a sub class of stationary kernel. - - All the classes (i.e. the class of stationary kernel_operators) derived from this - class use the scaled distance to compute the Gram matrix. - """ + suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ + {"lengthscale": l, "output_scale": o} + for l, o in product(LENGTHSCALE_GRID, STD_ENCODED_OUTPUT_SCALE) + ] def __init__( self, *, - lengthscale: torch.Tensor, - outputscale: float | torch.Tensor = 1.0, - lengthscale_bounds: tuple[float, float] = DEFAULT_LENGTHSCALE_BOUNDS, + lengthscale: torch.Tensor | None = None, + outputscale: torch.Tensor | None = None, + lengthscale_bounds: tuple[float, float] | None = (1e-2, 1e2), + outputscale_bounds: tuple[float, float] | None = (1e-2, 1e2), + device: torch.device | None = None, ): - self.lengthscale = lengthscale - self.outputscale = outputscale + super().__init__() + self.lengthscale = ( + torch.as_tensor(lengthscale, dtype=torch.float64, device=device) + if lengthscale is not None + else torch.tensor(1, dtype=torch.float64, device=device) + ) + self.outputscale = ( + torch.as_tensor(outputscale, dtype=torch.float64, device=device) + if outputscale is not None + else torch.tensor(1, dtype=torch.float64, device=device) + ) self.lengthscale_bounds = lengthscale_bounds + self.outputscale_bounds = outputscale_bounds + self.device = device - self.gram_: torch.Tensor | None = None self.train_: torch.Tensor | None = None - def fit_transform(self, x: torch.Tensor) -> torch.Tensor: - K = self._forward(x) - self.train_ = x.clone().detach() - return K + def as_optimizable(self) -> Self: + return self.clone_with( + lengthscale=nn.Parameter(self.lengthscale), + outputscale=nn.Parameter(self.outputscale), + ) + + def forward(self, x: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: + # NOTE: I don't think this is the right way to do this... + with torch.no_grad(): + self.lengthscale.data.clamp_(*self.lengthscale_bounds) + self.outputscale.data.clamp_(*self.outputscale_bounds) - def transform(self, x: torch.Tensor) -> torch.Tensor: - if self.train_ is None: - raise ValueError("The kernel has not been fitted. 
Run fit_transform first") - return self._forward(self.train_, x) + x2 = x if x2 is None else x2 + return self._forward(x, x2) - def _forward(self, x1: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: - return _scaled_distance(self.lengthscale, x1, x2) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + return self.outputscale * torch.cdist(x1, x2, p=2) class RBFKernel(Stationary): @override - def _forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - ) -> torch.Tensor: - dist_sq = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) - return self.outputscale * torch.exp(-0.5 * dist_sq) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + dist_sq = torch.cdist(x1, x2, p=2) ** 2 + return self.outputscale * torch.exp(-dist_sq / (2 * self.lengthscale**2)) class Matern32Kernel(Stationary): @override - def _forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - ) -> torch.Tensor: - dist = _scaled_distance(self.lengthscale, x1, x2) - return self.outputscale * (1 + sqrt(3.0) * dist) * torch.exp(-sqrt(3.0) * dist) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + dist = torch.cdist(x1, x2, p=2) / self.lengthscale + factor = sqrt(3.0) * dist + matern32 = (1 + factor) * torch.exp(-factor) + return self.outputscale * matern32 -class Matern52Kernel(Stationary): +class HammingKernel(Stationary): @override - def _forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - ) -> torch.Tensor: - dist = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) - return ( - self.outputscale - * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * dist) - * torch.exp(-sqrt(5.0) * dist) - ) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + dists = (x1.unsqueeze(1) != x2.unsqueeze(0)).float().sum(-1) / x1.shape[-1] + scaled_dists = dists / self.lengthscale + return self.outputscale * torch.exp(-scaled_dists) -def _unscaled_square_distance( - X: torch.Tensor, - X2: torch.Tensor | None = None, -) -> torch.Tensor: - """The unscaled distance between X and X2.""" - assert X.ndim == 2 - X1sq = torch.sum(X**2, 1) - X2sq = X1sq if (X2 is None or X is X2) else torch.sum(X2**2, 1) - X2 = X if X2 is None else X2 - - r2 = -2 * X @ X2.T + X1sq[:, None] + X2sq[None, :] - r2 += 1e-15 - return torch.clamp_min(r2, 0.0) - - -def _scaled_distance( - lengthscale: torch.Tensor, - X: torch.Tensor, - X2: torch.Tensor | None = None, - *, - sq_dist: bool = False, -) -> torch.Tensor: - """Compute the *scaled* distance between X and x2 (or, if X2 is not supplied, - the distance between X and itself) by the lengthscale. if a scalar (float) or a - dim=1 lengthscale vector is supplied, then it is assumed that we use one - lengthscale for all dimensions. Otherwise, we have an ARD kernel and in which case - the length of the lengthscale vector must be the same as the dimensionality of the - problem.""" - if len(lengthscale) == 1: - if sq_dist is False: - return torch.sqrt(_unscaled_square_distance(X, X2)) / (lengthscale**2) - - return _unscaled_square_distance(X, X2) / lengthscale - - # ARD kernel - one lengthscale per dimension - assert len(lengthscale) == X.shape[1], ( - f"Lengthscale must have the same dimensionality as the input data." 
- f"Got {len(lengthscale)} and {X.shape[1]}" - ) - rescaled_X = X / lengthscale - if X2 is None: - dist = _unscaled_square_distance(rescaled_X) - else: - rescaled_X2 = X2 / lengthscale - dist = _unscaled_square_distance(rescaled_X, rescaled_X2) - - return dist if sq_dist else torch.sqrt(dist) - - -def _hamming_distance( - lengthscale: torch.Tensor, - X: torch.Tensor, - X2: torch.Tensor | None = None, -) -> torch.Tensor: - if X2 is None: - X2 = X - - indicator = X.unsqueeze(1) != X2 - C = -1 / (2 * lengthscale**2) - scaled_indicator = C * indicator - diffs = scaled_indicator.sum(dim=2) - - if len(lengthscale) == 1: - return torch.exp(diffs) / lengthscale - - return torch.exp(diffs) +class Matern52Kernel(Stationary): + @override + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + dist = torch.cdist(x1, x2, p=2) / self.lengthscale + factor = sqrt(5.0) * dist + matern52 = (1 + factor + (factor**2) / 3) * torch.exp(-factor) + return self.outputscale * matern52 diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py index b1d4cd7e..68d257b1 100644 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py @@ -1,8 +1,13 @@ from __future__ import annotations +from typing import Any, ClassVar, Mapping, Sequence +from typing_extensions import Self + import torch +import torch.nn as nn +from itertools import product -from typing import Sequence +import numpy as np from neps.optimizers.bayesian_optimization.kernels.grakel_replace import ( VertexHistogram, WeisfeilerLehman as _WL, @@ -11,9 +16,17 @@ from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary from neps.search_spaces.encoding import WLInput +GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)]) +GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) + class WeisfilerLehman(Kernel[Sequence[WLInput]]): - """Weisfiler Lehman kernel using grakel functions""" + """Weisfiler Lehman kernel using grakel functions.""" + + suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ + {"h": h, "se_kernel": Stationary(lengthscale=l)} + for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) + ] def __init__( self, @@ -44,7 +57,9 @@ def __init__( self.h = h self.se_kernel = se_kernel - self.layer_weights = layer_weights + self.layer_weights = ( + layer_weights if layer_weights is not None else torch.ones(h + 1) + ) self.oa = oa self.node_label = node_label if node_label != "op_name": @@ -52,7 +67,11 @@ def __init__( self.wl_kernel_: _WL | None = None + def as_optimizable(self) -> Self: + return self.clone_with(layer_weights=nn.Parameter(self.layer_weights)) + def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + self.layer_weights.clamp_(0, 1) self.wl_kernel_ = _WL( h=self.h, base_graph_kernel=( # type: ignore @@ -68,14 +87,12 @@ def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: normalize=True, ) - # TODO: This could probably be lifted to the caller K = self.wl_kernel_.fit_transform(gr) - K = torch.as_tensor(K, dtype=torch.float64) - self.layer_weights_ = self.wl_kernel_.layer_weights return torch.as_tensor(K, dtype=torch.float64) def transform(self, gr: Sequence[WLInput]) -> torch.Tensor: assert self.wl_kernel_ is not None + self.layer_weights.clamp_(0, 1) K = self.wl_kernel_.transform(gr) return torch.as_tensor(K, dtype=torch.float64) diff --git 
a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 6279e973..6ce65b61 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,7 +1,6 @@ from neps.utils.common import MissingDependencyError -from .gp import ComprehensiveGP -from .gp_hierarchy import ComprehensiveGPHierarchy +from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP try: from neps.optimizers.models.deepGP import DeepGP @@ -16,6 +15,5 @@ SurrogateModelMapping = { "deep_gp": DeepGP, "gp": ComprehensiveGP, - "gp_hierarchy": ComprehensiveGPHierarchy, "pfn": PFN_SURROGATE, } diff --git a/neps/optimizers/bayesian_optimization/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py index ffc3606f..a98242a1 100644 --- a/neps/optimizers/bayesian_optimization/models/deepGP.py +++ b/neps/optimizers/bayesian_optimization/models/deepGP.py @@ -1,18 +1,17 @@ from __future__ import annotations -from dataclasses import dataclass, field import logging from copy import deepcopy +from dataclasses import dataclass, field from pathlib import Path import gpytorch import numpy as np import torch -import torch.nn as nn -from neps.search_spaces.architecture.graph_grammar import GraphParameter +from torch import nn from neps.exceptions import SurrogateFailedToFit - +from neps.search_spaces.architecture.graph_grammar import GraphParameter from neps.search_spaces.search_space import ( CategoricalParameter, FloatParameter, @@ -50,9 +49,7 @@ def count_non_improvement_steps(root_directory: Path | str) -> int: class NeuralFeatureExtractor(nn.Module): - """ - Neural network to be used in the DeepGP - """ + """Neural network to be used in the DeepGP.""" def __init__(self, input_size: int, **kwargs): super().__init__() @@ -121,15 +118,11 @@ def forward(self, x, budgets, learning_curves): # put learning curve features into the last layer along with the higher level features. x = torch.cat((x, lc_features), dim=1) - x = self.activation(getattr(self, f"fc{self.n_layers}")(x)) - - return x + return self.activation(getattr(self, f"fc{self.n_layers}")(x)) class GPRegressionModel(gpytorch.models.ExactGP): - """ - A simple GP model. - """ + """A simple GP model.""" def __init__( self, @@ -137,8 +130,7 @@ def __init__( train_y: torch.Tensor, likelihood: gpytorch.likelihoods.GaussianLikelihood, ): - """ - Constructor of the GPRegressionModel. + """Constructor of the GPRegressionModel. Args: train_x: The initial train examples for the GP. 
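
GPRegressionModel above is a plain gpytorch ExactGP, so it can be exercised on its own with the usual exact-GP training loop. The sketch below uses made-up data and is only for orientation, not part of the patch; in the patch itself the GP is fit jointly with the neural feature extractor inside _train_model further down:

import gpytorch
import torch

# Path taken from this file's diff header; it may move elsewhere in the series.
from neps.optimizers.bayesian_optimization.models.deepGP import GPRegressionModel

train_x = torch.randn(16, 8)   # placeholder features
train_y = torch.randn(16)      # placeholder targets

likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GPRegressionModel(train_x=train_x, train_y=train_y, likelihood=likelihood)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

model.train()
likelihood.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # also covers the likelihood noise
for _ in range(50):
    optimizer.zero_grad()
    loss = -mll(model(train_x), train_y)
    loss.backward()
    optimizer.step()

model.eval()
likelihood.eval()
with torch.no_grad():
    pred = likelihood(model(torch.randn(4, 8)))
    mean, var = pred.mean, pred.variance
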
@@ -237,7 +229,7 @@ def encode_configs( def encode_learning_curves(self, learning_curves: list[list[float]]) -> torch.Tensor: lc_height = len(learning_curves) lc_width = max( - max(len(lc) for lc in learning_curves), self.min_learning_curve_length + *(len(lc) for lc in learning_curves), self.min_learning_curve_length ) lc_buffer = torch.full( (lc_width, lc_height), @@ -285,7 +277,7 @@ def _train_model( optimizer = torch.optim.Adam( [ - dict({"params": model.parameters()}, **optimizer_args), + dict({"model_params": model.parameters()}, **optimizer_args), dict({"params": nn.parameters()}, **optimizer_args), ] ) @@ -294,7 +286,7 @@ def _train_model( min_avg_loss_val = np.inf average_loss: float = 0.0 - for epoch_nr in range(0, n_epochs): + for epoch_nr in range(n_epochs): if early_stopping and count_down == 0: logger.info( f"Epoch: {epoch_nr - 1} surrogate training stops due to early " diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 8173aaac..f46e89b9 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,160 +1,132 @@ from __future__ import annotations import logging -import torch -import numpy as np -from typing import Literal, Sequence, Any, Mapping -from typing_extensions import Literal from dataclasses import dataclass, field -from itertools import product +from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence +from typing_extensions import Literal +import torch.nn as nn + +import numpy as np +import torch from neps.optimizers.bayesian_optimization.kernels.kernel import ( Kernel, - NumericKernel, - compute_normalized_log_marginal_likelihood, + log_marginal_likelihood, compute_pd_inverse, ) - -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( + HammingKernel, + Matern52Kernel, +) from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import ( WeisfilerLehman, ) -from neps.search_spaces.encoding import TensorEncodedConfigs -from neps.search_spaces.search_space import SearchSpace - -logger = logging.getLogger(__name__) - -f64 = torch.float64 +from neps.search_spaces import SearchSpace +from neps.search_spaces.encoding import ( + IntegerCategoricalTransformer, + JointTransformer, + MinMaxNormalizer, + OneHotEncoder, + TensorTransformer, + Transformer, + WLInputTransformer, +) +from neps.search_spaces.hyperparameters.float import FloatParameter +from neps.search_spaces.hyperparameters.integer import IntegerParameter +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace -GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)], dtype=f64) -GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) +logger = logging.getLogger(__name__) -def _default_param_grid() -> dict[type[Kernel], list[dict[str, Any]]]: - return { - WeisfilerLehman: [ - {"h": h, "se_kernel": Stationary(lengthscale=l)} - for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) - ] - } +# The optimization we do for the noise is relatively cheap while the matrices +NOISE_VARIANCE_GRID = (1e-6, 1e-4, 1e-2, 1, 1e1, 1e2) @dataclass class ComprehensiveGP: space: SearchSpace - kernels: Sequence[tuple[Kernel, Sequence[str]]] + kernels: dict[str, tuple[Kernel, Transformer]] combined_kernel: Literal["sum", "product"] = "sum" - initial_likelihood: float = 1e-3 - optimize_likelihood: bool = True - max_likelihood: float = 
0.01 + + noise_variance: Sequence[float] = NOISE_VARIANCE_GRID + kernel_parameter_grid: Mapping[str, Sequence[Mapping[str, Any]]] | bool = True + optimizer: Literal["adam", "sgd"] = "adam" - optimizer_iters: int = 20 - optimize_wl_layer_weights: bool = False - surrogate_model_fit_args: Mapping[str, Any] = field(default_factory=dict) optimizer_kwargs: Mapping[str, Any] = field(default_factory=lambda: {"lr": 0.1}) - kernel_hp_grids: Mapping[type[Kernel], Sequence[Mapping[str, Any]]] = field( - default_factory=_default_param_grid - ) + optimizer_iters: int = 20 + device: torch.device | None = None # Post fit attributes - K_i_: torch.Tensor | None = None + K_inv_: torch.Tensor | None = None n_train_: int | None = None likelihood_: float | None = None y_: torch.Tensor | None = None y_normalized_: torch.Tensor | None = None y_mean_: float | None = None y_std_: float | None = None - optimized_kernels_: ( - list[tuple[NumericKernel | WeisfilerLehman, Sequence[str]]] | None - ) = None - kernel_weights_: torch.Tensor | None = None + optimized_kernels_: dict[str, Kernel] | None = None + train_data_: dict[str, Any] | None = None def __post_init__(self): # TODO: Remove when search space is just definition and does not hold values. self.space = self.space.clone() - def fit(self, x: TensorEncodedConfigs, train_y: torch.Tensor) -> None: + def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: # Preprocessing - y_ = torch.as_tensor(train_y, dtype=f64) + y_ = torch.as_tensor(train_y, device=self.device, dtype=torch.float64) # TODO: Dunno if I like this silent hack, setting std to 1 if no std self.y_std_ = s if (s := torch.std(y_).item()) != 0 else 1 self.y_mean_ = torch.mean(y_).item() self.y_normalized_ = (y_ - self.y_mean_) / self.y_std_ - - optimized_kernels: list[ - tuple[NumericKernel | WeisfilerLehman, Sequence[str]] - ] = [] - _grids = self.kernel_hp_grids - - def _eval_kernel(_K: torch.Tensor) -> float: - assert y_ is not None - K_i, logDetK = compute_pd_inverse(_K) - nlml = -compute_normalized_log_marginal_likelihood(K_i, logDetK, y_) - return float(nlml) - - for kernel, hps in self.kernels: - if isinstance(kernel, WeisfilerLehman): - assert len(hps) == 1, "Only support single kernel per graph." 
- _xs = x.wl_graph_input(hps[0]) - elif isinstance(kernel, NumericKernel): - _xs = x.tensor(hps) - else: - raise ValueError(f"Unsupported kernel type {type(kernel)}") - - grid = next((g for t, g in _grids.items() if isinstance(kernel, t)), None) - if grid is None: - optimized_kernel = kernel.clone() - _ = optimized_kernel.fit_transform(_xs) # type: ignore - optimized_kernels.append((kernel, hps)) - continue - - optimized_kernel, _ = kernel.grid_search( - x=_xs, # type: ignore + self.y_ = y_ + + _data = { + key: transformer.encode(x, self.space) + for key, (_, transformer) in self.kernels.items() + } + + # optimized kernel parameters + noise variance + optim_vars: list[nn.Parameter] = [] + + grids = { + name: k.suggested_grid + for name, (k, _) in self.kernels.items() + if k.suggested_grid is not None + } + + kernels: dict[str, Kernel] = {} + for kernel_name, (kernel, _) in self.kernels.items(): + xs = _data[kernel_name] + grid = grids[kernel_name] + + maybe_optimized_kernel = kernel.grid_search( + x=xs, + y=self.y_normalized_, grid=grid, - to_minimize=_eval_kernel, ) - optimized_kernels.append((optimized_kernel, hps)) - - # Optimization weights - likelihood = torch.tensor( - self.initial_likelihood, - requires_grad=self.optimize_likelihood, + if isinstance(maybe_optimized_kernel, Exception): + raise ValueError( + f"Failed to optimize kernel {kernel_name} with grid {grid}." + ) from maybe_optimized_kernel + + opt_kernel, _ = maybe_optimized_kernel + gradient_enabled_kernel = opt_kernel.as_optimizable() + kernels[kernel_name] = gradient_enabled_kernel + + optim_vars.extend(gradient_enabled_kernel.parameters()) + + # Now that we've optimized the kernels, we convert go convert their + # parameters into a tensor we can further refine with some optimizer iterations + # - Optimize kernel-lengthscales, kernel-outputscale, noise-variance + # and any additional parameters they wish to advertise. 
+ noise_variance = nn.Parameter( + torch.tensor(1e-3, device=self.device, dtype=torch.float64) ) + optim_vars.append(noise_variance) - kernel_weights = torch.ones( - len(optimized_kernels), - requires_grad=len(optimized_kernels) > 1, - dtype=f64, - ) - should_optimize = lambda p: p.is_leaf and p.requires_grad - - # Linking the optimizer variables to the sum kernel - optim_vars: list[torch.Tensor] = [ - a - for a in (kernel_weights, likelihood) - if a is not None and should_optimize(a) - ] - layer_weights = [ - kernel.layer_weights_ - for kernel, _ in optimized_kernels - if isinstance(kernel, WeisfilerLehman) - and kernel.layer_weights_ is not None - and should_optimize(kernel.layer_weights_) - ] - lengthscales = [ - kernel.layer_weights_ - for kernel, _ in optimized_kernels - if isinstance(kernel, NumericKernel) and should_optimize(kernel.lengthscale) - ] - lengthscalebounds = [ - kernel.lengthscale_bounds - for kernel, _ in optimized_kernels - if isinstance(kernel, NumericKernel) and should_optimize(kernel.lengthscale) - ] - - # Select the optimizer if self.optimizer == "adam": optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore elif self.optimizer == "sgd": @@ -162,106 +134,153 @@ def _eval_kernel(_K: torch.Tensor) -> float: else: raise ValueError(f"Invalid optimizer {self.optimizer}") - K: torch.Tensor | None = None + K_inv: torch.Tensor | None = None N = len(x) - for _ in range(self.optimizer_iters): + for i in range(self.optimizer_iters): optim.zero_grad() - # Now we iterate over kernels to build up K _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - K = _init(N, N, dtype=f64) - for (kernel, hps), weight in zip(self.kernels, kernel_weights): - if isinstance(kernel, WeisfilerLehman): - assert len(hps) == 1, "Only support single kernel per graph." - _xs = x.wl_graph_input(hps[0]) - gram = kernel.fit_transform(_xs) - elif isinstance(kernel, NumericKernel): - _xs = x.tensor(hps) - gram = kernel.fit_transform(_xs) - else: - raise ValueError(f"Unsupported kernel type {type(kernel)}") + K = _init(N, N, device=self.device, dtype=torch.float64) + for kernel_name, kernel in kernels.items(): + data = _data[kernel_name] + gram = kernel.forward(data, data) if self.combined_kernel == "sum": - K.add_(weight * gram) - elif self.combined_kernel == "product": - K.mul_(weight * gram) + K.add_(gram) else: - raise ValueError(f"Invalid combined_kernel {self.combined_kernel}") + K.mul_(gram) - # Normalize - K_diag = torch.sqrt(torch.diag(K)) - K /= torch.ger(K_diag, K_diag) - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + K.diag().add_(noise_variance) - # If there's nothing to optimize, break out early - if len(optim_vars) == 0: - break + K_inv, logDetK = compute_pd_inverse(K) + nlml = -log_marginal_likelihood(K_inv, logDetK, y=self.y_normalized_) - nlml = -compute_normalized_log_marginal_likelihood( - K_i, logDetK, y=self.y_normalized_ - ) + # TODO: Could early stop here... 
nlml.backward() optim.step() with torch.no_grad(): - kernel_weights.clamp_(0.0, 1.0) - if likelihood.is_leaf: - likelihood.clamp_(1e-9, self.max_likelihood) - - for ls, ls_bounds in zip(lengthscales, lengthscalebounds): - ls.clamp_(*ls_bounds) - - for lw in layer_weights: - lw.clamp_(0.0, 1.0) - - optim.zero_grad() - - assert K is not None - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + noise_variance.clamp_(1e-6, np.inf) # Apply the optimal hyperparameters - self.K_i_ = K_i.clone() - self.likelihood_ = likelihood.item() - self.optimized_kernels_ = optimized_kernels - self.kernel_weights_ = kernel_weights + assert K_inv is not None + self.K_inv_ = K_inv.clone() + self.noise_variance_ = noise_variance.item() + self.optimized_kernels_ = kernels self.n_train_ = N - - def predict(self, x: TensorEncodedConfigs) -> tuple[torch.Tensor, torch.Tensor]: - """Kriging predictions""" - if self.K_i_ is None or self.n_train_ is None or self.kernel_weights_ is None: + self.train_data_ = _data + + def predict(self, x: list[dict[str, Any]]) -> tuple[torch.Tensor, torch.Tensor]: + """Kriging predictions.""" + if ( + self.K_inv_ is None + or self.n_train_ is None + or self.optimized_kernels_ is None + or self.train_data_ is None + or self.y_normalized_ is None + or self.y_std_ is None + ): raise ValueError( "Inverse of Gram matrix is not instantiated. Please call the optimize " "function to fit on the training data first!" ) + _data = { + key: transformer.encode(x, self.space) + for key, (_, transformer) in self.kernels.items() + } _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - N = self.n_train_ + len(x) - K = _init(N, N, dtype=f64) - for (kernel, hps), weight in zip(self.kernels, self.kernel_weights_): - if isinstance(kernel, WeisfilerLehman): - assert len(hps) == 1, "Only support single kernel per graph." 
- _x_test = x.wl_graph_input(hps[0]) - gram = kernel.transform(_x_test) - elif isinstance(kernel, NumericKernel): - _x_test = x.tensor(hps) - gram = kernel.fit_transform(_x_test) + n_test = len(x) + + K_train_test = _init( + self.n_train_, n_test, device=self.device, dtype=torch.float64 + ) + K_test_test = _init(n_test, n_test, device=self.device, dtype=torch.float64) + + for kernel_name, kernel in self.optimized_kernels_.items(): + train_x = self.train_data_[kernel_name] + test_x = _data[kernel_name] + + gram = kernel.forward(train_x, test_x) + if self.combined_kernel == "sum": + K_train_test.add_(gram) else: - raise ValueError(f"Unsupported kernel type {type(kernel)}") + K_train_test.mul_(gram) + gram = kernel.forward(test_x, test_x) if self.combined_kernel == "sum": - K.add_(weight * gram) - elif self.combined_kernel == "product": - K.mul_(weight * gram) + K_test_test.add_(gram) else: + K_test_test.mul_(gram) - K_s = K[: self.n_train_ :, self.n_train_ :] - K_ss = K[self.n_train_ :, self.n_train_ :] + self.likelihood_ * torch.eye(len(x)) + # Compute the predictive mean - mu_s = K_s.t() @ self.K_i_ @ self.y_normalized_ + # Scale by the standard deviation and mean + mu_s = K_train_test.t() @ self.K_inv_ @ self.y_normalized_ mu_s = mu_s * self.y_std_ + self.y_mean_ - cov_s = K_ss - K_s.t() @ self.K_i_ @ K_s - cov_s = torch.clamp(cov_s, self.likelihood_, np.inf) - cov_s = (torch.sqrt(cov_s) * self.y_std_) ** 2 + cov_s = K_test_test - K_train_test.t() @ self.K_inv_ @ K_train_test + cov_s.diagonal().clamp_(self.noise_variance_, np.inf) + cov_s *= self.y_std_**2 return mu_s, cov_s + + @classmethod + def get_default( + cls, space: SearchSpace, *, include_fidelities: bool = False + ) -> ComprehensiveGP: + kernels = get_default_kernels(space=space, include_fidelities=include_fidelities) + return cls(space=space, kernels=kernels) + + +def get_default_kernels( + *, + space: SearchSpace, + include_fidelities: bool = False, +) -> dict[str, tuple[Kernel, Transformer]]: + kernels: dict[str, tuple[Kernel, Transformer]] = {} + + # We will always need to use a graph kernel for graphs and there's no + # possibility to embed them into a tensor. 
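The predictive mean and covariance computed above are the usual GP posterior (kriging) equations, mu_s = K_s^T K^{-1} y and cov_s = K_ss - K_s^T K^{-1} K_s, followed by rescaling with the stored target mean and standard deviation. A compact sketch of the core computation on plain tensors, with illustrative names:

from __future__ import annotations

import torch


def gp_posterior(
    K_inv: torch.Tensor,         # (n_train, n_train), inverse of the noised train Gram matrix
    K_train_test: torch.Tensor,  # (n_train, n_test)
    K_test_test: torch.Tensor,   # (n_test, n_test)
    y_train: torch.Tensor,       # (n_train,), normalised training targets
) -> tuple[torch.Tensor, torch.Tensor]:
    mean = K_train_test.t() @ K_inv @ y_train
    cov = K_test_test - K_train_test.t() @ K_inv @ K_train_test
    return mean, cov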
+ if any(space.graphs): + for hp_name in space.graphs: + kernels[f"graph_{hp_name}"] = ( + WeisfilerLehman(h=2, oa=True), + WLInputTransformer((hp_name,)), + ) + + assert all( + isinstance(f, (IntegerParameter, FloatParameter)) for f in space.fidelities + ), "Assumption for numeric represetnation of fidelity broken" + + any_numerical = any(space.numerical) or (include_fidelities and any(space.fidelities)) + if any_numerical: + # At least one numerical, fuse numeric + categoricals into one tensor encoding + transformers: list[TensorTransformer] = [] + if any(space.categoricals): + transformers.append(OneHotEncoder(tuple(space.categoricals))) + + if include_fidelities: + min_max_normalizer = MinMaxNormalizer( + tuple(space.numerical) + tuple(space.fidelities) + ) + else: + min_max_normalizer = MinMaxNormalizer(tuple(space.numerical)) + + transformers.append(min_max_normalizer) + kernels["vectorial"] = (Matern52Kernel(), JointTransformer.join(*transformers)) + else: + # At this point, we assume only categoricals and maybe fidelities + assert any(space.categoricals) + + if include_fidelities and any(space.fidelities): + fid_normalizer = MinMaxNormalizer(tuple(space.fidelities)) + one_hot_encoder = OneHotEncoder(tuple(space.categoricals)) + + transformer = JointTransformer.join(one_hot_encoder, fid_normalizer) + kernels["vectorial"] = (Matern52Kernel(), transformer) + else: + transformer = IntegerCategoricalTransformer(tuple(space.categoricals)) + kernels["categorical"] = (HammingKernel(), transformer) + + return kernels diff --git a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py b/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py deleted file mode 100644 index 2c9993be..00000000 --- a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py +++ /dev/null @@ -1,957 +0,0 @@ -import itertools -import logging -import warnings -from copy import deepcopy -from typing import Iterable, Union - -import numpy as np -import torch - -from ..kernels.combine_kernels_hierarchy import ProductKernel, SumKernel - -# GP model as a weighted average between the vanilla vectorial GP and the graph GP -from ..kernels.graph_kernel import GraphKernels -from ..kernels.utils import extract_configs_hierarchy -from ..kernels.vectorial_kernels import Stationary -from ..kernels.weisfilerlehman import WeisfilerLehman - -import logging - -logger = logging.getLogger(__name__) - - -# Code for psd_safe_cholesky from gypytorch -class _value_context: - _global_value = None - - @classmethod - def value(cls): - return cls._global_value - - @classmethod - def _set_value(cls, value): - cls._global_value = value - - def __init__(self, value): - self._orig_value = self.__class__.value() - self._instance_value = value - - def __enter__( - self, - ): - self.__class__._set_value(self._instance_value) - - def __exit__(self, *args): - self.__class__._set_value(self._orig_value) - return False - - -class _dtype_value_context: - _global_float_value = None - _global_double_value = None - _global_half_value = None - - @classmethod - def value(cls, dtype): - if torch.is_tensor(dtype): - dtype = dtype.dtype - if dtype == torch.float: - return cls._global_float_value - elif dtype == torch.double: - return cls._global_double_value - elif dtype == torch.half: - return cls._global_half_value - else: - raise RuntimeError(f"Unsupported dtype for {cls.__name__}.") - - @classmethod - def _set_value(cls, float_value, double_value, half_value): - if float_value is not None: - cls._global_float_value = float_value - if 
double_value is not None: - cls._global_double_value = double_value - if half_value is not None: - cls._global_half_value = half_value - - def __init__(self, float=None, double=None, half=None): - self._orig_float_value = self.__class__.value() - self._instance_float_value = float - self._orig_double_value = self.__class__.value() - self._instance_double_value = double - self._orig_half_value = self.__class__.value() - self._instance_half_value = half - - def __enter__( - self, - ): - self.__class__._set_value( - self._instance_float_value, - self._instance_double_value, - self._instance_half_value, - ) - - def __exit__(self, *args): - self.__class__._set_value( - self._orig_float_value, self._orig_double_value, self._orig_half_value - ) - return False - - -class cholesky_jitter(_dtype_value_context): - """ - The jitter value used by `psd_safe_cholesky` when using cholesky solves. - - Default for `float`: 1e-6 - - Default for `double`: 1e-8 - """ - - _global_float_value = 1e-6 # type: ignore[assignment] - _global_double_value = 1e-8 # type: ignore[assignment] - - @classmethod - def value(cls, dtype=None): - if dtype is None: - # Deprecated in 1.4: remove in 1.5 - warnings.warn( - "cholesky_jitter is now a _dtype_value_context and should be called with a dtype argument", - DeprecationWarning, - ) - return cls._global_float_value - return super().value(dtype=dtype) - - -class _feature_flag: - r"""Base class for feature flag settings with global scope. - The default is set via the `_default` class attribute. - """ - - _default = False - _state = None - - @classmethod - def is_default(cls): - return cls._state is None - - @classmethod - def on(cls): - if cls.is_default(): - return cls._default - return cls._state - - @classmethod - def off(cls): - return not cls.on() - - @classmethod - def _set_state(cls, state): - cls._state = state - - def __init__(self, state=True): - self.prev = self.__class__._state - self.state = state - - def __enter__(self): - self.__class__._set_state(self.state) - - def __exit__(self, *args): - self.__class__._set_state(self.prev) - return False - - -class verbose_linalg(_feature_flag): - """ - Print out information whenever running an expensive linear algebra routine (e.g. Cholesky, CG, Lanczos, CIQ, etc.) - (Default: False) - """ - - _default = False - - # Create a global logger - logger = logging.getLogger("LinAlg (Verbose)") - logger.setLevel(logging.DEBUG) - - # Output logging results to the stdout stream - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s") - ch.setFormatter(formatter) - logger.addHandler(ch) - - -class cholesky_max_tries(_value_context): - """ - The max_tries value used by `psd_safe_cholesky` when using cholesky solves. - (Default: 3) - """ - - _global_value = 3 # type: ignore[assignment] - - -class NumericalWarning(RuntimeWarning): - """ - Warning thrown when convergence criteria are not met, or when comptuations require extra stability. 
- """ - - pass - - -class NanError(RuntimeError): - pass - - -class NotPSDError(RuntimeError): - pass - - -def _psd_safe_cholesky(A, out=None, jitter=None, max_tries=None): - # Maybe log - if verbose_linalg.on(): - verbose_linalg.logger.debug(f"Running Cholesky on a matrix of size {A.shape}.") - - if out is not None: - out = (out, torch.empty(A.shape[:-2], dtype=torch.int32, device=out.device)) - - L, info = torch.linalg.cholesky_ex(A, out=out) - if not torch.any(info): - return L - - isnan = torch.isnan(A) - if isnan.any(): - raise NanError( - f"cholesky_cpu: {isnan.sum().item()} of {A.numel()} elements of the {A.shape} tensor are NaN." - ) - - if jitter is None: - jitter = cholesky_jitter.value(A.dtype) - if max_tries is None: - max_tries = cholesky_max_tries.value() - Aprime = A.clone() - jitter_prev = 0 - for i in range(max_tries): - jitter_new = jitter * (10**i) - # add jitter only where needed - diag_add = ( - ((info > 0) * (jitter_new - jitter_prev)) - .unsqueeze(-1) - .expand(*Aprime.shape[:-1]) - ) - Aprime.diagonal(dim1=-1, dim2=-2).add_(diag_add) - jitter_prev = jitter_new - warnings.warn( - f"A not p.d., added jitter of {jitter_new:.1e} to the diagonal", - NumericalWarning, - ) - L, info = torch.linalg.cholesky_ex(Aprime, out=out) - if not torch.any(info): - return L - raise NotPSDError( - f"Matrix not positive definite after repeatedly adding jitter up to {jitter_new:.1e}." - ) - - -def psd_safe_cholesky(A, upper=False, out=None, jitter=None, max_tries=None): - """Compute the Cholesky decomposition of A. If A is only p.s.d, add a small jitter to the diagonal. - Args: - A (Tensor): - The tensor to compute the Cholesky decomposition of - upper (bool, optional): - See torch.cholesky - out (Tensor, optional): - See torch.cholesky - jitter (float, optional): - The jitter to add to the diagonal of A in case A is only p.s.d. If omitted, - uses settings.cholesky_jitter.value() - max_tries (int, optional): - Number of attempts (with successively increasing jitter) to make before raising an error. - """ - L = _psd_safe_cholesky(A, out=out, jitter=jitter, max_tries=max_tries) - if upper: - if out is not None: - out = out.transpose_(-1, -2) - else: - L = L.transpose(-1, -2) - return L - - -# Code for psd_safe_cholesky from gypytorch - - -class ComprehensiveGPHierarchy: - def __init__( - self, - graph_kernels: Iterable, - hp_kernels: Iterable, - likelihood: float = 1e-3, - weights=None, - learn_all_h=False, - graph_feature_ard=True, - d_graph_features: int = 0, - normalize_combined_kernel=True, - hierarchy_consider: list = None, # or a list of integers e.g. 
[0,1,2,3] - vectorial_features: list = None, - combined_kernel: str = "sum", - verbose: bool = False, - surrogate_model_fit_args: dict = None, - gpytorch_kinv: bool = False, - ): - self.likelihood = likelihood - self.surrogate_model_fit_args = surrogate_model_fit_args or {} - self.learn_all_h = learn_all_h - self.hierarchy_consider = hierarchy_consider - self.normalize_combined_kernel = normalize_combined_kernel - if self.hierarchy_consider is None: - self.learn_all_h = False - self.domain_kernels: list = [] - if bool(graph_kernels): - self.domain_kernels += list(graph_kernels) - if bool(hp_kernels): - self.domain_kernels += list(hp_kernels) - - self.hp_kernels = hp_kernels # impose on scalar graph features - self.n_kernels: int = len(self.domain_kernels) - self.n_graph_kernels: int = len( - [i for i in self.domain_kernels if isinstance(i, GraphKernels)] - ) - self.n_vector_kernels: int = self.n_kernels - self.n_graph_kernels - self.graph_feature_ard = graph_feature_ard - self.vectorial_features = vectorial_features - self.d_graph_features = d_graph_features - - if weights is not None: - self.fixed_weights = True - if weights is not None: - assert len(weights) == self.n_kernels, ( - "the weights vector, if supplied, needs to have the same length as " - "the number of kernel_operators!" - ) - self.init_weights = ( - weights - if isinstance(weights, torch.Tensor) - else torch.tensor(weights).flatten() - ) - else: - self.fixed_weights = False - # Initialise the domain kernel weights to uniform - self.init_weights = torch.tensor( - [1.0 / self.n_kernels] * self.n_kernels, - ) - - self.weights = self.init_weights.clone() - - if combined_kernel == "product": - self.combined_kernel = ProductKernel( - *self.domain_kernels, - weights=self.weights, - hierarchy_consider=self.hierarchy_consider, - d_graph_features=self.d_graph_features, - ) - elif combined_kernel == "sum": - self.combined_kernel = SumKernel( - *self.domain_kernels, - weights=self.weights, - hierarchy_consider=self.hierarchy_consider, - d_graph_features=self.d_graph_features, - ) - else: - raise NotImplementedError( - f'Combining kernel {combined_kernel} is not yet implemented! Only "sum" ' - f'or "product" are currently supported. ' - ) - # Verbose mode - self.verbose = verbose - # Cache the Gram matrix inverse and its log-determinant - self.K, self.K_i, self.logDetK = [None] * 3 - self.layer_weights = None - self.nlml = None - - self.x_configs: list = None # type: ignore[assignment] - self.y: torch.Tensor = None - self.y_: torch.Tensor = None - self.y_mean: torch.Tensor = None - self.y_std: torch.Tensor = None - self.n: int = None # type: ignore[assignment] - - self.gpytorch_kinv = gpytorch_kinv - - def _optimize_graph_kernels(self, h_: int, lengthscale_): - weights = self.init_weights.clone() - if self.hierarchy_consider is None: - graphs, _ = extract_configs_hierarchy( - self.x_configs, - d_graph_features=self.d_graph_features, - hierarchy_consider=self.hierarchy_consider, - ) - for i, k in enumerate(self.combined_kernel.kernels): - if not isinstance(k, GraphKernels): - continue - elif isinstance(k, WeisfilerLehman): - _grid_search_wl_kernel( - k, - h_, - [x[i] for x in graphs] - if isinstance(graphs[0], list) - else [c for c in graphs], - self.y, - self.likelihood, - lengthscales=lengthscale_, - gpytorch_kinv=self.gpytorch_kinv, - ) - else: - logging.warning( - "(Graph) kernel optimisation for " - + type(k).__name__ - + " not implemented yet." 
- ) - else: - if self.learn_all_h: - best_nlml = torch.tensor(np.inf) - best_subtree_depth_combo = None - best_K = None - train_y = self.y - h_combo_candidates = generate_h_combo_candidates(self.hierarchy_consider) - - for h_combo in h_combo_candidates: - for i, k in enumerate(self.combined_kernel.kernels): - if isinstance(k, WeisfilerLehman): - k.change_kernel_params({"h": h_combo[i]}) - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - layer_weights=None, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse( - K, self.likelihood, self.gpytorch_kinv - ) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - if nlml < best_nlml: - best_nlml = nlml - best_subtree_depth_combo = h_combo - best_K = torch.clone(K) - for i, k in enumerate(self.combined_kernel.kernels): - if isinstance(k, WeisfilerLehman): - k.change_kernel_params({"h": best_subtree_depth_combo[i]}) # type: ignore[index] - self.combined_kernel._gram = best_K - else: - best_nlml = torch.tensor(np.inf) - best_subtree_depth = None - best_K = None - train_y = self.y - - for h_i in list(h_): # type: ignore[call-overload] - # only optimize h in wl kernel - if isinstance(self.combined_kernel.kernels[0], WeisfilerLehman): - self.combined_kernel.kernels[0].change_kernel_params({"h": h_i}) - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - layer_weights=None, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse( - K, self.likelihood, self.gpytorch_kinv - ) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - if nlml < best_nlml: - best_nlml = nlml - best_subtree_depth = h_i - best_K = torch.clone(K) - if isinstance(self.combined_kernel.kernels[0], WeisfilerLehman): - self.combined_kernel.kernels[0].change_kernel_params( - {"h": best_subtree_depth} - ) - self.combined_kernel._gram = best_K - - def fit(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): - self._fit(train_x, train_y, **self.surrogate_model_fit_args) - - def _fit( - self, - train_x: Iterable, - train_y: Union[Iterable, torch.Tensor], - iters: int = 20, - optimizer: str = "adam", - wl_subtree_candidates: tuple = tuple(range(5)), - wl_lengthscales: tuple = tuple( - np.e**i - for i in range(-2, 3) # type: ignore[name-defined] - ), - optimize_lik: bool = True, - max_lik: float = 0.5, - optimize_wl_layer_weights: bool = False, - optimizer_kwargs: dict = None, - ): - # Called by self._fit - self._reset_XY(train_x, train_y) - - # Get the node weights, if needed - if optimizer_kwargs is None: - optimizer_kwargs = {"lr": 0.1} - if len(wl_subtree_candidates) > 0: - self._optimize_graph_kernels( - wl_subtree_candidates, # type: ignore[arg-type] - wl_lengthscales, - ) - - weights = self.init_weights.clone() - - if (not self.fixed_weights) and len(self.domain_kernels) > 1: - weights.requires_grad_(True) - - # set the prior values for the lengthscales of the two global features of the final architecture graph - if self.graph_feature_ard: - theta_vector = torch.log(torch.tensor([0.6, 0.6])) - else: - theta_vector = torch.log(torch.tensor([0.6])) - - # if use continuous graph properties and we set to use stationary kernels - if self.d_graph_features > 0 and len(self.hp_kernels) > 0: # type: ignore[arg-type] - # TODO modify the code on theta_vector betlow to be compatibale with HPO - # theta in this case are the lengthscales for the two global property of - # 
the final architecture graph - # theta_vector = get_theta_vector(vectorial_features=self.vectorial_features) - theta_vector.requires_grad_(True) - - # Whether to include the likelihood (jitter or noise variance) as a hyperparameter - likelihood = torch.tensor( - self.likelihood, - ) - if optimize_lik: - likelihood.requires_grad_(True) - - layer_weights = None - if optimize_wl_layer_weights: - for k in self.domain_kernels: - if isinstance(k, WeisfilerLehman): - layer_weights = torch.ones(k.h + 1).requires_grad_(True) - if layer_weights.shape[0] <= 1: - layer_weights = None - else: - break - - # Linking the optimizer variables to the sum kernel - optim_vars = [] - # if theta_vector is not None: # TODO used for HPO - # for a in theta_vector.values(): - # if a is not None and a.requires_grad: - # optim_vars.append(a) - # if we use graph features, we will optimize the corresponding stationary kernel lengthscales - if self.d_graph_features > 0 and theta_vector.requires_grad: - optim_vars.append(theta_vector) - - for a in [weights, likelihood, layer_weights]: - if a is not None and a.is_leaf and a.requires_grad: - optim_vars.append(a) - - nlml = None - if len(optim_vars) == 0: # Skip optimisation - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - feature_lengthscale=torch.exp(theta_vector), - layer_weights=layer_weights, - rebuild_model=True, - ) - K_i, logDetK = compute_pd_inverse(K, likelihood, self.gpytorch_kinv) - else: - # Select the optimizer - assert optimizer.lower() in ["adam", "sgd"] - if optimizer.lower() == "adam": - optim = torch.optim.Adam(optim_vars, **optimizer_kwargs) - else: - optim = torch.optim.SGD(optim_vars, **optimizer_kwargs) - - K = None - optim_vars_list = [] - nlml_list = [] - for i in range(iters): - optim.zero_grad() - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - feature_lengthscale=torch.exp(theta_vector), - layer_weights=layer_weights, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse(K, likelihood, self.gpytorch_kinv) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, self.y) - nlml.backward(create_graph=True) - if self.verbose and i % 10 == 0: - logger.info( - "Iteration:", - i, - "/", - iters, - "Negative log-marginal likelihood:", - nlml.item(), - theta_vector, - weights, - likelihood, - ) - optim.step() - - with torch.no_grad(): - likelihood.clamp_( - 1e-5, max_lik - ) if likelihood is not None and likelihood.is_leaf else None - - optim_vars_list.append( - [ - theta_vector.clone().detach(), - weights.clone().detach(), - likelihood.clone().detach(), - ] - ) - nlml_list.append(nlml.item()) - - optim.zero_grad(set_to_none=True) - - theta_vector, weights, likelihood = optim_vars_list[np.argmin(nlml_list)] - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - feature_lengthscale=torch.exp(theta_vector), - layer_weights=layer_weights, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse(K, likelihood, self.gpytorch_kinv) - - # Apply the optimal hyperparameters - # transform the weights in the combine_kernel function - self.weights = weights - self.K_i = K_i.clone() - self.K = K.clone() - self.logDetK = logDetK.clone() - self.likelihood = likelihood.item() - self.theta_vector = theta_vector - self.layer_weights = layer_weights - self.nlml = nlml.detach().cpu() if nlml is not None else None - - for 
k in self.combined_kernel.kernels: - if isinstance(k, Stationary): - k.update_lengthscales(lengthscale=torch.exp(theta_vector)) - - self.combined_kernel.weights = weights.clone() - - def predict(self, x_configs, preserve_comp_graph: bool = False): - """Kriging predictions""" - - if not isinstance(x_configs, list): - # Convert a single input X_s to a singleton list - x_configs = [x_configs] - - if self.K_i is None or self.logDetK is None: - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize " - "function to fit on the training data first!" - ) - - # Concatenate the full list - X_configs_all = self.x_configs + x_configs - - # Make a copy of the sum_kernels for this step, to avoid breaking the autodiff - # if grad guided mutation is used - if preserve_comp_graph: - combined_kernel_copy = deepcopy(self.combined_kernel) - else: - combined_kernel_copy = self.combined_kernel - - K_full = combined_kernel_copy.fit_transform( - self.weights, - X_configs_all, - layer_weights=self.layer_weights, - normalize=self.normalize_combined_kernel, - feature_lengthscale=torch.exp(self.theta_vector), - rebuild_model=True, - save_gram_matrix=False, - gp_fit=False, - ) - - K_s = K_full[: self.n :, self.n :] - - K_ss = K_full[self.n :, self.n :] + self.likelihood * torch.eye( - len(x_configs), - ) - - mu_s = K_s.t() @ self.K_i @ self.y - cov_s = K_ss - K_s.t() @ self.K_i @ K_s - # TODO not taking the diag? - cov_s = torch.clamp(cov_s, self.likelihood, np.inf) - mu_s = unnormalize_y(mu_s, self.y_mean, self.y_std) - std_s = torch.sqrt(cov_s) - std_s = unnormalize_y(std_s, None, self.y_std, True) - cov_s = std_s**2 - if preserve_comp_graph: - del combined_kernel_copy - return mu_s, cov_s - - @property - def x(self): - return self.x_configs - - def _reset_XY(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): - self.x_configs = train_x # type: ignore[assignment] - self.n = len(self.x_configs) - train_y_tensor = ( - train_y - if isinstance(train_y, torch.Tensor) - else torch.tensor(train_y, dtype=torch.get_default_dtype()) - ) - self.y_ = train_y_tensor - self.y, self.y_mean, self.y_std = normalize_y(train_y_tensor) - # The Gram matrix of the training data - self.K_i, self.logDetK = None, None - - -def get_grad(grad_matrix, feature_matrix, average_occurrences=False): - r""" - Average across the samples via a Monte Carlo sampling scheme. Also estimates the - empirical variance. :param average_occurrences: if True, do a weighted summation - based on the frequency distribution of the occurrence to compute a gradient *per - each feature*. Otherwise, each different occurrence (\phi_i = k) will get a - different gradient estimate. 
- """ - assert grad_matrix.shape == feature_matrix.shape - # Prune out the all-zero columns that pop up sometimes - valid_cols = [] - for col_idx in range(feature_matrix.size(1)): - if not torch.all(feature_matrix[:, col_idx] == 0): - valid_cols.append(col_idx) - feature_matrix = feature_matrix[:, valid_cols] - grad_matrix = grad_matrix[:, valid_cols] - - _, D = feature_matrix.shape - if average_occurrences: - avg_grad = torch.zeros(D) - avg_grad_var = torch.zeros(D) - for d in range(D): - current_feature = feature_matrix[:, d].clone().detach() - instances, indices, counts = torch.unique( - current_feature, return_inverse=True, return_counts=True - ) - weight_vector = torch.tensor([counts[i] for i in indices]).type(torch.float) - weight_vector /= weight_vector.sum() - mean = torch.sum(weight_vector * grad_matrix[:, d]) - # Compute the empirical variance of gradients - variance = torch.sum(weight_vector * grad_matrix[:, d] ** 2) - mean**2 - avg_grad[d] = mean - avg_grad_var[d] = variance - return avg_grad, avg_grad_var, feature_matrix.sum(dim=0) - else: - # The maximum number possible occurrences -- 7 is an example, if problem occurs, maybe we can increase this - # number. But for now, for both NAS-Bench datasets, this should be more than enough! - max_occur = 7 - avg_grad = torch.zeros(D, max_occur) - avg_grad_var = torch.zeros(D, max_occur) - incidences = torch.zeros(D, max_occur) - for d in range(D): - current_feature = feature_matrix[:, d].clone().detach() - instances, indices, counts = torch.unique( - current_feature, return_inverse=True, return_counts=True - ) - for i, val in enumerate(instances): - # Find index of all feature counts that are equal to the current val - feature_at_val = grad_matrix[current_feature == val] - avg_grad[d, int(val)] = torch.mean(feature_at_val) - avg_grad_var[d, int(val)] = torch.var(feature_at_val) - incidences[d, int(val)] = counts[i] - return avg_grad, avg_grad_var, incidences - - -# Optimize Graph kernel -def getBack(var_grad_fn, logger): - logger.debug(var_grad_fn) - for n in var_grad_fn.next_functions: - if n[0]: - try: - tensor = getattr(n[0], "variable") - logger.debug(n[0]) - logger.debug(f"Tensor with grad found: {tensor}") - logger.debug(f" - gradient: {tensor.grad}") - except AttributeError: - getBack(n[0], logger) - - -def _grid_search_wl_kernel( - k: WeisfilerLehman, - subtree_candidates, - train_x: list, - train_y: torch.Tensor, - lik: float, - subtree_prior=None, - lengthscales=None, - lengthscales_prior=None, - gpytorch_kinv: bool = False, -): - """Optimize the *discrete hyperparameters* of Weisfeiler Lehman kernel. 
- k: a Weisfeiler-Lehman kernel instance - hyperparameter_candidate: list of candidate hyperparameter to try - train_x: the train data - train_y: the train label - lik: likelihood - lengthscale: if using RBF kernel for successive embedding, the list of lengthscale to be grid searched over - """ - # lik = 1e-6 - assert len(train_x) == len(train_y) - best_nlml = torch.tensor(np.inf) - best_subtree_depth = None - best_lengthscale = None - best_K = None - if lengthscales is not None and k.se is not None: - candidates = [(h_, l_) for h_ in subtree_candidates for l_ in lengthscales] - else: - candidates = [(h_, None) for h_ in subtree_candidates] - - for i in candidates: - if k.se is not None: - k.change_se_params({"lengthscale": i[1]}) - k.change_kernel_params({"h": i[0]}) - K = k.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) - K_i, logDetK = compute_pd_inverse(K, lik, gpytorch_kinv) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - if nlml < best_nlml: - best_nlml = nlml - best_subtree_depth, best_lengthscale = i - best_K = torch.clone(K) - k.change_kernel_params({"h": best_subtree_depth}) - if k.se is not None: - k.change_se_params({"lengthscale": best_lengthscale}) - k.gram_ = best_K - - -def get_theta_vector(vectorial_features): - if vectorial_features is None: - return None - theta_vector = {} - for key, dim in vectorial_features.items(): - t = torch.ones(dim) - if t.shape[0] > 1: - t.requires_grad_(True) - theta_vector[key] = t - return theta_vector - - -def normalize_y(y: torch.Tensor): - y_mean = torch.mean(y) if isinstance(y, torch.Tensor) else np.mean(y) - y_std = torch.std(y) if isinstance(y, torch.Tensor) else np.std(y) - if y_std == 0: - y_std = 1 - y = (y - y_mean) / y_std - return y, y_mean, y_std - - -def unnormalize_y(y, y_mean, y_std, scale_std=False): - """Similar to the undoing of the pre-processing step above, but on the output predictions""" - if not scale_std: - y = y * y_std + y_mean - else: - y *= y_std - return y - - -def standardize_x( - x: torch.Tensor, x_min: torch.Tensor = None, x_max: torch.Tensor = None -): - """Standardize the vectorial input into a d-dimensional hypercube [0, 1]^d, where d is the number of features. - if x_min ond x_max are supplied, x2 will be standardised using these instead. This is used when standardising the - validation/test inputs. - """ - if (x_min is not None and x_max is None) or (x_min is None and x_max is not None): - raise ValueError( - "Either *both* or *neither* of x_min, x_max need to be supplied!" - ) - if x_min is None: - x_min = torch.min(x, 0)[0] - x_max = torch.max(x, 0)[0] - x = (x - x_min) / (x_max - x_min) - return x, x_min, x_max - - -def compute_log_marginal_likelihood( - K_i: torch.Tensor, - logDetK: torch.Tensor, - y: torch.Tensor, - normalize: bool = True, - log_prior_dist=None, -): - """Compute the zero mean Gaussian process log marginal likelihood given the inverse of Gram matrix K(x2,x2), its - log determinant, and the training label vector y. - Option: - - normalize: normalize the log marginal likelihood by the length of the label vector, as per the gpytorch - routine. - - prior: A pytorch distribution object. 
If specified, the hyperparameter prior will be taken into consideration and - we use Type-II MAP instead of Type-II MLE (compute log_posterior instead of log_evidence) - """ - lml = ( - -0.5 * y.t() @ K_i @ y - + 0.5 * logDetK - - y.shape[0] - / 2.0 - * torch.log( - 2 - * torch.tensor( - np.pi, - ) - ) - ) - if log_prior_dist is not None: - lml -= log_prior_dist - return lml / y.shape[0] if normalize else lml - - -def generate_h_combo_candidates(hierarchy_consider): - h_range_all_hierarchy = [range(min(hier + 2, 4)) for hier in hierarchy_consider] - h_range_all_hierarchy = [range(5)] + h_range_all_hierarchy - h_combo_all = list(itertools.product(*h_range_all_hierarchy)) - h_combo_sub = [] - for h_combo in h_combo_all: - sorted_h_combo = sorted(h_combo) - if sorted_h_combo not in h_combo_sub: - h_combo_sub.append(sorted_h_combo) - return h_combo_sub - - -def compute_pd_inverse( - K: torch.tensor, jitter: float = 1e-5, gpytorch_kinv: bool = False -): - """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" - if gpytorch_kinv: - Kc = psd_safe_cholesky(K) - try: - Kc.required_grad = True - except Exception: - Kc = torch.Tensor(Kc) - else: - n = K.shape[0] - assert ( - isinstance(jitter, float) or jitter.ndim == 0 - ), "only homoscedastic noise variance is allowed here!" - is_successful = False - fail_count = 0 - max_fail = 3 - while fail_count < max_fail and not is_successful: - try: - jitter_diag = jitter * torch.eye(n, device=K.device) * 10**fail_count - K_ = K + jitter_diag - Kc = torch.linalg.cholesky(K_) - is_successful = True - except RuntimeError: - fail_count += 1 - if not is_successful: - raise RuntimeError( - f"Gram matrix not positive definite despite of jitter:\n{K}" - ) - - logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) - K_i = torch.cholesky_inverse(Kc) - return K_i.to(torch.get_default_dtype()), logDetK.to(torch.get_default_dtype()) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 2002aeab..c5c47332 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -3,8 +3,9 @@ import random from typing import Any, TYPE_CHECKING, Literal from typing_extensions import override +from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP -from neps.state.optimizer import BudgetInfo, OptimizationState +from neps.state.optimizer import BudgetInfo from neps.utils.types import ConfigResult, RawConfig from neps.utils.common import instance_from_map from neps.search_spaces import ( @@ -25,7 +26,6 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping if TYPE_CHECKING: @@ -51,11 +51,6 @@ def __init__( pipeline_space: SearchSpace, initial_design_size: int = 10, surrogate_model: str | Any = "gp", - surrogate_model_args: dict = None, - optimal_assignment: bool = False, - domain_se_kernel: str = None, - graph_kernels: list = None, - hp_kernels: list = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "mutation", @@ -77,12 +72,6 @@ def __init__( initial_design_size: Number of 'x' samples that need to be evaluated before selecting a sample using a strategy instead of randomly. 
surrogate_model: Surrogate model - surrogate_model_args: Arguments that will be given to the surrogate model - (the Gaussian processes model). - optimal_assignment: whether the optimal assignment kernel should be used. - domain_se_kernel: Stationary kernel name - graph_kernels: Kernels for NAS - hp_kernels: Kernels for HPO acquisition: Acquisition strategy log_prior_weighted: if to use log for prior acquisition_sampler: Acquisition function fetching strategy @@ -141,36 +130,21 @@ def __init__( self._model_update_failed: bool = False self.sample_default_first = sample_default_first - surrogate_model_args = surrogate_model_args or {} - graph_kernels, hp_kernels = get_default_kernels( - self.pipeline_space, - domain_se_kernel, - graph_kernels, - hp_kernels, - optimal_assignment, - ) - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = graph_kernels - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - - if ( - not surrogate_model_args["graph_kernels"] - and not surrogate_model_args["hp_kernels"] - ): - raise ValueError("No kernels are provided!") - - if "vectorial_features" not in surrogate_model_args: - surrogate_model_args["vectorial_features"] = ( - self.pipeline_space.get_vectorial_dim() - ) - - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) + if isinstance(surrogate_model, str): + if surrogate_model == "gp": + self.surrogate_model = ComprehensiveGP.get_default( + space=pipeline_space, + include_fidelities=False, + ) + else: + self.surrogate_model = instance_from_map( + SurrogateModelMapping, + surrogate_model, + name="surrogate model", + kwargs=surrogate_model_args, + ) + else: + self.surrogate_model = surrogate_model self.acquisition = instance_from_map( AcquisitionMapping, diff --git a/neps/optimizers/multi_fidelity/dyhpo.py b/neps/optimizers/multi_fidelity/dyhpo.py index bb4879b9..1a063e39 100755 --- a/neps/optimizers/multi_fidelity/dyhpo.py +++ b/neps/optimizers/multi_fidelity/dyhpo.py @@ -20,7 +20,6 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.multi_fidelity.mf_bo import FreezeThawModel, PFNSurrogate from neps.optimizers.multi_fidelity.utils import MFObservedData diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index fceb44e9..8626f7ab 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -22,7 +22,6 @@ from ..bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from ..bayesian_optimization.kernels.get_kernels import get_default_kernels from ..bayesian_optimization.models import SurrogateModelMapping from ..multi_fidelity_prior.utils import ( compute_config_dist, @@ -269,9 +268,6 @@ def __init__( self, pipeline_space: SearchSpace, surrogate_model: str | Any = "gp", - domain_se_kernel: str = None, - graph_kernels: list = None, - hp_kernels: list = None, surrogate_model_args: dict = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, @@ -282,25 +278,6 @@ def __init__( super().__init__(pipeline_space=pipeline_space, logger=logger) surrogate_model_args = surrogate_model_args or {} - - graph_kernels, hp_kernels = 
get_default_kernels( - pipeline_space=pipeline_space, - domain_se_kernel=domain_se_kernel, - graph_kernels=graph_kernels, - hp_kernels=hp_kernels, - optimal_assignment=False, - ) - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = None - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - if not surrogate_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - if "vectorial_features" not in surrogate_model_args: - surrogate_model_args["vectorial_features"] = ( - pipeline_space.get_vectorial_dim() - ) - self.surrogate_model = instance_from_map( SurrogateModelMapping, surrogate_model, diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 1ab1f92a..592cfa88 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,132 +1,360 @@ from __future__ import annotations -from collections.abc import Sized - -from dataclasses import dataclass +from dataclasses import dataclass, field from grakel.utils import graph_from_networkx -from typing import Sequence, Iterable, TypeAlias -from typing_extensions import Self -from more_itertools import split_when +from typing import Any, TypeAlias, TypeVar, Generic +from typing_extensions import Self, override, Self from itertools import chain import torch +from neps.search_spaces import ( + CategoricalParameter, + IntegerParameter, + FloatParameter, +) -from neps.search_spaces.search_space import SearchSpace +from neps.search_spaces.search_space import SearchSpace, Parameter WLInput: TypeAlias = tuple[dict, dict | None, dict | None] @dataclass -class TensorEncodedConfigs(Sized): - _tensor_pack: torch.Tensor | None - """Layout such that _tensor_pack[0] is the first config. - - In the case that there are no numeric/categorical hyperparameters, - this is None. - - index config_row_id | fidelities... | numericals... | one_hot_categoricals... - 0 - 1 - 2 - ... - - NOTE: A slight memory innefficiency here is that we store the one-hot encoded - as a float tensor, rather than a byte tensor. This makes joint numerical/categorical - kernels more efficient, as well as entire config row access at the cost of memory. - This should not be a problem if we do not have a large number of categorical - hyperparameters with a high number of choices. - """ - _graphs: dict[str, Sequence[WLInput]] - _col_lookup: dict[str, tuple[int, int]] # range(inclusive, exclusive) - - def __len__(self) -> int: - return self._tensor_pack.shape[0] if self._tensor_pack is not None else 0 - - def wl_graph_input(self, hp: str) -> Sequence[WLInput]: - return self._graphs[hp] - - def tensor(self, hps: Iterable[str]) -> torch.Tensor: - if self._tensor_pack is None: - raise ValueError("No numerical/categorical hyperparameters were encoded.") - - cols: list[tuple[int, int]] = [] - for hp in hps: - _cols = self._col_lookup.get(hp) - if _cols is None: - raise ValueError(f"Hyperparameter {hp} not found in the lookup table.") - cols.append(_cols) - - # OPTIM: This code with `split_when` and `chunks` makes sure to grab - # consecutive chunks of memory where possible. For example, - # if we want all categoricals, this will just return the entire - # categorical tensor, rather than subselecting each part and then concatenating. - # Also works for numericals. 
- sorted_indices = sorted(cols) - non_consecutive_tuple = lambda x, y: x[1] != y[0] - chunks = list(split_when(sorted_indices, non_consecutive_tuple)) - slices = [slice(chunk[0][0], chunk[-1][1]) for chunk in chunks] - tensors = [self._tensor_pack[:, s] for s in slices] - - if len(tensors) == 1: - return tensors[0].clone() - - return torch.cat(tensors, dim=1) +class GraphEncoder: + hps: tuple[str] - @classmethod def encode( - cls, + self, + x: list[dict[str, Any]], space: SearchSpace, - configs: list[SearchSpace], - *, - node_label: str = "op_name", - device: torch.device, - ) -> Self: - assert node_label == "op_name", "Only 'op_name' is supported for node_label" + ) -> dict[str, list[WLInput]]: + return {hp: [config[hp].value for config in x] for hp in self.hps} + + +T = TypeVar("T") + + +@dataclass +class Transformer(Generic[T]): + hps: tuple[str] + + def encode(self, x: list[dict[str, Any]], space: SearchSpace) -> T: ... + + def value_decode(self, x: T, space: SearchSpace) -> dict[str, list[Any]]: ... + + def decode(self, x: T, space: SearchSpace) -> list[dict[str, Any]]: + values = self.value_decode(x, space) + return [(dict(zip(values, t))) for t in zip(*values.values())] + - _graphs: dict[str, Sequence[WLInput]] = {} +@dataclass +class WLInputTransformer(Transformer[WLInput]): + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + ) -> dict[str, list[WLInput]]: + _graphs: dict[str, list[WLInput]] = {} for hp_name in space.graphs.keys(): - gs = [conf.graphs[hp_name].value for conf in configs] - if ( - len(gs) > 0 - and isinstance(gs[0], list) - and len(gs[0]) > 0 - and isinstance(gs[0][0], list) - ): - gs = [_list for list_of_list in gs for _list in list_of_list] + gs = [conf[hp_name].value for conf in x] _graphs[hp_name] = graph_from_networkx(gs) # type: ignore - _lookup: dict[str, tuple[int, int]] = {} + return _graphs - n_fids = len(space.fidelities) - n_nums = len(space.numerical) - n_cats = sum(len(hp.choices) for hp in space.categoricals.values()) + def value_decode( + self, + x: dict[str, list[WLInput]], + space: SearchSpace, + ) -> dict[str, list[Any]]: + raise NotImplementedError("Cannot decode WLInput to values.") - width = n_fids + n_nums + n_cats - if width == 0: - return cls(_graphs=_graphs, _tensor_pack=None, _col_lookup={}) - _tensor_pack = torch.empty(size=(len(configs), width), dtype=torch.float64) +@dataclass +class TensorTransformer(Transformer[torch.Tensor]): + def output_cols(self, space: SearchSpace) -> int: ... 
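These Transformer classes define the encode/decode contract that the concrete transformers further down in this file (OneHotEncoder, MinMaxNormalizer, JointTransformer) implement. A rough usage sketch of the intended API; the SearchSpace construction and the sample() call are assumptions about the surrounding library, and this encoding module is still being reworked later in the patch series:

import torch

from neps.search_spaces import CategoricalParameter, FloatParameter
from neps.search_spaces.encoding import JointTransformer, MinMaxNormalizer, OneHotEncoder
from neps.search_spaces.search_space import SearchSpace

# Hypothetical two-hyperparameter space.
space = SearchSpace(
    lr=FloatParameter(lower=1e-4, upper=1e-1, log=True),
    opt=CategoricalParameter(choices=["adam", "sgd"]),
)

# One-hot encode the categorical, min-max normalise the float, side by side.
transformer = JointTransformer.join(
    OneHotEncoder(("opt",)),
    MinMaxNormalizer(("lr",)),
)

configs = [space.sample() for _ in range(8)]                 # sampled config-like objects
X = transformer.encode(configs, space, dtype=torch.float64)  # tensor of shape (8, 3)
values = transformer.decode(X, space)                        # list of {"opt": ..., "lr": ...}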
- offset = 0 - for hp_name in chain(space.fidelities, space.numerical): - _lookup[hp_name] = (offset, offset + 1) - _xs = [config.fidelities[hp_name].normalized_value for config in configs] - values = torch.tensor(_xs, torch.float64, device=device) + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ) -> torch.Tensor: + width = len(self.hps) + buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, CategoricalParameter) + values = torch.tensor( + [config[name]._value_index for config in x], dtype=dtype, device=device + ) + + return buffer + + +@dataclass +class IntegerCategoricalTransformer(TensorTransformer): + def output_cols(self, space: SearchSpace) -> int: + return len(self.hps) + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if dtype is None: + dtype = torch.int + + buffer = torch.empty(size=(len(x), len(self.hps)), dtype=dtype, device=device) + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, CategoricalParameter) + values = torch.tensor( + [config[name].value for config in x], dtype=dtype, device=device + ) + buffer[:, i] = values + + return buffer + + @override + def value_decode(self, x: torch.Tensor, space: SearchSpace) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, CategoricalParameter) + enc = x[:, i] + values[name] = [hp.choices[i] for i in enc.tolist()] + + return values + + +@dataclass +class MinMaxNormalizer(TensorTransformer): + def output_cols(self, space: SearchSpace) -> int: + return len(self.hps) + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if dtype is None: + dtype = torch.float64 + + width = len(self.hps) + buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, (FloatParameter, IntegerParameter)) + values = torch.tensor( + [config[name].value for config in x], dtype=dtype, device=device + ) + if hp.log_bounds: + lower, upper = hp.log_bounds + buffer[:, i] = (torch.log(values) - lower) / (upper - lower) + else: + lower, upper = hp.lower, hp.upper + buffer[:, i] = (values - lower) / (upper - lower) + + return buffer + + @override + def value_decode( + self, + x: torch.Tensor, + space: SearchSpace, + ) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, (FloatParameter, IntegerParameter)) + enc = x[:, i] + if hp.log_bounds: + lower, upper = hp.log_bounds + enc = torch.exp(enc * (upper - lower) + lower) + else: + lower, upper = hp.lower, hp.upper + enc = enc * (upper - lower) + lower - _tensor_pack[:, offset] = values + if isinstance(hp, IntegerParameter): + enc = torch.round(enc).to(torch.int) - offset += 1 + values[name] = enc.tolist() - for hp_name, cat in space.categoricals.items(): - n_choices = len(cat.choices) - _lookup[hp_name] = (offset, offset + n_choices) + return values - # .. 
and insert one-hot encoding (ChatGPT solution, verified locally) - _xs = [config[hp_name].normalized_value for config in configs] - cat_tensor = torch.tensor(_xs, torch.float64, device=device).unsqueeze(1) - _tensor_pack[:, offset : offset + n_choices].scatter_(1, cat_tensor, 1) +@dataclass +class StandardNormalizer(TensorTransformer): + std_means: dict[str, tuple[float, float]] = field(default_factory=dict) + + def output_cols(self, space: SearchSpace) -> int: + return len(self.hps) + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if dtype is None: + dtype = torch.float64 + + width = len(self.hps) + buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) + std_means: dict[str, tuple[float, float]] = {} + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, (FloatParameter, IntegerParameter)) + values = torch.tensor( + [config[name].value for config in x], dtype=dtype, device=device + ) + if hp.log_bounds: + values = torch.log(values) + + mean, std = values.mean(), values.std() + std_means[name] = (mean.item(), std.item()) + + buffer[:, i] = (values - mean) / std + + self.std_means = std_means + return buffer + + @override + def value_decode(self, x: torch.Tensor, space: SearchSpace) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, Parameter) + enc = x[:, i] + if isinstance(hp, (FloatParameter, IntegerParameter)): + std, mean = self.std_means[name] + if hp.log_bounds: + enc = torch.exp(enc * std + mean) + else: + enc = enc * std + mean + + if isinstance(hp, IntegerParameter): + enc = torch.round(enc).to(torch.int) + + values[name] = enc.tolist() + else: + raise ValueError(f"Invalid hyperparameter type: {type(hp)}") + + return values + + +@dataclass +class OneHotEncoder(TensorTransformer): + def output_cols(self, space: SearchSpace) -> int: + return sum(len(hp.choices) for hp in (space[name] for name in self.hps)) # type: ignore + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if dtype is None: + dtype = torch.bool + + categoricals: dict[str, CategoricalParameter] = {} + for name in self.hps: + hp = space[name] + assert isinstance(hp, CategoricalParameter) + categoricals[name] = hp + width = sum(len(hp.choices) for hp in categoricals.values()) + buffer = torch.zeros(size=(len(x), width), dtype=dtype, device=device) + + offset = 0 + for name, hp in categoricals.items(): + n_choices = len(hp.choices) + _xs = [config[name]._value_index for config in x] + cat_tensor = torch.tensor(_xs, dtype=torch.int64, device=device).unsqueeze(1) + buffer[:, offset : offset + n_choices].scatter_(1, cat_tensor, 1) + offset += n_choices + + return buffer + + @override + def value_decode( + self, + x: torch.Tensor, + space: SearchSpace, + ) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + + offset = 0 + for name in self.hps: + hp = space[name] + assert isinstance(hp, CategoricalParameter) + n_choices = len(hp.choices) + enc = x[:, offset : offset + n_choices].argmax(dim=1) + + values[name] = [hp.choices[i] for i in enc] offset += n_choices - return cls(_graphs=_graphs, _tensor_pack=_tensor_pack, _col_lookup=_lookup) + return values + + +@dataclass +class JointTransformer(TensorTransformer): + 
transforms: tuple[TensorTransformer, ...] + + def output_cols(self, space: SearchSpace) -> int: + return sum(t.output_cols(space) for t in self.transforms) + + @classmethod + def join(cls, *transforms: TensorTransformer) -> Self: + hps = tuple(chain.from_iterable(t.hps for t in transforms)) + return cls(hps, transforms) + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + return torch.cat( + [t.encode(x, space, dtype=dtype, device=device) for t in self.transforms], + dim=1, + ) + + @override + def value_decode( + self, + x: torch.Tensor, + space: SearchSpace, + ) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + offset = 0 + for t in self.transforms: + width = t.output_cols(space) + t_values = t.value_decode(x[:, offset : offset + width], space) + values.update(t_values) + offset += width + + return values diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index 164b49cb..2a20399d 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -26,5 +26,5 @@ def run_pipeline(float1, float2, categorical, integer1, integer2): pipeline_space=pipeline_space, root_directory="results/hyperparameters_example", post_run_summary=True, - max_evaluations_total=15, + max_evaluations_total=50, ) diff --git a/pyproject.toml b/pyproject.toml index 06b4baa4..b5be06c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,7 +103,6 @@ src = ["neps"] # TODO(eddiebergman): Include more of these as we go on in migration exclude = [ - "neps/optimizers/**/*.py", "neps/search_spaces/architecture/**/*.py", "neps/search_spaces/yaml_search_space_utils.py", "neps/utils/run_args_from_yaml.py", From 27b31196ace834750e26c05e87fdd2da4efbfbe8 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 20 Aug 2024 14:24:46 +0200 Subject: [PATCH 10/63] checkpoint: not working yet --- .../acquisition_functions/aq_functions.py | 88 +++ .../acquisition_functions/ei.py | 20 +- .../acquisition_sampler_2/__init__.py | 0 .../acquisition_sampler_2/aq_samplers.py | 22 + .../acquisition_sampler_2/mutation_sampler.py | 163 +++++ .../acquisition_sampler_2/random_sampler.py | 15 + .../acquisition_samplers/mutation_sampler.py | 38 +- .../grakel_replace/vertex_histogram.py | 64 +- .../grakel_replace/weisfeiler_lehman.py | 294 ++++----- .../bayesian_optimization/kernels/kernel.py | 18 +- .../kernels/vectorial_kernels.py | 39 +- .../kernels/weisfilerlehman.py | 40 +- .../bayesian_optimization/models/gp.py | 143 ++--- .../bayesian_optimization/optimizer.py | 222 +++---- neps/search_spaces/distributions/__init__.py | 16 + .../distributions/distribution.py | 21 + neps/search_spaces/distributions/truncnorm.py | 112 ++++ .../distributions/uniform_float.py | 47 ++ .../distributions/uniform_int.py | 46 ++ .../distributions/weighted_ints.py | 91 +++ neps/search_spaces/domain.py | 316 +++++++++ neps/search_spaces/encoding.py | 604 ++++++++++-------- neps/search_spaces/neighborhoods.py | 281 ++++++++ neps/search_spaces/samplers/__init__.py | 9 + neps/search_spaces/samplers/model.py | 186 ++++++ neps/search_spaces/samplers/prior.py | 110 ++++ neps/search_spaces/samplers/sampler.py | 22 + neps/search_spaces/samplers/uniform.py | 79 +++ .../samplers/weighted_sampler.py | 51 ++ neps/state/__init__.py | 4 + neps/state/optimizer.py | 1 - neps/state/trial.py | 6 +- neps/utils/types.py | 2 +- 33 files changed, 2434 
insertions(+), 736 deletions(-) create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py create mode 100644 neps/search_spaces/distributions/__init__.py create mode 100644 neps/search_spaces/distributions/distribution.py create mode 100644 neps/search_spaces/distributions/truncnorm.py create mode 100644 neps/search_spaces/distributions/uniform_float.py create mode 100644 neps/search_spaces/distributions/uniform_int.py create mode 100644 neps/search_spaces/distributions/weighted_ints.py create mode 100644 neps/search_spaces/domain.py create mode 100644 neps/search_spaces/neighborhoods.py create mode 100644 neps/search_spaces/samplers/__init__.py create mode 100644 neps/search_spaces/samplers/model.py create mode 100644 neps/search_spaces/samplers/prior.py create mode 100644 neps/search_spaces/samplers/sampler.py create mode 100644 neps/search_spaces/samplers/uniform.py create mode 100644 neps/search_spaces/samplers/weighted_sampler.py diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py b/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py new file mode 100644 index 00000000..70b6b4e6 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import math + +import torch + + +def ei( + mu: torch.Tensor, + cov: torch.Tensor, + optimum: float | torch.Tensor, + *, + augmented_ei_regularizer: float | None = None, # 0.01 + xi: float = 0.0, + log_ei: bool = False, + log_ei_epsilon: float = 1e-6, +) -> torch.Tensor: + improvement = optimum - mu - xi + + sigma_sq = torch.diag(cov) + sigma = torch.sqrt(sigma_sq) + + Z = improvement / sigma + + # If we calculate it ourselves, we save some computation as mu = 0 + # and sigma = 1 cancel a few terms out + # https://en.wikipedia.org/wiki/Normal_distribution + Z_cdf = 0.5 * (1 + torch.erf(Z / math.sqrt(2))) + Z_pdf = 1 / (math.sqrt(2 * math.pi)) * torch.exp(-0.5 * Z**2) + ei = improvement * Z_cdf + sigma * Z_pdf + + if augmented_ei_regularizer is not None: + regularization_term = 1 + sigma_sq / augmented_ei_regularizer + ei = ei / regularization_term + + if log_ei: + ei = torch.log(ei + log_ei_epsilon) + + return ei + + +def acq_by_confidence( + mu: torch.Tensor, + cov: torch.Tensor, + *, + confidence_scale: float = 1.0, +) -> torch.Tensor: + # Assumes we are trying to minimize our objective but + # this acquisition function will be maximized, i.e. optimize + # this function to find the point which is most likely to be + # the minimum of the objective. + + # **** + # * / \** + # ***** / \- **** + # * / \ *** + # * / \ | * *** + # ---/ \ | +** + # -/ \ | / \ + # \|/ --- + # - <- lcb = mu - c * sigma + # ______________________________ + lcb = mu - confidence_scale * torch.sqrt(torch.diag(cov)) + + return -lcb # Negate to make maximization + + +def weight_by_cost( + acquisition_scores: torch.Tensor, +) -> torch.Tensor: + # Assumes we are trying to minimize our objective but + # this acquisition function will be maximized, i.e. 
optimize + # this function to find the point which is most likely to be + # the minimum of the objective. + + # **** + # * / \** + # ***** / \- **** + # * / \ *** + # * / \ | * *** + # ---/ \ | +** + # -/ \ | / \ + # \|/ --- + # - <- lcb = mu - c * sigma + # ______________________________ + lcb = mu - cost_scale * torch.sqrt(torch.diag(cov)) + + return -lcb # Negate to make maximization diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py index ba5eb38b..cc13cc8e 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py @@ -1,15 +1,18 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence, Union -import numpy as np +from typing import TYPE_CHECKING, Sequence + import torch from torch.distributions import Normal from .base_acquisition import BaseAcquisition if TYPE_CHECKING: + import numpy as np + from neps.search_spaces import SearchSpace + class ComprehensiveExpectedImprovement(BaseAcquisition): def __init__( self, @@ -51,11 +54,11 @@ def __init__( self.optimize_on_max_fidelity = optimize_on_max_fidelity def eval( - self, x: Sequence[SearchSpace], asscalar: bool = False, - ) -> Union[np.ndarray, torch.Tensor, float]: - """ - Return the negative expected improvement at the query point x2 - """ + self, + x: Sequence[SearchSpace], + asscalar: bool = False, + ) -> np.ndarray | torch.Tensor | float: + """Return the negative expected improvement at the query point x2.""" assert self.incumbent is not None, "EI function not fitted on model" if x[0].has_fidelity and self.optimize_on_max_fidelity: @@ -70,6 +73,7 @@ def eval( except ValueError as e: raise e # return -1.0 # in case of error. return ei of -1 + std = torch.sqrt(torch.diag(cov)) mu_star = self.incumbent gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device)) @@ -103,11 +107,9 @@ def set_state(self, surrogate_model, **kwargs): # Compute incumbent if self.in_fill == "best": - # return torch.max(surrogate_model.y_) self.incumbent = torch.min(self.surrogate_model.y_) else: x = self.surrogate_model.x mu_train, _ = self.surrogate_model.predict(x) - # incumbent_idx = torch.argmax(mu_train) incumbent_idx = torch.argmin(mu_train) self.incumbent = self.surrogate_model.y_[incumbent_idx] diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py new file mode 100644 index 00000000..f799252b --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import torch + + from neps.search_spaces import SearchSpace + + +def random_sample(search_space: SearchSpace, *, seed: torch.Generator) -> SearchSpace: + """Sample a random value from a search space. + + Args: + search_space: The search space to sample from. + user_priors: Whether to sample from user priors. + seed: The seed to use for sampling. + + Returns: + A search space with a sampled value. 
+ """ + return search_space.sample_value(user_priors=user_priors) diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py new file mode 100644 index 00000000..972ad6c3 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable, Sequence + +import numpy as np +import torch +from more_itertools import first +from typing_extensions import override + +from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, +) +from neps.optimizers.bayesian_optimization.acquisition_samplers.random_sampler import ( + RandomSampler, +) + +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + + +def _propose_location( + acquisition_function: Callable, + candidates: list[SearchSpace], + top_n: int = 5, + return_distinct: bool = True, +) -> tuple[list[SearchSpace], np.ndarray | torch.Tensor, np.ndarray]: + """top_n: return the top n candidates wrt the acquisition function.""" + if return_distinct: + eis = acquisition_function(candidates, asscalar=True) # faster + eis_, unique_idx = np.unique(eis, return_index=True) + try: + i = np.argpartition(eis_, -top_n)[-top_n:] + indices = np.array([unique_idx[j] for j in i]) + except ValueError: + eis = torch.tensor([acquisition_function(c) for c in candidates]) + _, indices = eis.topk(top_n) + else: + eis = torch.tensor([acquisition_function(c) for c in candidates]) + _, indices = eis.topk(top_n) + + xs = [candidates[int(i)] for i in indices] + return xs, eis, indices + + +class MutationSampler(AcquisitionSampler): + def __init__( + self, + pipeline_space, + pool_size: int = 250, + n_best: int = 10, + mutate_size: float | int = 0.5, + allow_isomorphism: bool = False, + check_isomorphism_history: bool = True, + patience: int = 50, + ): + super().__init__(pipeline_space=pipeline_space, patience=patience) + self.pool_size = pool_size + self.n_best = n_best + self.mutate_size = mutate_size + if isinstance(mutate_size, int): + assert ( + pool_size >= mutate_size + ), " pool_size must be larger or equal to mutate_size" + + self.allow_isomorphism = allow_isomorphism + self.check_isomorphism_history = ( + check_isomorphism_history # check for isomorphisms also in previous graphs + ) + self.random_sampling = RandomSampler( + pipeline_space=pipeline_space, patience=patience + ) + + @override + def set_state( + self, x: list[SearchSpace], y: Sequence[float] | np.ndarray | torch.Tensor + ) -> None: + super().set_state(x, y) + self.random_sampling.set_state(x, y) + + @override + def sample(self, acquisition_function: Callable) -> SearchSpace: + return first(self.sample_batch(acquisition_function, batch=1)) + + @override + def sample_batch( + self, + acquisition_function: Callable, + batch: int, + ) -> list[SearchSpace]: + pool = self.create_pool( + x=self.x, + y=self.y, + acquisition_function=acquisition_function, + pool_size=self.pool_size, + ) + + samples, _, _ = _propose_location( + acquisition_function=acquisition_function, + top_n=batch, + candidates=pool, + ) + return samples + + def create_pool( + self, + x: list[SearchSpace], + y: Sequence[float] | np.ndarray | torch.Tensor, + acquisition_function: Callable, + pool_size: int, + ) -> list[SearchSpace]: + if len(x) == 0: + return self.random_sampling.sample_batch(acquisition_function, pool_size) + + 
if isinstance(self.mutate_size, int): + mutate_size = self.mutate_size + else: + mutate_size = int(self.mutate_size * pool_size) + + n_best = len(x) if len(x) < self.n_best else self.n_best + best_configs = [x for (_, x) in sorted(zip(y, x), key=lambda pair: pair[0])][ + :n_best + ] + + seen: set[int] = set() + + def _hash(_config: SearchSpace) -> int: + return hash(_config.hp_values().values()) + + evaluation_pool = [] + per_arch = mutate_size // n_best + + for config in best_configs: + remaining_patience = self.patience + for _ in range(per_arch): + while remaining_patience: + try: + # needs to throw an Exception if config is not valid, e.g., empty graph etc.! + child = config.mutate() + except Exception: + remaining_patience -= 1 + continue + hash_child = _hash(child) + + if not self.allow_isomorphism: + # if disallow isomorphism, we enforce that each time, we mutate n distinct graphs. + # For now we do not check the isomorphism in all of the previous graphs though + if child == config or hash_child in seen: + remaining_patience -= 1 + continue + + evaluation_pool.append(child) + seen.add(hash_child) + break + + # Fill missing pool with random samples + nrandom_archs = max(pool_size - len(evaluation_pool), 0) + if nrandom_archs: + random_evaluation_pool = self.random_sampling.sample_batch( + acquisition_function, nrandom_archs + ) + evaluation_pool += random_evaluation_pool + + return evaluation_pool diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py new file mode 100644 index 00000000..f7a4da76 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torch +from neps.search_spaces import SearchSpace +from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, +) + + +class RandomSampler(AcquisitionSampler): + + def sample(self, n: int, space: SearchSpace) -> torch.Tensor: + return self.pipeline_space.sample( + patience=self.patience, user_priors=False, ignore_fidelity=False + ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py index 4c6b17df..cafc05dd 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py @@ -48,7 +48,7 @@ def __init__( pipeline_space, pool_size: int = 250, n_best: int = 10, - mutate_size: int | None = None, + mutate_size: float | int = 0.5, allow_isomorphism: bool = False, check_isomorphism_history: bool = True, patience: int = 50, @@ -57,6 +57,11 @@ def __init__( self.pool_size = pool_size self.n_best = n_best self.mutate_size = mutate_size + if isinstance(mutate_size, int): + assert ( + pool_size >= mutate_size + ), " pool_size must be larger or equal to mutate_size" + self.allow_isomorphism = allow_isomorphism self.check_isomorphism_history = ( check_isomorphism_history # check for isomorphisms also in previous graphs @@ -83,7 +88,12 @@ def sample_batch( acquisition_function: Callable, batch: int, ) -> list[SearchSpace]: - pool = self.create_pool(acquisition_function, self.pool_size) + pool = self.create_pool( + x=self.x, + y=self.y, + acquisition_function=acquisition_function, + pool_size=self.pool_size, + ) samples, _, _ = _propose_location( 
acquisition_function=acquisition_function, @@ -94,23 +104,23 @@ def sample_batch( def create_pool( self, + x: list[SearchSpace], + y: Sequence[float] | np.ndarray | torch.Tensor, acquisition_function: Callable, pool_size: int, ) -> list[SearchSpace]: - if len(self.x) == 0: + if len(x) == 0: return self.random_sampling.sample_batch(acquisition_function, pool_size) - mutate_size = ( - int(0.5 * pool_size) if self.mutate_size is None else self.mutate_size - ) - assert ( - pool_size >= mutate_size - ), " pool_size must be larger or equal to mutate_size" - - n_best = len(self.x) if len(self.x) < self.n_best else self.n_best - best_configs = [ - x for (_, x) in sorted(zip(self.y, self.x), key=lambda pair: pair[0]) - ][:n_best] + if isinstance(self.mutate_size, int): + mutate_size = self.mutate_size + else: + mutate_size = int(self.mutate_size * pool_size) + + n_best = len(x) if len(x) < self.n_best else self.n_best + best_configs = [x for (_, x) in sorted(zip(y, x), key=lambda pair: pair[0])][ + :n_best + ] seen: set[int] = set() diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py index e59b5433..4a4dfc79 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py @@ -1,8 +1,11 @@ """The vertex kernel as defined in :cite:`sugiyama2015halting`.""" +from __future__ import annotations + import logging from collections import Counter from collections.abc import Iterable +from typing import TYPE_CHECKING from warnings import warn import numpy as np @@ -14,7 +17,10 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.validation import check_is_fitted -from ..vectorial_kernels import Stationary +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( + NumericKernel, + ) class VertexHistogram(Kernel): @@ -42,7 +48,7 @@ class VertexHistogram(Kernel): If supplied, the Malahanobis distance with the precision matrix as supplied will be computed in the dot product, instead of the vanilla dot product. - Attributes + Attributes: ---------- None. @@ -55,7 +61,7 @@ def __init__( sparse="auto", oa=False, mahalanobis_precision=None, - se_kernel: Stationary | None = None, + se_kernel: NumericKernel | None = None, requires_ordered_features: bool = False, as_tensor: bool = True, ): @@ -120,7 +126,7 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None): - Returns + Returns: ------- out : np.array, shape=(len(X), n_labels) A np.array for frequency (cols) histograms for all Graphs (rows). @@ -139,9 +145,9 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None): if not isinstance(X, Iterable): raise TypeError("input must be an iterable\n") else: - rows, cols, data = list(), list(), list() + rows, cols, data = [], [], [] if self._method_calling in [0, 1, 2]: - labels = dict() + labels = {} self._labels = labels elif self._method_calling == 3: labels = dict(self._labels) @@ -200,7 +206,7 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None): ni += 1 if self.require_ordered_features: - label_length = max(label_end_idx - label_start_idx, max(cols)) + 1 + label_length = max(label_end_idx - label_start_idx, *cols) + 1 else: label_length = len(labels) @@ -244,7 +250,7 @@ def _calculate_kernel_matrix(self, Y=None): Y : np.array, default=None The array between samples and features. 
- Returns + Returns: ------- K : numpy array, shape = [n_targets, n_inputs] The kernel matrix: a calculation between all pairs of graphs @@ -260,24 +266,19 @@ def _calculate_kernel_matrix(self, Y=None): for j in range(i, self.X.shape[0]): K[i, j] = np.sum(np.minimum(self.X[i, :], self.X[j, :])) K[j, i] = K[i, j] + elif self.se_kernel is not None: + K = self.se_kernel._forward(self.X, self.X) else: - if self.se_kernel is not None: - K = self.se_kernel._forward(self.X, self.X) - else: - K = self.X @ self.X.T + K = self.X @ self.X.T + elif self.oa: + K = np.zeros((Y.shape[0], self.X.shape[0])) + for i in range(Y.shape[0]): + for j in range(self.X.shape[0]): + K[i, j] = np.sum(np.minimum(self.X[j, :], Y[i, : self.X.shape[1]])) + elif self.se_kernel is not None: + K = self.se_kernel._forward(self.X, Y) else: - if self.oa: - K = np.zeros((Y.shape[0], self.X.shape[0])) - for i in range(Y.shape[0]): - for j in range(self.X.shape[0]): - K[i, j] = np.sum( - np.minimum(self.X[j, :], Y[i, : self.X.shape[1]]) - ) - else: - if self.se_kernel is not None: - K = self.se_kernel._forward(self.X, Y) - else: - K = Y[:, : self.X.shape[1]] @ self.X.T + K = Y[:, : self.X.shape[1]] @ self.X.T if self.sparse_: return K.toarray() @@ -291,7 +292,7 @@ def diagonal(self, use_tensor=False): ---------- None. - Returns + Returns: ------- X_diag : np.array The diagonal of the kernel matrix, of the fitted. This consists @@ -309,11 +310,10 @@ def diagonal(self, use_tensor=False): # Calculate diagonal of X if use_tensor: self._X_diag = torch.einsum("ij,ij->i", [self.X_tensor, self.X_tensor]) + elif self.sparse_: + self._X_diag = squeeze(array(self.X.multiply(self.X).sum(axis=1))) else: - if self.sparse_: - self._X_diag = squeeze(array(self.X.multiply(self.X).sum(axis=1))) - else: - self._X_diag = einsum("ij,ij->i", self.X, self.X) + self._X_diag = einsum("ij,ij->i", self.X, self.X) try: check_is_fitted(self, ["_Y"]) if use_tensor: @@ -346,7 +346,7 @@ def transform(self, X, return_embedding_only=False, **kwargs): computing the kernel function). This is used when computing the derivative of the kernel w.r.t. the test points/ - Returns + Returns: ------- K : numpy array, shape = [n_targets, n_input_graphs] corresponding to the kernel matrix, a calculation between @@ -395,7 +395,7 @@ def fit_transform(self, X, **kwargs): There is no need of a target in a transformer, yet the pipeline API requires this parameter. - Returns + Returns: ------- K : numpy array, shape = [n_targets, n_input_graphs] corresponding to the kernel matrix, a calculation between @@ -431,7 +431,7 @@ def fit(self, X, y=None, **kwargs): There is no need of a target in a transformer, yet the pipeline API requires this parameter. - Returns + Returns: ------- self : object Returns self. 
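For intuition, the kernel values produced by `_calculate_kernel_matrix` above come down to a per-pair operation on the two graphs' label-count histograms: a plain dot product by default, an element-wise minimum (histogram intersection) when `oa=True`, or `se_kernel._forward` applied to the histograms when a stationary kernel is supplied. A toy sketch of the first two cases, using only NumPy and illustrative names rather than the grakel-based classes in this patch:

from collections import Counter

import numpy as np


def vertex_histogram_kernel(labels_a, labels_b, *, oa: bool = False) -> float:
    # labels_a / labels_b: node labels of each graph; order does not matter.
    vocab = sorted(set(labels_a) | set(labels_b))
    count_a, count_b = Counter(labels_a), Counter(labels_b)
    hist_a = np.array([count_a[v] for v in vocab], dtype=float)
    hist_b = np.array([count_b[v] for v in vocab], dtype=float)
    if oa:
        # optimal-assignment variant: sum of element-wise minima
        return float(np.minimum(hist_a, hist_b).sum())
    return float(hist_a @ hist_b)


# e.g. two small cells described only by their op labels
k = vertex_histogram_kernel(["conv3x3", "conv3x3", "relu"], ["conv3x3", "relu", "avg_pool"])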
diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py index dd5dd829..f10e406f 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py @@ -70,7 +70,7 @@ def __init__( h: int = 5, base_graph_kernel=VertexHistogram, node_weights=None, - layer_weights: torch.Tensor | None = None, + layer_weights=None, as_tensor: bool = True, ): """Initialise a `weisfeiler_lehman` kernel.""" @@ -121,27 +121,16 @@ def initialize(self): try: base_graph_kernel, params = base_graph_kernel except Exception as _error: - raise TypeError( - "Base kernel was not formulated in " - "the correct way. " - "Check documentation." - ) from _error + NOT_YET_IMPLEMENTED_StmtRaise if not ( - type(base_graph_kernel) is type # pylint: disable=C0123 + type(base_graph_kernel) + is type # pylint: disable=C0123 and issubclass(base_graph_kernel, Kernel) ): - raise TypeError( - "The first argument must be a valid " - "grakel.kernel.kernel Object" - ) + NOT_YET_IMPLEMENTED_StmtRaise if not isinstance(params, dict): - raise ValueError( - "If the second argument of base " - "kernel exists, it must be a diction" - "ary between parameters names and " - "values" - ) + NOT_YET_IMPLEMENTED_StmtRaise params.pop("normalize", None) params["normalize"] = False @@ -152,16 +141,14 @@ def initialize(self): if not self._initialized["h"]: if not isinstance(self.h, int) or self.h < 0: - raise TypeError( - "'h' must be a non-negative integer. Got h:" + str(self.h) - ) + NOT_YET_IMPLEMENTED_StmtRaise self._h = self.h + 1 self._initialized["h"] = True - if self.layer_weights is None: - self.layer_weights = torch.ones((self._h,)) - else: - assert len(self.layer_weights) == self._h + if self.layer_weights is None or self.layer_weights.shape[0] != self._h: + self.layer_weights = np.ones((self._h,)) + if self.as_tensor and not isinstance(self.layer_weights, torch.Tensor): + self.layer_weights = torch.tensor(self.layer_weights) self._initialized["h"] = True self._initialized["layer_weights"] = True @@ -204,9 +191,7 @@ def parse_input( """ if self._method_calling not in [1, 2]: - raise ValueError( - "method call must be called either from fit " + "or fit-transform" - ) + NOT_YET_IMPLEMENTED_StmtRaise elif hasattr(self, "_X_diag"): # Clean _X_diag value delattr(self, "_X_diag") @@ -218,7 +203,7 @@ def parse_input( else: # Input validation and parsing if not isinstance(X, collections.abc.Iterable): - raise TypeError("input must be an iterable\n") + NOT_YET_IMPLEMENTED_StmtRaise else: nx = 0 Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict() @@ -228,26 +213,25 @@ def parse_input( x = list(x) if is_iter and (len(x) == 0 or len(x) >= 2): if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(idx)) + warnings.warn( + "Ignoring empty element on index: " + str(idx) + ) continue + elif len(x) > 2: + extra = tuple() + if len(x) > 3: + extra = tuple(x[3:]) + x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) + extra = ( + x.get_labels( + purpose=self._graph_format, + label_type="edge", + return_none=True, + ), + ) + extra else: - if len(x) > 2: - extra = tuple() - if len(x) > 3: - extra = tuple(x[3:]) - x = Graph( - x[0], x[1], x[2], graph_format=self._graph_format - ) - extra = ( - x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, - ), - ) + extra 
- else: - x = Graph(x[0], x[1], {}, graph_format=self._graph_format) - extra = tuple() + x = Graph(x[0], x[1], {}, graph_format=self._graph_format) + extra = tuple() elif isinstance(x, Graph): x.desired_format(self._graph_format) @@ -262,19 +246,14 @@ def parse_input( extra = (el,) else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) + NOT_YET_IMPLEMENTED_StmtRaise Gs_ed[nx] = x.get_edge_dictionary() L[nx] = x.get_labels(purpose="dictionary") extras[nx] = extra - distinct_values |= set(L[nx].values()) - nx += 1 + NOT_YET_IMPLEMENTED_StmtAugAssign + NOT_YET_IMPLEMENTED_StmtAugAssign if nx == 0: - raise ValueError("parsed input is empty") + NOT_YET_IMPLEMENTED_StmtRaise # Save the number of "fitted" graphs. self._nx = nx @@ -284,33 +263,28 @@ def parse_input( label_count = 0 for dv in sorted(list(distinct_values)): WL_labels_inverse[dv] = label_count - label_count += 1 + NOT_YET_IMPLEMENTED_StmtAugAssign # Initalize an inverse dictionary of labels for all iterations - self._inv_labels = ( - OrderedDict() - ) # Inverse dictionary of labels, in term of the *previous layer* + self._inv_labels = OrderedDict() # Inverse dictionary of labels, in term of the *previous layer* self._inv_labels[0] = deepcopy(WL_labels_inverse) - self.feature_dims.append( - len(WL_labels_inverse) - ) # Update the zeroth iteration feature dim - - self._inv_label_node_attr = ( - OrderedDict() - ) # Inverse dictionary of labels, in term of the *node attribute* - self._label_node_attr = ( - OrderedDict() - ) # Same as above, but with key and value inverted - self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label( - WL_labels_inverse, 0 - ) + self.feature_dims.append(len(WL_labels_inverse)) # Update the zeroth iteration feature dim + + self._inv_label_node_attr = OrderedDict() # Inverse dictionary of labels, in term of the *node attribute* + self._label_node_attr = OrderedDict() # Same as above, but with key and value inverted + ( + self._label_node_attr[0], + self._inv_label_node_attr[0], + ) = self.translate_label(WL_labels_inverse, 0) if self.node_weights is not None: self._feature_weight = OrderedDict() # Ensure the order is the same self._feature_weight[0] = self._compute_feature_weight( self.node_weights, 0, WL_labels_inverse - )[1] + )[ + 1 + ] else: self._feature_weight = None @@ -323,7 +297,7 @@ def generate_graphs(label_count: int, WL_labels_inverse): L[j] = new_labels # add new labels new_graphs.append((Gs_ed[j], new_labels) + extras[j]) - yield new_graphs + NOT_YET_IMPLEMENTED_ExprYield for i in range(1, self._h): label_set, WL_labels_inverse, L_temp = set(), dict(), dict() @@ -333,10 +307,10 @@ def generate_graphs(label_count: int, WL_labels_inverse): # Keep for each node the temporary L_temp[j] = dict() for v in Gs_ed[j].keys(): - credential = ( - str(L[j][v]) - + "," - + str(sorted(L[j][n] for n in Gs_ed[j][v].keys())) + credential = str(L[j][v]) + "," + str( + sorted( + (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []) + ) ) L_temp[j][v] = credential label_set.add(credential) @@ -344,7 +318,7 @@ def generate_graphs(label_count: int, WL_labels_inverse): label_list = sorted(list(label_set)) for dv in label_list: WL_labels_inverse[dv] = label_count - label_count += 1 + NOT_YET_IMPLEMENTED_StmtAugAssign # Recalculate labels new_graphs = list() @@ -370,9 +344,11 @@ def generate_graphs(label_count: int, WL_labels_inverse): if self.node_weights is 
not None: self._feature_weight[i] = self._compute_feature_weight( self.node_weights, i, self._inv_label_node_attr[i] - )[1] + )[ + 1 + ] # assert len(self._feature_weight[i] == len(WL_labels_inverse)) - yield new_graphs + NOT_YET_IMPLEMENTED_ExprYield # Initialise the base graph kernel. base_graph_kernel = {} @@ -391,22 +367,21 @@ def generate_graphs(label_count: int, WL_labels_inverse): label_end_idx=self.feature_dims[i + 1], ) ) + elif self._method_calling == 1: + base_graph_kernel[i].fit( + g, + label_start_idx=self.feature_dims[i], + label_end_idx=self.feature_dims[i + 1], + ) else: - if self._method_calling == 1: - base_graph_kernel[i].fit( + K.append( + self.layer_weights[i] + * base_graph_kernel[i].fit_transform( g, label_start_idx=self.feature_dims[i], label_end_idx=self.feature_dims[i + 1], ) - else: - K.append( - self.layer_weights[i] - * base_graph_kernel[i].fit_transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - ) + ) if gp_fit: self.X_fit[self._h] = X @@ -453,7 +428,7 @@ def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: di 0, ] # Flush the feature dimensions if X is None: - raise ValueError("transform input cannot be None") + NOT_YET_IMPLEMENTED_StmtRaise else: km, self.X = self.parse_input(X, gp_fit=gp_fit) @@ -488,47 +463,39 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): # Input validation and parsing if X is None: - raise ValueError("transform input cannot be None") + NOT_YET_IMPLEMENTED_StmtRaise + elif not isinstance(X, collections.abc.Iterable): + NOT_YET_IMPLEMENTED_StmtRaise else: - if not isinstance(X, collections.abc.Iterable): - raise ValueError("input must be an iterable\n") - else: - nx = 0 - distinct_values = set() - Gs_ed, L = dict(), dict() - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(i)) - continue - - elif len(x) in [2, 3]: - x = Graph(x[0], x[1], {}, self._graph_format) - elif isinstance(x, Graph): - x.desired_format("dictionary") - else: - raise ValueError( - "each element of X must have at " - + "least one and at most 3 elements\n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") + nx = 0 + distinct_values = set() + Gs_ed, L = dict(), dict() + for i, x in enumerate(iter(X)): + is_iter = isinstance(x, collections.abc.Iterable) + if is_iter: + x = list(x) + if is_iter and len(x) in [0, 2, 3]: + if len(x) == 0: + warnings.warn("Ignoring empty element on index: " + str(i)) + continue + + elif len(x) in [2, 3]: + x = Graph(x[0], x[1], {}, self._graph_format) + elif isinstance(x, Graph): + x.desired_format("dictionary") + else: + NOT_YET_IMPLEMENTED_StmtRaise + Gs_ed[nx] = x.get_edge_dictionary() + L[nx] = x.get_labels(purpose="dictionary") - # Hold all the distinct values - distinct_values |= { - v for v in L[nx].values() if v not in self._inv_labels[0] - } - nx += 1 - if nx == 0: - raise ValueError("parsed input is empty") + # Hold all the distinct values + NOT_YET_IMPLEMENTED_StmtAugAssign + NOT_YET_IMPLEMENTED_StmtAugAssign + if nx == 0: + NOT_YET_IMPLEMENTED_StmtRaise nl = len(self._inv_labels[0]) - WL_labels_inverse = { - dv: idx for (idx, dv) in enumerate(sorted(list(distinct_values)), nl) - } + WL_labels_inverse = {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict} WL_labels_inverse = 
OrderedDict(WL_labels_inverse) def generate_graphs_transform(WL_labels_inverse, nl): @@ -544,21 +511,21 @@ def generate_graphs_transform(WL_labels_inverse, nl): L[j] = new_labels # produce the new graphs new_graphs.append([Gs_ed[j], new_labels]) - yield new_graphs + NOT_YET_IMPLEMENTED_ExprYield for i in range(1, self._h): new_graphs = list() L_temp, label_set = dict(), set() - nl += len(self._inv_labels[i]) + NOT_YET_IMPLEMENTED_StmtAugAssign for j in range(nx): # Find unique labels and sort them for both graphs # Keep for each node the temporary L_temp[j] = dict() for v in Gs_ed[j].keys(): - credential = ( - str(L[j][v]) - + "," - + str(sorted(L[j][n] for n in Gs_ed[j][v].keys())) + credential = str(L[j][v]) + "," + str( + sorted( + (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []) + ) ) L_temp[j][v] = credential if credential not in self._inv_labels[i]: @@ -583,7 +550,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): L[j] = new_labels # Create the new graphs with the new labels. new_graphs.append([Gs_ed[j], new_labels]) - yield new_graphs + NOT_YET_IMPLEMENTED_ExprYield if return_embedding_only: K = [] @@ -600,29 +567,11 @@ def generate_graphs_transform(WL_labels_inverse, nl): # Calculate the kernel matrix without parallelization if self.as_tensor: - summand = [ - self.layer_weights[i] - * self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - for i, g in enumerate(generate_graphs_transform(WL_labels_inverse, nl)) - ] + summand = [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] K = torch.stack(summand, dim=0).sum(dim=0) else: K = np.sum( - ( - self.layer_weights[i] - * self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - for (i, g) in enumerate( - generate_graphs_transform(WL_labels_inverse, nl) - ) - ), + (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []), axis=0, ) @@ -631,7 +580,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): X_diag, Y_diag = self.diagonal() if self.as_tensor: div_ = torch.sqrt(torch.ger(Y_diag, X_diag)) - K /= div_ + NOT_YET_IMPLEMENTED_StmtAugAssign else: old_settings = np.seterr(divide="ignore") K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag)))) @@ -667,7 +616,7 @@ def diagonal(self): if self._is_transformed: Y_diag = self.X[0].diagonal()[1] for i in range(1, self._h): - Y_diag += self.X[i].diagonal()[1] + NOT_YET_IMPLEMENTED_StmtAugAssign except NotFittedError: # Calculate diagonal of X if self._is_transformed: @@ -676,8 +625,8 @@ def diagonal(self): X_diag.flags.writeable = True for i in range(1, self._h): x, y = self.X[i].diagonal() - X_diag += x - Y_diag += y + NOT_YET_IMPLEMENTED_StmtAugAssign + NOT_YET_IMPLEMENTED_StmtAugAssign self._X_diag = X_diag # case sub kernel is only fitted @@ -686,7 +635,7 @@ def diagonal(self): X_diag.flags.writeable = True for i in range(1, self._n_iter): x = self.X[i].diagonal() - X_diag += x + NOT_YET_IMPLEMENTED_StmtAugAssign self._X_diag = X_diag if self.as_tensor: @@ -710,15 +659,18 @@ def translate_label(curr_layer: dict, h: int, prev_layer: dict = None): """ if h == 0: - return {v: str(k) for k, v in curr_layer.items()}, curr_layer + return ( + {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict}, + curr_layer, + ) else: - assert prev_layer is not None + NOT_YET_IMPLEMENTED_StmtAssert label_in_node_attr, inv_label_in_node_attr = 
OrderedDict(), OrderedDict() for pattern, encoding in curr_layer.items(): # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer root, leaf = literal_eval(pattern) root_ = prev_layer[root] - leaf_ = [prev_layer[i] for i in leaf] + leaf_ = [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] label_in_node_attr.update({encoding: "~".join([root_] + leaf_)}) inv_label_in_node_attr.update({"~".join([root_] + leaf_): encoding}) return label_in_node_attr, inv_label_in_node_attr @@ -739,18 +691,22 @@ def _compute_feature_weight( feature_weights_flattened = [] if h == 0: feature_weight = OrderedDict( - {k: (node_weight[k]) ** 2 for k in inv_label_node_attr.keys()} + {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict} ) - feature_weights_flattened = np.array(list(feature_weight.values())).flatten() + feature_weights_flattened = np.array( + list(feature_weight.values()) + ).flatten() else: for k, _ in inv_label_node_attr.items(): # k is the pattern, v is the encoding k_sep = k.split("~") - average_weight = np.mean([(node_weight[i]) ** 2 for i in k_sep]) + average_weight = np.mean( + [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] + ) feature_weights.update({k: average_weight}) feature_weights_flattened.append(average_weight) feature_weights_flattened = np.array(feature_weights_flattened).flatten() - assert len(feature_weights_flattened) == len(inv_label_node_attr) + NOT_YET_IMPLEMENTED_StmtAssert return feature_weights, feature_weights_flattened def dK_dX(self, X_test: None): diff --git a/neps/optimizers/bayesian_optimization/kernels/kernel.py b/neps/optimizers/bayesian_optimization/kernels/kernel.py index 57cd4895..42382a51 100644 --- a/neps/optimizers/bayesian_optimization/kernels/kernel.py +++ b/neps/optimizers/bayesian_optimization/kernels/kernel.py @@ -2,8 +2,8 @@ import copy import inspect -from abc import ABC, abstractmethod import math +from abc import ABC, abstractmethod from typing import Any, ClassVar, Generic, Mapping, Sequence, TypeVar from typing_extensions import Self @@ -56,12 +56,11 @@ def grid_search( x: T, y: torch.Tensor, *, - grid: Sequence[Mapping[str, Any]], + grid: Sequence[Mapping[str, Any]] | None = None, noise_variances: Sequence[float] = (1e-6,), - ) -> tuple[Self, float] | Exception: + ) -> tuple[Self, float]: # Returns: (Kernel[T], float) | None if failed - if len(grid) == 0: - raise ValueError("Grid must have at least one element.") + grid = grid or self.suggested_grid def _fit_and_eval( _params: Mapping[str, Any], @@ -70,7 +69,6 @@ def _fit_and_eval( K = cloned_kernel.forward(x) best_lml = -float("inf") - exception: Exception | None = None for noise_variance in noise_variances: K.diag().add_(noise_variance) @@ -81,18 +79,14 @@ def _fit_and_eval( K.diag().sub_(noise_variance) - if exception is None: - return cloned_kernel, best_lml - - return exception + return cloned_kernel, best_lml evals = [_fit_and_eval(params) for params in grid] evals_with_score = [e for e in evals if not isinstance(e, Exception)] if not any(evals_with_score): raise evals[-1] # type: ignore - best_eval = max(evals_with_score, key=lambda e: e[1]) # type: ignore - return best_eval + return max(evals_with_score, key=lambda e: e[1]) # type: ignore class NumericKernel(Kernel[torch.Tensor]): ... 
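The `grid_search` cleanup above keeps the candidate whose Gram matrix, with a noise variance added to its diagonal, yields the highest Gaussian-process log marginal likelihood of the normalized training targets. For reference, a self-contained sketch of that scoring step under the standard GP model; the helper below is only illustrative and is not the implementation in `kernel.py`, which goes through `compute_pd_inverse` and `log_marginal_likelihood` instead:

import math

import torch


def gp_log_marginal_likelihood(
    K: torch.Tensor, y: torch.Tensor, noise_variance: float
) -> torch.Tensor:
    # log p(y | K, s) = -1/2 y^T (K + s I)^-1 y - 1/2 log|K + s I| - n/2 log(2 pi)
    n = y.shape[0]
    K_noisy = K + noise_variance * torch.eye(n, dtype=K.dtype)
    L = torch.linalg.cholesky(K_noisy)  # fails if K_noisy is not positive definite
    alpha = torch.cholesky_solve(y.unsqueeze(-1), L)  # (K + s I)^-1 y
    log_det = 2.0 * torch.log(torch.diagonal(L)).sum()
    quad = (y.unsqueeze(-1) * alpha).sum()
    return -0.5 * quad - 0.5 * log_det - 0.5 * n * math.log(2 * math.pi)


# Grid search then amounts to evaluating this for every (kernel parameters,
# noise variance) combination and keeping the argmax, mirroring the loop in
# Kernel.grid_search.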
diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index 8bcbd45b..07b56333 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,12 +1,13 @@ from __future__ import annotations +from abc import ABC +from itertools import product from math import sqrt -from typing import Any, Mapping, Sequence, ClassVar -from typing_extensions import override, Self +from typing import Any, ClassVar, Mapping, Sequence +from typing_extensions import Self, override -from itertools import product import torch -import torch.nn as nn +from torch import nn from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel @@ -19,10 +20,10 @@ STD_ENCODED_OUTPUT_SCALE = (1e-2, 1e-1, 1, 1e1, 1e2) -class Stationary(Kernel[torch.Tensor]): +class NumericKernel(Kernel[torch.Tensor], ABC): suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ - {"lengthscale": l, "output_scale": o} - for l, o in product(LENGTHSCALE_GRID, STD_ENCODED_OUTPUT_SCALE) + {"lengthscale": _l, "output_scale": o} + for _l, o in product(LENGTHSCALE_GRID, STD_ENCODED_OUTPUT_SCALE) ] def __init__( @@ -59,25 +60,33 @@ def as_optimizable(self) -> Self: def forward(self, x: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: # NOTE: I don't think this is the right way to do this... - with torch.no_grad(): - self.lengthscale.data.clamp_(*self.lengthscale_bounds) - self.outputscale.data.clamp_(*self.outputscale_bounds) + if self.lengthscale_bounds is not None or self.outputscale_bounds is not None: + with torch.no_grad(): + if self.lengthscale_bounds is not None: + self.lengthscale.data.clamp_(*self.lengthscale_bounds) + if self.outputscale_bounds is not None: + self.outputscale.data.clamp_(*self.outputscale_bounds) x2 = x if x2 is None else x2 return self._forward(x, x2) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: ... 
+ + +class Stationary(NumericKernel): + @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - return self.outputscale * torch.cdist(x1, x2, p=2) + return self.outputscale * torch.cdist(x1, x2, p=2) / self.lengthscale -class RBFKernel(Stationary): +class RBFKernel(NumericKernel): @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: dist_sq = torch.cdist(x1, x2, p=2) ** 2 return self.outputscale * torch.exp(-dist_sq / (2 * self.lengthscale**2)) -class Matern32Kernel(Stationary): +class Matern32Kernel(NumericKernel): @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: dist = torch.cdist(x1, x2, p=2) / self.lengthscale @@ -86,7 +95,7 @@ def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: return self.outputscale * matern32 -class HammingKernel(Stationary): +class HammingKernel(NumericKernel): @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: dists = (x1.unsqueeze(1) != x2.unsqueeze(0)).float().sum(-1) / x1.shape[-1] @@ -94,7 +103,7 @@ def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: return self.outputscale * torch.exp(-scaled_dists) -class Matern52Kernel(Stationary): +class Matern52Kernel(NumericKernel): @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: dist = torch.cdist(x1, x2, p=2) / self.lengthscale diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py index 68d257b1..8c1feb26 100644 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py @@ -1,37 +1,38 @@ from __future__ import annotations +from itertools import product from typing import Any, ClassVar, Mapping, Sequence from typing_extensions import Self +import numpy as np +import numpy.typing as npt import torch -import torch.nn as nn -from itertools import product +from torch import nn -import numpy as np from neps.optimizers.bayesian_optimization.kernels.grakel_replace import ( VertexHistogram, WeisfeilerLehman as _WL, ) from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary -from neps.search_spaces.encoding import WLInput +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import NumericKernel GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)]) GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) -class WeisfilerLehman(Kernel[Sequence[WLInput]]): +class WeisfilerLehman(Kernel[npt.NDArray[np.object_]]): """Weisfiler Lehman kernel using grakel functions.""" suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ - {"h": h, "se_kernel": Stationary(lengthscale=l)} + {"h": h, "se_kernel": NumericKernel(lengthscale=l)} for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) ] def __init__( self, + *, h: int = 0, - se_kernel: Stationary | None = None, + se_kernel: NumericKernel | None = None, layer_weights: torch.Tensor | None = None, oa: bool = False, node_label: str = "op_name", @@ -70,7 +71,7 @@ def __init__( def as_optimizable(self) -> Self: return self.clone_with(layer_weights=nn.Parameter(self.layer_weights)) - def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: self.layer_weights.clamp_(0, 1) self.wl_kernel_ = _WL( h=self.h, @@ -87,12 
+88,27 @@ def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: normalize=True, ) - K = self.wl_kernel_.fit_transform(gr) + K = self.wl_kernel_.fit_transform(iter(gr)) return torch.as_tensor(K, dtype=torch.float64) - def transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + def transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: assert self.wl_kernel_ is not None self.layer_weights.clamp_(0, 1) - K = self.wl_kernel_.transform(gr) + K = self.wl_kernel_.transform(iter(gr)) return torch.as_tensor(K, dtype=torch.float64) + + def forward( + self, + x: npt.NDArray[np.object_], + x2: npt.NDArray[np.object_] | None = None, + ) -> torch.Tensor: + if x2 is None: + K = self.fit_transform(x) + self.wl_kernel_ = None + return K + + self.fit_transform(x) + K = self.transform(x2) + self.wl_kernel_ = None + return K diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index f46e89b9..e63c033f 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -3,36 +3,36 @@ import logging from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence -from typing_extensions import Literal -import torch.nn as nn import numpy as np import torch +from torch import nn +from torch.optim import SGD, Adam # type: ignore from neps.optimizers.bayesian_optimization.kernels.kernel import ( Kernel, - log_marginal_likelihood, compute_pd_inverse, + log_marginal_likelihood, ) from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( HammingKernel, Matern52Kernel, + NumericKernel, ) from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import ( WeisfilerLehman, ) from neps.search_spaces import SearchSpace from neps.search_spaces.encoding import ( - IntegerCategoricalTransformer, - JointTransformer, + CategoricalToIntegerTransformer, + DataPack, MinMaxNormalizer, OneHotEncoder, TensorTransformer, Transformer, WLInputTransformer, ) -from neps.search_spaces.hyperparameters.float import FloatParameter -from neps.search_spaces.hyperparameters.integer import IntegerParameter +from neps.search_spaces.hyperparameters import FloatParameter, IntegerParameter if TYPE_CHECKING: from neps.search_spaces.search_space import SearchSpace @@ -47,9 +47,9 @@ @dataclass class ComprehensiveGP: space: SearchSpace - kernels: dict[str, tuple[Kernel, Transformer]] - combined_kernel: Literal["sum", "product"] = "sum" + kernels: dict[str, tuple[Sequence[str], Kernel]] + combined_kernel: Literal["sum", "product"] = "sum" noise_variance: Sequence[float] = NOISE_VARIANCE_GRID kernel_parameter_grid: Mapping[str, Sequence[Mapping[str, Any]]] | bool = True @@ -60,20 +60,24 @@ class ComprehensiveGP: # Post fit attributes K_inv_: torch.Tensor | None = None - n_train_: int | None = None likelihood_: float | None = None y_: torch.Tensor | None = None y_normalized_: torch.Tensor | None = None y_mean_: float | None = None y_std_: float | None = None - optimized_kernels_: dict[str, Kernel] | None = None - train_data_: dict[str, Any] | None = None + opt_kernels_: dict[str, tuple[Sequence[str], Kernel]] | None = None + train_x_: DataPack | None = None def __post_init__(self): # TODO: Remove when search space is just definition and does not hold values. 
self.space = self.space.clone() - def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: + def fit( + self, + *, + x: DataPack, + train_y: torch.Tensor, + ) -> None: # Preprocessing y_ = torch.as_tensor(train_y, device=self.device, dtype=torch.float64) @@ -83,40 +87,19 @@ def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: self.y_normalized_ = (y_ - self.y_mean_) / self.y_std_ self.y_ = y_ - _data = { - key: transformer.encode(x, self.space) - for key, (_, transformer) in self.kernels.items() - } - # optimized kernel parameters + noise variance optim_vars: list[nn.Parameter] = [] + opt_kernels: dict[str, tuple[Sequence[str], Kernel]] = {} - grids = { - name: k.suggested_grid - for name, (k, _) in self.kernels.items() - if k.suggested_grid is not None - } - - kernels: dict[str, Kernel] = {} - for kernel_name, (kernel, _) in self.kernels.items(): - xs = _data[kernel_name] - grid = grids[kernel_name] - - maybe_optimized_kernel = kernel.grid_search( - x=xs, + N: int + for _kernel_name, (hps, kernel) in self.kernels.items(): + data = x.select(hps) + opt_kernel, _ = kernel.grid_search( + x=data, # type: ignore y=self.y_normalized_, - grid=grid, ) - if isinstance(maybe_optimized_kernel, Exception): - raise ValueError( - f"Failed to optimize kernel {kernel_name} with grid {grid}." - ) from maybe_optimized_kernel - - opt_kernel, _ = maybe_optimized_kernel - gradient_enabled_kernel = opt_kernel.as_optimizable() - kernels[kernel_name] = gradient_enabled_kernel - - optim_vars.extend(gradient_enabled_kernel.parameters()) + optim_vars.extend(opt_kernel.parameters()) + opt_kernels[_kernel_name] = (hps, opt_kernel) # Now that we've optimized the kernels, we convert go convert their # parameters into a tensor we can further refine with some optimizer iterations @@ -128,30 +111,25 @@ def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: optim_vars.append(noise_variance) if self.optimizer == "adam": - optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore + optim = Adam(optim_vars, **self.optimizer_kwargs) # type: ignore elif self.optimizer == "sgd": - optim = torch.optim.SGD(optim_vars, **self.optimizer_kwargs) # type: ignore + optim = SGD(optim_vars, **self.optimizer_kwargs) # type: ignore else: raise ValueError(f"Invalid optimizer {self.optimizer}") K_inv: torch.Tensor | None = None + _init = torch.zeros if self.combined_kernel == "sum" else torch.ones N = len(x) - for i in range(self.optimizer_iters): + K = _init((N, N), device=self.device, dtype=torch.float64) + for _i in range(self.optimizer_iters): optim.zero_grad() - # Now we iterate over kernels to build up K - _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - K = _init(N, N, device=self.device, dtype=torch.float64) - for kernel_name, kernel in kernels.items(): - data = _data[kernel_name] - gram = kernel.forward(data, data) - - if self.combined_kernel == "sum": - K.add_(gram) - else: - K.mul_(gram) - K.diag().add_(noise_variance) + for _kernel_name, (hps, opt_kernel) in opt_kernels.items(): + data = x.select(hps) + k = opt_kernel.forward(data) + K.add_(k) if self.combined_kernel == "sum" else K.mul_(k) + K.diag().add_(noise_variance) K_inv, logDetK = compute_pd_inverse(K) nlml = -log_marginal_likelihood(K_inv, logDetK, y=self.y_normalized_) @@ -166,52 +144,50 @@ def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: assert K_inv is not None self.K_inv_ = K_inv.clone() self.noise_variance_ = noise_variance.item() - self.optimized_kernels_ = 
kernels - self.n_train_ = N - self.train_data_ = _data - - def predict(self, x: list[dict[str, Any]]) -> tuple[torch.Tensor, torch.Tensor]: + self.opt_kernels_ = opt_kernels + self.train_x_ = x + + def predict( + self, + *, + x: DataPack, + ) -> tuple[torch.Tensor, torch.Tensor]: """Kriging predictions.""" if ( self.K_inv_ is None - or self.n_train_ is None - or self.optimized_kernels_ is None - or self.train_data_ is None + or self.train_x_ is None or self.y_normalized_ is None or self.y_std_ is None + or self.opt_kernels_ is None ): raise ValueError( "Inverse of Gram matrix is not instantiated. Please call the optimize " "function to fit on the training data first!" ) - _data = { - key: transformer.encode(x, self.space) - for key, (_, transformer) in self.kernels.items() - } _init = torch.zeros if self.combined_kernel == "sum" else torch.ones n_test = len(x) K_train_test = _init( - self.n_train_, n_test, device=self.device, dtype=torch.float64 + len(self.train_x_), n_test, device=self.device, dtype=torch.float64 ) - K_test_test = _init(n_test, n_test, device=self.device, dtype=torch.float64) - - for kernel_name, kernel in self.optimized_kernels_.items(): - train_x = self.train_data_[kernel_name] - test_x = _data[kernel_name] - - gram = kernel.forward(train_x, test_x) + for _kernel_name, (hps, opt_kernel) in self.opt_kernels_.items(): + train = self.train_x_.select(hps) + test = x.select(hps) + k = opt_kernel.forward(train, test) if self.combined_kernel == "sum": - K_train_test.add_(gram) + K_train_test.add_(k) else: - K_train_test.mul_(gram) + K_train_test.mul_(k) - gram = kernel.forward(test_x, test_x) + K_test_test = _init(n_test, n_test, device=self.device, dtype=torch.float64) + for _kernel_name, (hps, opt_kernel) in self.opt_kernels_.items(): + test = x.select(hps) + k = opt_kernel.forward(test, test) if self.combined_kernel == "sum": - K_test_test.add_(gram) + K_test_test.add_(k) else: - K_test_test.mul_(gram) + K_test_test.mul_(k) # Compute the predictive mean @@ -220,7 +196,6 @@ def predict(self, x: list[dict[str, Any]]) -> tuple[torch.Tensor, torch.Tensor]: mu_s = mu_s * self.y_std_ + self.y_mean_ cov_s = K_test_test - K_train_test.t() @ self.K_inv_ @ K_train_test - cov_s.diagonal().clamp_(self.noise_variance_, np.inf) cov_s *= self.y_std_**2 return mu_s, cov_s @@ -280,7 +255,7 @@ def get_default_kernels( transformer = JointTransformer.join(one_hot_encoder, fid_normalizer) kernels["vectorial"] = (Matern52Kernel(), transformer) else: - transformer = IntegerCategoricalTransformer(tuple(space.categoricals)) + transformer = CategoricalToIntegerTransformer(tuple(space.categoricals)) kernels["categorical"] = (HammingKernel(), transformer) return kernels diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index c5c47332..19efa6b6 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,21 +1,12 @@ from __future__ import annotations import random -from typing import Any, TYPE_CHECKING, Literal -from typing_extensions import override -from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP +from itertools import chain +from typing import TYPE_CHECKING, Any, Literal, Mapping -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.utils.common import instance_from_map -from neps.search_spaces import ( - CategoricalParameter, - ConstantParameter, - FloatParameter, - IntegerParameter, - 
SearchSpace, -) -from neps.optimizers.base_optimizer import BaseOptimizer +import torch + +from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.acquisition_functions import ( AcquisitionMapping, DecayingPriorWeightedAcquisition, @@ -23,15 +14,26 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping +from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP +from neps.search_spaces import ( + CategoricalParameter, + ConstantParameter, + FloatParameter, + IntegerParameter, + SearchSpace, +) +from neps.search_spaces.encoding import Encoder +from neps.utils.common import instance_from_map if TYPE_CHECKING: from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( BaseAcquisition, ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.state import BudgetInfo, Trial # TODO(eddiebergman): Why not just include in the definition of the parameters. CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) @@ -49,6 +51,7 @@ class BayesianOptimization(BaseOptimizer): def __init__( self, pipeline_space: SearchSpace, + *, initial_design_size: int = 10, surrogate_model: str | Any = "gp", acquisition: str | BaseAcquisition = "EI", @@ -62,7 +65,7 @@ def __init__( cost_value_on_error: None | float = None, logger=None, disable_priors: bool = False, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, sample_default_first: bool = False, ): """Initialise the BO loop. @@ -124,10 +127,7 @@ def __init__( self._initial_design_size = initial_design_size self._random_interleave_prob = random_interleave_prob - self._num_train_x: int = 0 self._num_error_evaluations: int = 0 - self._pending_evaluations: list = [] - self._model_update_failed: bool = False self.sample_default_first = sample_default_first if isinstance(surrogate_model, str): @@ -136,7 +136,11 @@ def __init__( space=pipeline_space, include_fidelities=False, ) + self._encoder = Encoder.default(self.pipeline_space) else: + raise NotImplementedError( + "Only 'gp' is supported as a surrogate model for now." + ) self.surrogate_model = instance_from_map( SurrogateModelMapping, surrogate_model, @@ -162,132 +166,100 @@ def __init__( name="acquisition sampler function", kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, ) - self._enhance_priors() - - def _enhance_priors(self, confidence_score: dict = None) -> None: - """Only applicable when priors are given along with a confidence. - - Args: - confidence_score: dict - The confidence scores for the 2 major variable types. 
- Example: {"categorical": 5.2, "numeric": 0.15} - """ - if self.prior_confidence is None: - return - if ( - hasattr(self.pipeline_space, "has_prior") - and not self.pipeline_space.has_prior - ): - return - for k, v in self.pipeline_space.items(): - if v.is_fidelity or isinstance(v, ConstantParameter): - continue - elif isinstance(v, (FloatParameter, IntegerParameter)): - if confidence_score is None: + if self.pipeline_space.has_prior: + for k, v in self.pipeline_space.items(): + if v.is_fidelity or isinstance(v, ConstantParameter): + continue + elif isinstance(v, (FloatParameter, IntegerParameter)): confidence = CUSTOM_FLOAT_CONFIDENCE_SCORES[self.prior_confidence] - else: - confidence = confidence_score["numeric"] - self.pipeline_space[k].default_confidence_score = confidence - elif isinstance(v, CategoricalParameter): - if confidence_score is None: + self.pipeline_space[k].default_confidence_score = confidence + elif isinstance(v, CategoricalParameter): confidence = CUSTOM_CATEGORICAL_CONFIDENCE_SCORES[ self.prior_confidence ] - else: - confidence = confidence_score["categorical"] - self.pipeline_space[k].default_confidence_score = confidence - return - - def is_init_phase(self) -> bool: - """Decides if optimization is still under the warmstart phase/model-based search.""" - if self._num_train_x >= self._initial_design_size: - return False - return True + self.pipeline_space[k].default_confidence_score = confidence - @override - def load_optimization_state( + def ask( self, - previous_results: dict[str, ConfigResult], - pending_evaluations: dict[str, SearchSpace], + trials: Mapping[str, Trial], budget_info: BudgetInfo | None, optimizer_state: dict[str, Any], - ) -> None: - train_x = [el.config for el in previous_results.values()] - train_y = [self.get_loss(el.result) for el in previous_results.values()] - if self.ignore_errors: - train_x = [x for x, y in zip(train_x, train_y) if y != "error"] - train_y_no_error = [y for y in train_y if y != "error"] - self._num_error_evaluations = len(train_y) - len(train_y_no_error) - train_y = train_y_no_error - self._num_train_x = len(train_x) - self._pending_evaluations = [el for el in pending_evaluations.values()] - if not self.is_init_phase(): + ) -> tuple[SampledConfig, dict[str, Any]]: + # TODO: Lift this into runtime, let the + # optimizer advertise the encoding wants... + completed = [ + t + for t in trials.values() + if t.report is not None and t.report.loss is not None + ] + train_x = [t.config for t in completed] + train_y: torch.Tensor = torch.as_tensor([t.report.loss for t in completed]) # type: ignore + + pending = [t.config for t in trials.values() if t.state.pending()] + + space = self.pipeline_space + + # TODO: This would be better if we could serialize these + # in their encoded form. later... 
+ for name, hp in space.categoricals.items(): + for config in chain(train_x, pending): + config[name] = hp.choices.index(config[name]) + for name, hp in space.graphs.items(): + for config in chain(train_x, pending): + config[name] = hp.clone().load_from(config[name]) + + if len(trials) == 0 and self.sample_default_first and space.has_prior: + config = space.sample_default_configuration( + patience=self.patience, ignore_fidelity=False + ) + elif len(trials) <= self._initial_design_size: + config = space.sample( + patience=self.patience, user_priors=True, ignore_fidelity=False + ) + elif random.random() < self._random_interleave_prob: + config = space.sample( + patience=self.patience, user_priors=False, ignore_fidelity=False + ) + else: try: - if len(self._pending_evaluations) > 0: + if len(pending) > 0: # We want to use hallucinated results for the evaluations that have # not finished yet. For this we fit a model on the finished # evaluations and add these to the other results to fit another model. self.surrogate_model.fit(train_x, train_y) - ys, _ = self.surrogate_model.predict(self._pending_evaluations) - train_x += self._pending_evaluations + ys, _ = self.surrogate_model.predict(pending) + train_x += pending train_y += list(ys.detach().numpy()) + # TODO: When using a GP, if we've already fit the + # model due to the if stamet above, we only + # need to update the model with the new points. + # fit on all the data again, only the new points... self.surrogate_model.fit(train_x, train_y) self.acquisition.set_state(self.surrogate_model) self.acquisition_sampler.set_state(x=train_x, y=train_y) + for _ in range(self.patience): + config = self.acquisition_sampler.sample(self.acquisition) + if config not in pending: + break + else: + config = space.sample( + patience=self.patience, user_priors=True, ignore_fidelity=False + ) - self._model_update_failed = False - except RuntimeError as runtime_error: + except RuntimeError as e: self.logger.exception( "Model could not be updated due to below error. Sampling will not use" - " the model." 
+ " the model.", + exc_info=e, ) - if self.loss_value_on_error is None or self.cost_value_on_error is None: - raise ValueError( - "A RuntimeError happened and " - "loss_value_on_error or cost_value_on_error " - "value is not provided, please fix the error or " - "provide the values to continue without " - "updating the model" - ) from runtime_error - self._model_update_failed = True - - def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: - if ( - self._num_train_x == 0 - and self.sample_default_first - and self.pipeline_space.has_prior - ): - config = self.pipeline_space.sample_default_configuration( - patience=self.patience, ignore_fidelity=False - ) - elif self._num_train_x == 0 and self._initial_design_size >= 1: - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - elif random.random() < self._random_interleave_prob: - config = self.pipeline_space.sample( - patience=self.patience, ignore_fidelity=False - ) - elif self.is_init_phase() or self._model_update_failed: - # initial design space - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - else: - for _ in range(self.patience): - config = self.acquisition_sampler.sample(self.acquisition) - if config not in self._pending_evaluations: - break - else: - config = self.pipeline_space.sample( + config = space.sample( patience=self.patience, user_priors=True, ignore_fidelity=False ) - config_id = str( - self._num_train_x - + self._num_error_evaluations - + len(self._pending_evaluations) - + 1 - ) - return config.hp_values(), config_id, None + config_id = str(len(trials) + 1) + return SampledConfig( + id=config_id, + config=config.hp_values(), + previous_config_id=None, + ), optimizer_state diff --git a/neps/search_spaces/distributions/__init__.py b/neps/search_spaces/distributions/__init__.py new file mode 100644 index 00000000..65151e66 --- /dev/null +++ b/neps/search_spaces/distributions/__init__.py @@ -0,0 +1,16 @@ +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.distributions.truncnorm import TruncNormDistribution +from neps.search_spaces.distributions.uniform_float import UniformFloatDistribution +from neps.search_spaces.distributions.uniform_int import UniformIntDistribution +from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution + +UNIT_UNIFORM = UniformFloatDistribution.new(0.0, 1.0) + +__all__ = [ + "Distribution", + "TruncNormDistribution", + "UniformFloatDistribution", + "UniformIntDistribution", + "UNIT_UNIFORM", + "WeightedIntsDistribution", +] diff --git a/neps/search_spaces/distributions/distribution.py b/neps/search_spaces/distributions/distribution.py new file mode 100644 index 00000000..7ab4dd6f --- /dev/null +++ b/neps/search_spaces/distributions/distribution.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, TypeVar +from typing_extensions import Protocol + +V = TypeVar("V", int, float) + + +if TYPE_CHECKING: + from torch import Generator, Tensor + + from neps.search_spaces.domain import Domain + + +class Distribution(Protocol[V]): + @property + def domain(self) -> Domain[V]: ... + + def sample(self, n: int, to: Domain, *, seed: Generator) -> Tensor: ... + + def likelihood(self, value: Tensor) -> Tensor: ... 
diff --git a/neps/search_spaces/distributions/truncnorm.py b/neps/search_spaces/distributions/truncnorm.py new file mode 100644 index 00000000..3938cf1c --- /dev/null +++ b/neps/search_spaces/distributions/truncnorm.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +from dataclasses import dataclass +from functools import lru_cache +from typing import TYPE_CHECKING, Any +from typing_extensions import override + +import torch +from torch import Tensor + +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.domain import Domain + +if TYPE_CHECKING: + from neps.utils.types import Number + +INT_HIGH = 1_000_000 + + +@lru_cache +def _truncnorm(a: float, b: float, loc: float, scale: float) -> Any: + from scipy.stats import truncnorm + + return truncnorm(a=a, b=b, loc=loc, scale=scale) + + +@dataclass(frozen=True) +class TruncNormDistribution(Distribution[float]): + domain: Domain[float] + center: float + std: float + truncnorm: Any + + @override + def sample(self, n: int, seed: torch.Generator) -> Tensor: + random_state = torch.randint(INT_HIGH, size=(1,), generator=seed) + rv = self.truncnorm.rvs(size=n, random_state=random_state.item()) + return torch.tensor(rv, dtype=self.domain.dtype) + + @override + def likelihood(self, value: Tensor) -> Tensor: + return self.truncnorm.pdf(value.numpy()) + + def normalize(self) -> TruncNormDistribution: + # Send to unit domain + center = float(self.domain.from_unit(torch.tensor(self.center)).item()) + std = self.std / self.domain.length + + return TruncNormDistribution( + domain=Domain.unit_float(), + center=center, + std=std, + truncnorm=_truncnorm( + a=(0 - center) / std, + b=(1 - center) / std, + loc=center, + scale=std, + ), + ) + + def with_center_and_confidence( + self, + center: Number, + confidence: float, + ) -> TruncNormDistribution: + assert 0 <= confidence <= 1 + assert self.domain.lower <= center <= self.domain.upper + std = 1 - confidence + center = float(center) + return TruncNormDistribution( + domain=self.domain, + center=center, + std=std, + truncnorm=_truncnorm( + a=(self.domain.lower - center) / std, + b=(self.domain.upper - center) / std, + loc=center, + scale=std, + ), + ) + + @classmethod + def new( + cls, + lower: Number, + center: Number, + upper: Number, + *, + std: Number, + std_is_normalized: bool, + ) -> TruncNormDistribution: + assert lower <= center <= upper, f"{lower} <= {center} <= {upper}" + center = float(center) + + if std_is_normalized: + assert 0 <= std <= 1 + std = float((upper - lower) * std) + else: + assert std > 0 + std = float(std) + + return cls( + domain=Domain.float(float(lower), float(upper)), + center=center, + std=std, + truncnorm=_truncnorm( + a=(lower - center) / std, + b=(upper - center) / std, + loc=center, + scale=std, + ), + ) diff --git a/neps/search_spaces/distributions/uniform_float.py b/neps/search_spaces/distributions/uniform_float.py new file mode 100644 index 00000000..bdb43ee8 --- /dev/null +++ b/neps/search_spaces/distributions/uniform_float.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing_extensions import override + +import torch +from torch import Tensor + +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain + +INT_HIGH = 1_000_000 + + +@dataclass(frozen=True) +class UniformFloatDistribution(Distribution[float]): + domain: Domain[float] + _pdf: float = field(repr=False) + + @override + def 
sample(self, n: int, to: Domain, seed: torch.Generator) -> Tensor: + # This creates samples in a unit float domain, rather than + # the `.domain` attribute of this distribution. Rather than scale + # up twice, we just scale directly form the UNIT_FLOAT_DOMAIN + # We still however need the `.domain` attribute for `likelihood` + unit_samples = torch.rand(n, generator=seed) + return to.cast(unit_samples, UNIT_FLOAT_DOMAIN) + + @override + def likelihood(self, value: Tensor) -> Tensor: + return torch.where( + (value >= self.domain.lower) & (value <= self.domain.upper), + self._pdf, + 0.0, + ) + + @classmethod + def new(cls, lower: int | float, upper: int | float) -> UniformFloatDistribution: + _pdf = 1.0 / (upper - lower) + return cls(Domain.float(lower, upper), _pdf=_pdf) + + @classmethod + def unit_distribution(cls) -> UniformFloatDistribution: + return UNIT_UNIFORM_FLOAT + + +UNIT_UNIFORM_FLOAT = UniformFloatDistribution.new(0.0, 1.0) diff --git a/neps/search_spaces/distributions/uniform_int.py b/neps/search_spaces/distributions/uniform_int.py new file mode 100644 index 00000000..8fd7b043 --- /dev/null +++ b/neps/search_spaces/distributions/uniform_int.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING +from typing_extensions import override + +import torch +from torch import Tensor + +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.domain import Domain + +if TYPE_CHECKING: + from neps.utils.types import Number + + +@dataclass(frozen=True) +class UniformIntDistribution(Distribution[int]): + domain: Domain[int] + _pdf: float = field(repr=False) + + @override + def sample(self, n: int, to: Domain, *, seed: torch.Generator) -> Tensor: + samples = torch.randint( + self.domain.lower, + self.domain.upper, + size=(n,), + generator=seed, + ) + return to.cast(samples, frm=self.domain) + + @override + def likelihood(self, value: Tensor) -> Tensor: + return torch.where( + (value >= self.domain.lower) & (value <= self.domain.upper), + self._pdf, + 0.0, + ) + + @classmethod + def indices(cls, n: int) -> UniformIntDistribution: + return cls(Domain.int(0, n - 1), _pdf=1.0 / n) + + @classmethod + def new(cls, lower: Number, upper: Number) -> UniformIntDistribution: + return cls(Domain.int(lower, upper), _pdf=1.0 / (upper - lower)) diff --git a/neps/search_spaces/distributions/weighted_ints.py b/neps/search_spaces/distributions/weighted_ints.py new file mode 100644 index 00000000..3c8c60c5 --- /dev/null +++ b/neps/search_spaces/distributions/weighted_ints.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar, Sequence +from typing_extensions import override + +import torch +from torch import Tensor + +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.domain import Domain + +if TYPE_CHECKING: + from neps.utils.types import Number + + +@dataclass(frozen=True) +class WeightedIntsDistribution(Distribution[int]): + # NOTE: Having a Million weights is very resource intense and super slow + # for sampling, especially given our common use case is to have only one weight + # with the rest being uniform. 100 is well out of scope for what was intended, + # as this is mostly intended for categoricals. 
+ # If we need this, then we should make a more efficient implementation, + # such as one that uniform samples and then with probability `weight` + # replaces the value with the favoured value. + LIMIT_FOR_WEIGHTED_INTS: ClassVar[int] = 200 + + domain: Domain[int] + weights: Tensor + + @override + def sample(self, n: int, to: Domain, *, seed: torch.Generator) -> Tensor: + rand_tensor = torch.multinomial( + self.weights, + n, + replacement=True, + generator=seed, + ) + return to.cast(rand_tensor, frm=self.domain) + + @override + def likelihood(self, value: Tensor) -> Tensor: + valid_indices = torch.logical_and( + value >= self.domain.lower, value <= self.domain.upper + ) + psuedo_indices = torch.where(valid_indices, value, 0) + probs = self.weights[psuedo_indices] + return torch.where(valid_indices, probs, 0) + + @classmethod + def new(cls, weights: Sequence[Number] | Tensor) -> WeightedIntsDistribution: + if len(weights) > cls.LIMIT_FOR_WEIGHTED_INTS: + raise ValueError( + f"Having {len(weights)} weights is very resource intense and slow" + " for sampling. Consider using a more efficient implementation" + " if you need this many weights.", + ) + return cls( + weights=torch.as_tensor(weights, dtype=torch.float64), + domain=Domain.indices(len(weights)), + ) + + @classmethod + def with_favoured( + cls, + n: int, + favoured: int, + confidence: float, + ) -> WeightedIntsDistribution: + if n > cls.LIMIT_FOR_WEIGHTED_INTS: + raise ValueError( + f"Having {n} weights is very resource intense and slow" + " for sampling. Consider using a more efficient implementation" + " if you need this many weights.", + ) + + assert 0.0 <= confidence <= 1.0 + remaining = 1.0 - confidence + rest = remaining / (n - 1) + if confidence < rest: + warnings.warn( + f"Weight {confidence} is less than the rest {rest}." + " This will make the favoured value less likely to be sampled" + " than the rest of the values.", + UserWarning, + stacklevel=2, + ) + dist = torch.full(size=(n,), fill_value=rest, dtype=torch.float64) + dist[favoured] = confidence + return cls(weights=dist, domain=Domain.indices(n)) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py new file mode 100644 index 00000000..06814862 --- /dev/null +++ b/neps/search_spaces/domain.py @@ -0,0 +1,316 @@ +# TODO: Could theoretically implement dtype,device,out for all methods here but +# would need to be careful not to accidentally send to and from GPU. 
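The NOTE at the top of `WeightedIntsDistribution` sketches a cheaper scheme for its dominant use case (one favoured index, everything else uniform): draw uniform indices and then overwrite each draw with the favoured index with some fixed probability. The following standalone sketch illustrates that idea; `sample_with_favoured` is a hypothetical helper, not part of the patch, and the replacement probability is derived so the favoured index ends up with mass `confidence` while the rest share the remainder evenly (which requires `confidence >= 1 / n_choices`).

from __future__ import annotations

import torch


def sample_with_favoured(
    n_samples: int,
    n_choices: int,
    favoured: int,
    confidence: float,
    *,
    seed: torch.Generator,
) -> torch.Tensor:
    # Probability with which a uniform draw is overwritten by the favoured index,
    # derived from (1 - p) / n_choices == (1 - confidence) / (n_choices - 1).
    assert confidence >= 1.0 / n_choices
    replace_prob = 1.0 - n_choices * (1.0 - confidence) / (n_choices - 1)

    uniform = torch.randint(n_choices, (n_samples,), generator=seed)
    overwrite = torch.rand(n_samples, generator=seed) < replace_prob
    return torch.where(overwrite, torch.tensor(favoured), uniform)


gen = torch.Generator().manual_seed(0)
samples = sample_with_favoured(10_000, n_choices=5, favoured=2, confidence=0.6, seed=gen)
print(torch.bincount(samples, minlength=5) / 10_000)  # roughly [0.1, 0.1, 0.6, 0.1, 0.1]
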
+from __future__ import annotations + +import math +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Generic, TypeVar +from typing_extensions import TypeAlias + +import torch +from torch import Tensor + +if TYPE_CHECKING: + from neps.search_spaces.distributions.truncnorm import TruncNormDistribution + from neps.search_spaces.distributions.uniform_float import ( + UniformFloatDistribution, + ) + from neps.search_spaces.distributions.uniform_int import UniformIntDistribution + from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution + + +Number = int | float +V = TypeVar("V", int, float) +V2 = TypeVar("V2", int, float) + + +@dataclass(frozen=True) +class NumberDomain(Generic[V]): + lower: V + upper: V + round: bool + log_bounds: tuple[float, float] | None = None + bins: int | None = None + + dtype: torch.dtype = field(init=False, repr=False) + is_unit: bool = field(init=False, repr=False) + midpoint: V = field(init=False, repr=False) + is_log: bool = field(init=False, repr=False) + length: V = field(init=False, repr=False) + cardinality: int | None = field(init=False, repr=False) + + def __post_init__(self): + assert isinstance(self.lower, type(self.upper)) + object.__setattr__(self, "is_unit", self.lower == 0 and self.upper == 1) + object.__setattr__(self, "is_log", self.log_bounds is not None) + object.__setattr__( + self, "dtype", torch.int64 if isinstance(self.lower, int) else torch.float64 + ) + object.__setattr__(self, "length", self.upper - self.lower) + + if self.bins: + cardinality = self.bins + elif self.round: + cardinality = int(self.upper - self.lower + 1) + else: + cardinality = None + + object.__setattr__(self, "cardinality", cardinality) + mid = self.from_unit(torch.tensor(0.5)).item() + if self.dtype == torch.int64: + mid = int(round(mid)) + object.__setattr__(self, "midpoint", mid) + + @classmethod + def float( + cls, + lower: Number, + upper: Number, + *, + log: bool = False, + bins: int | None = None, + ) -> NumberDomain[float]: + return NumberDomain( + lower=float(lower), + upper=float(upper), + log_bounds=(math.log(lower), math.log(upper)) if log else None, + bins=bins, + round=False, + ) + + @classmethod + def int( + cls, + lower: Number, + upper: Number, + *, + log: bool = False, + bins: int | None = None, + ) -> NumberDomain[int]: + return NumberDomain( + lower=int(round(lower)), + upper=int(round(upper)), + log_bounds=(math.log(lower), math.log(upper)) if log else None, + round=True, + bins=bins, + ) + + @classmethod + def indices(cls, n: int) -> NumberDomain[int]: + """Create a domain for a range of indices. + + Like range based functions this domain is inclusive of the lower bound + and exclusive of the upper bound. 
+ + Use this method to create a domain for indices + """ + return NumberDomain.int(0, n - 1) + + def to_unit(self, x: Tensor) -> Tensor: + if self.is_unit: + return x # type: ignore + + if self.log_bounds is not None: + x = torch.log(x) + lower, upper = self.log_bounds + else: + lower, upper = self.lower, self.upper + + return (x - lower) / (upper - lower) + + def from_unit(self, x: Tensor) -> Tensor: + if self.is_unit: + return x + + bins = self.bins + if bins is not None: + quantization_levels = torch.floor(x * bins).clip(0, bins - 1) + x = quantization_levels / (bins - 1) + + # Now we scale to the new domain + if self.log_bounds is not None: + lower, upper = self.log_bounds + x = x * (upper - lower) + lower + x = torch.exp(x) + else: + lower, upper = self.lower, self.upper + x = x * (upper - lower) + lower + + if self.round: + x = torch.round(x) + + return x.type(self.dtype) + + def cast( + self, + x: Tensor, + frm: Domain, + ) -> Tensor: + if isinstance(frm, OneHotDomain): + x = torch.argmax(x, dim=1) + frm = frm.int_domain + + # NOTE: In general, we should always be able to go through the unit interval + # [0, 1] to be able to transform between domains. However sometimes we can + # bypass some steps, dependant on the domains, hence the ugliness... + + # Shortcut 1. (Same Domain) + # We can shortcut out going through normalized space if all the boundaries and + # they live on the same scale. However, if their bins don't line up, we will + # have to go through unit space to figure out the bins + same_bounds = self.lower == frm.lower and self.upper == frm.upper + same_log_bounds = self.log_bounds == frm.log_bounds + same_bins = self.bins == frm.bins + if same_bounds and same_log_bounds and (self.bins is None or same_bins): + if self.round: + x = torch.round(x) + return x.type(self.dtype) + + # Shortcut 2. (From normalized) + # The domain we are coming from is already normalized, we only need to lift + if frm.is_unit: + return self.from_unit(x) # type: ignore + + # Shortcut 3. (Log lift) + # We can also shortcut out if the only diffrence is that we are coming frm the + # log bounds of this domain. We dont care if where we came from was binned or not, + # we just lift it up with `np.exp` and round if needed + if (self.lower, self.upper) == frm.log_bounds and self.bins is None: + x = torch.exp(x) + if self.round: + x = torch.round(x) + return x.type(self.dtype) + + # Otherwise, through the unit interval we go + norm = frm.to_unit(x) + lift = self.from_unit(norm) + return lift # noqa: RET504 + + def uniform_distribution(self) -> UniformFloatDistribution | UniformIntDistribution: + from neps.search_spaces.distributions import ( + UNIT_UNIFORM, + UniformFloatDistribution, + UniformIntDistribution, + ) + + # (Log Lift) - sample on it's log domain + if self.log_bounds is not None: + return UniformFloatDistribution.new(*self.log_bounds) + + # (Same Domain) - Just sample integers + if self.dtype == torch.int64 and self.bins is None: + return UniformIntDistribution.new(self.lower, self.upper) + + # NOTE: There's a possibility where you could use an integer distribution for + # binned domains, however the cost of sampling integers and casting is likely + # higher than just casting from normalized domain. 
Would need to verify this + # In any case, Normalized Uniform Float is a safe choice + + # (From Normalized) + return UNIT_UNIFORM + + def unit_uniform_distribution(self) -> UniformFloatDistribution: + from neps.search_spaces.distributions import UNIT_UNIFORM + + return UNIT_UNIFORM + + def truncnorm_distribution( + self, + center: Number, + *, + confidence: float | None = None, + std: float | None = None, + ) -> TruncNormDistribution: + from neps.search_spaces.distributions import TruncNormDistribution + + # If you need a unit one, create this and then call `normalize()` on it. + if std is None and confidence is None: + raise ValueError( + "Must specify either `std` in (lower, upper) or `confidence` in (0, 1)" + ) + + if std is None: + assert 0 <= confidence <= 1 # type: ignore + _std = float(1 - confidence) # type: ignore + _is_normalized = True + else: + _std = float(std) + _is_normalized = False + + # (Log Lift) - sample on it's log domain + if self.log_bounds is not None: + return TruncNormDistribution.new( + lower=self.log_bounds[0], + center=math.log(center), + upper=self.log_bounds[1], + std=_std, + std_is_normalized=_is_normalized, + ) + + # NOTE: There's a possibility where you could use an integer distribution for + # binned domains, however the cost of sampling integers and casting is likely + # higher than just casting from normalized domain. Would need to verify this + # In any case, Normalized Uniform Float is a safe choice + + # (From Normalized) + truncnorm = TruncNormDistribution.new( + lower=self.lower, + center=math.log(center), + upper=self.upper, + std=_std, + std_is_normalized=_is_normalized, + ) + return truncnorm.normalize() + + def weighted_indices_distribution( + self, center_index: int, *, confidence: float + ) -> WeightedIntsDistribution: + from neps.search_spaces.distributions import WeightedIntsDistribution + + if self.cardinality is None: + raise ValueError( + "Cannot create a weighted distribution for a continuous domain!" + ) + if not isinstance(center_index, int): + raise ValueError( + f"Center index must be an integer of type {self.dtype} to" + " create a weighted distribution!" 
+ ) + assert 0 <= confidence <= 1 + + return WeightedIntsDistribution.with_favoured( + n=self.cardinality, + favoured=int(round(center_index)), + confidence=confidence, + ) + + @classmethod + def unit_float(cls) -> NumberDomain[float]: + return UNIT_FLOAT_DOMAIN + + +@dataclass(frozen=True) +class OneHotDomain: + cardinality: int + int_domain: NumberDomain[int] = field(init=False, repr=False) + + def __post_init__(self): + object.__setattr__( + self, + "int_domain", + NumberDomain.indices(self.cardinality), + ) + + def cast(self, x: Tensor, frm: NumberDomain[int]) -> Tensor: + # Convert to integers first + x = self.int_domain.cast(x, frm) + + # Then one hot encode + buffer = torch.zeros((len(x), self.cardinality)) + buffer.scatter_(1, x.unsqueeze(1), 1) + return buffer + + +UNIT_FLOAT_DOMAIN = NumberDomain.float(0.0, 1.0) + +Domain: TypeAlias = NumberDomain | OneHotDomain diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 592cfa88..adcaa121 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,360 +1,442 @@ from __future__ import annotations from dataclasses import dataclass, field -from grakel.utils import graph_from_networkx - -from typing import Any, TypeAlias, TypeVar, Generic -from typing_extensions import Self, override, Self -from itertools import chain -import torch -from neps.search_spaces import ( - CategoricalParameter, - IntegerParameter, - FloatParameter, +from typing import ( + TYPE_CHECKING, + Any, + Generic, + Sequence, + Sized, + TypeAlias, + TypeVar, + overload, ) +from typing_extensions import Protocol, override -from neps.search_spaces.search_space import SearchSpace, Parameter - -WLInput: TypeAlias = tuple[dict, dict | None, dict | None] - +import numpy as np +import numpy.typing as npt +import torch +from grakel.utils import graph_from_networkx -@dataclass -class GraphEncoder: - hps: tuple[str] +from neps.search_spaces.domain import ( + UNIT_FLOAT_DOMAIN, + Domain, + NumberDomain, + OneHotDomain, +) - def encode( - self, - x: list[dict[str, Any]], - space: SearchSpace, - ) -> dict[str, list[WLInput]]: - return {hp: [config[hp].value for config in x] for hp in self.hps} +if TYPE_CHECKING: + import networkx as nx + from neps.search_spaces.search_space import SearchSpace +WLInput: TypeAlias = tuple[dict, dict | None, dict | None] +V = TypeVar("V", int, float) T = TypeVar("T") -@dataclass -class Transformer(Generic[T]): - hps: tuple[str] - - def encode(self, x: list[dict[str, Any]], space: SearchSpace) -> T: ... +class Transformer(Protocol[T]): + def encode(self, x: Sequence[Any]) -> T: ... - def value_decode(self, x: T, space: SearchSpace) -> dict[str, list[Any]]: ... + def decode(self, x: T) -> list[Any]: ... - def decode(self, x: T, space: SearchSpace) -> list[dict[str, Any]]: - values = self.value_decode(x, space) - return [(dict(zip(values, t))) for t in zip(*values.values())] +class TensorTransformer(Transformer[torch.Tensor], Protocol): + domain: Domain + output_cols: int -@dataclass -class WLInputTransformer(Transformer[WLInput]): def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, - ) -> dict[str, list[WLInput]]: - _graphs: dict[str, list[WLInput]] = {} - for hp_name in space.graphs.keys(): - gs = [conf[hp_name].value for conf in x] - _graphs[hp_name] = graph_from_networkx(gs) # type: ignore + x: list[Any], + *, + out: torch.Tensor | None = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: ... 
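The optional `out` parameter in the protocol above is what allows a single pre-allocated `(n_configs, total_width)` tensor to be filled one column block at a time instead of concatenating a tensor per transformer; this is how `TensorEncoder`, further down in this file, calls `encode`. A plain-torch illustration of that buffer-slicing pattern, with made-up values standing in for a numeric column and a 3-wide one-hot block:

import torch

values = [0.25, 0.5, 1.0]   # pretend output of a numeric transformer
choices = [1, 0, 2]         # pretend integer codes for a 3-choice categorical

n, width = len(values), 1 + 3
buffer = torch.zeros((n, width), dtype=torch.float64)

# The numeric transformer writes into its single column of the shared buffer.
buffer[:, 0] = torch.tensor(values, dtype=torch.float64)

# The one-hot transformer scatters into its own slice, columns 1..3.
one_hot = buffer[:, 1:4]  # a view, so writes land directly in `buffer`
one_hot.scatter_(1, torch.tensor(choices).unsqueeze(1), 1)

print(buffer)
# rows: [0.25, 0, 1, 0], [0.5, 1, 0, 0], [1.0, 0, 0, 1]
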
- return _graphs - def value_decode( - self, - x: dict[str, list[WLInput]], - space: SearchSpace, - ) -> dict[str, list[Any]]: - raise NotImplementedError("Cannot decode WLInput to values.") +@dataclass +class CategoricalToIntegerTransformer(TensorTransformer): + choices: list[Any] + domain: NumberDomain = field(init=False) + output_cols: int = field(init=False) + _lookup: dict[Any, int] | None = field(init=False) -@dataclass -class TensorTransformer(Transformer[torch.Tensor]): - def output_cols(self, space: SearchSpace) -> int: ... + def __post_init__(self): + assert len(self.choices) > 0 + + self.domain = NumberDomain.indices(len(self.choices)) + self.output_cols = 1 + if len(self.choices) > 3: + try: + self._lookup = {c: i for i, c in enumerate(self.choices)} + except TypeError: + self._lookup = None + @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[Any], *, - device: torch.device | None = None, + out: torch.Tensor | None = None, dtype: torch.dtype | None = None, + device: torch.device | None = None, ) -> torch.Tensor: - width = len(self.hps) - buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) - - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, CategoricalParameter) - values = torch.tensor( - [config[name]._value_index for config in x], dtype=dtype, device=device - ) + if dtype is None: + dtype = torch.int if out is None else out.dtype - return buffer + values = ( + [self._lookup[c] for c in x] + if self._lookup + else [self.choices.index(c) for c in x] + ) + + if out is None: + return torch.tensor(values, dtype=dtype, device=device) + assert out.shape == (len(x),), f"{out.shape} != {(len(x),)}" + out[:] = torch.tensor(values, dtype=out.dtype, device=out.device) + return out + @override + def decode(self, x: torch.Tensor) -> list[Any]: + return [self.choices[i] for i in x] + + +# TODO: Maybe add a shift argument, could be useful to have `0` as midpoint +# and `-0.5` as lower bound with `0.5` as upper bound. 
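Assuming this patch is applied, `CategoricalToIntegerTransformer` round-trips raw categorical values to integer codes and back. The sketch below uses four made-up choices so the dict lookup built in `__post_init__` is exercised, and also shows the `out=` buffer path that `TensorEncoder` relies on:

import torch

from neps.search_spaces.encoding import CategoricalToIntegerTransformer

t = CategoricalToIntegerTransformer(choices=["adam", "sgd", "rmsprop", "adamw"])

encoded = t.encode(["sgd", "adamw", "adam"])  # tensor([1, 3, 0]) of integer codes
decoded = t.decode(encoded)                   # ["sgd", "adamw", "adam"]

# Encoding straight into a caller-owned buffer, the way TensorEncoder fills
# one column slice per transformer:
out = torch.empty(3, dtype=torch.float64)
t.encode(["sgd", "adamw", "adam"], out=out)
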
@dataclass -class IntegerCategoricalTransformer(TensorTransformer): - def output_cols(self, space: SearchSpace) -> int: - return len(self.hps) +class MinMaxNormalizer(TensorTransformer, Generic[V]): + original_domain: NumberDomain[V] + + domain: NumberDomain[float] = field(init=False) + output_cols: int = field(init=False) + + def __post_init__(self): + self.domain = UNIT_FLOAT_DOMAIN + self.output_cols = 1 @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[V], *, + out: torch.Tensor | None = None, dtype: torch.dtype | None = None, device: torch.device | None = None, ) -> torch.Tensor: - if dtype is None: - dtype = torch.int - - buffer = torch.empty(size=(len(x), len(self.hps)), dtype=dtype, device=device) - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, CategoricalParameter) - values = torch.tensor( - [config[name].value for config in x], dtype=dtype, device=device - ) - buffer[:, i] = values + if out is not None: + dtype = out.dtype + device = out.device + else: + dtype = torch.float64 if dtype is None else dtype - return buffer + values = torch.tensor(list(x), dtype=dtype, device=device) + values = self.domain.cast(values, frm=self.original_domain) + if out is None: + return values - @override - def value_decode(self, x: torch.Tensor, space: SearchSpace) -> dict[str, list[Any]]: - values: dict[str, list[Any]] = {} - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, CategoricalParameter) - enc = x[:, i] - values[name] = [hp.choices[i] for i in enc.tolist()] + assert out.shape == (len(x),), f"{out.shape} != {(len(x),)}" + out[:] = values + return out - return values + @override + def decode(self, x: torch.Tensor) -> list[V]: + values = self.original_domain.from_unit(x) + return values.tolist() @dataclass -class MinMaxNormalizer(TensorTransformer): - def output_cols(self, space: SearchSpace) -> int: - return len(self.hps) +class OneHotEncoder(TensorTransformer): + choices: list[Any] + + domain: OneHotDomain = field(init=False) + output_cols: int = field(init=False) + categorical_to_integer: CategoricalToIntegerTransformer = field(init=False) + + def __post_init__(self): + self.categorical_to_integer = CategoricalToIntegerTransformer(self.choices) + self.output_cols = len(self.choices) @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[Any], *, + out: torch.Tensor | None = None, dtype: torch.dtype | None = None, device: torch.device | None = None, ) -> torch.Tensor: - if dtype is None: - dtype = torch.float64 + if out is not None: + dtype = out.dtype + device = out.device + else: + dtype = torch.float64 if dtype is None else dtype + + ints = self.categorical_to_integer.encode(x, dtype=torch.int64, device=device) + shape = (len(x), self.output_cols) + if out is None: + buffer = torch.zeros(size=shape, dtype=dtype, device=device) + else: + assert out.shape == shape, f"{out.shape} != {shape}" + buffer = out + + cat_tensor = torch.tensor(ints, dtype=torch.int64, device=device).unsqueeze(1) + buffer.scatter_(1, cat_tensor, 1) + return buffer - width = len(self.hps) - buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) + @override + def decode(self, x: torch.Tensor) -> list[Any]: + ints = torch.argmax(x, dim=1) + return self.categorical_to_integer.decode(ints) - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, (FloatParameter, IntegerParameter)) - values = torch.tensor( - [config[name].value for config in x], 
dtype=dtype, device=device - ) - if hp.log_bounds: - lower, upper = hp.log_bounds - buffer[:, i] = (torch.log(values) - lower) / (upper - lower) - else: - lower, upper = hp.lower, hp.upper - buffer[:, i] = (values - lower) / (upper - lower) - return buffer +@dataclass +class WLInputTransformer(Transformer[WLInput]): + hp: str - @override - def value_decode( - self, - x: torch.Tensor, - space: SearchSpace, - ) -> dict[str, list[Any]]: - values: dict[str, list[Any]] = {} + def encode(self, x: Sequence[nx.Graph]) -> list[WLInput]: + return [graph_from_networkx(g) for g in x] # type: ignore - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, (FloatParameter, IntegerParameter)) - enc = x[:, i] - if hp.log_bounds: - lower, upper = hp.log_bounds - enc = torch.exp(enc * (upper - lower) + lower) - else: - lower, upper = hp.lower, hp.upper - enc = enc * (upper - lower) + lower + def decode(self, x: dict[str, list[WLInput]]) -> dict[str, list[Any]]: + raise NotImplementedError("Cannot decode WLInput to values.") - if isinstance(hp, IntegerParameter): - enc = torch.round(enc).to(torch.int) - values[name] = enc.tolist() +@dataclass +class GraphEncoder: + transformers: dict[str, WLInputTransformer] + column_lookup: dict[str, int] = field(init=False) + + def __post_init__(self): + transformers = sorted(self.transformers.items(), key=lambda t: t[0]) + self.transformers = dict(transformers) + self.column_lookup: dict[str, int] = { + name: i for i, (name, _) in enumerate(self.transformers.items()) + } + + def select( + self, x: npt.NDArray[np.object_], hp: str | Sequence[str] + ) -> npt.NDArray[np.object_]: + # Kind of a redundant function but made to be compatible with TensorPack + if isinstance(hp, str): + return x[:, self.column_lookup[hp]] + + return x[:, [self.column_lookup[h] for h in hp]] + + def encode(self, x: list[SearchSpace]) -> npt.NDArray[np.object_]: + buffer = np.empty((len(x), len(self.transformers)), dtype=np.object_) + for hp, transformer in self.transformers.items(): + values = [conf[hp].value for conf in x] + buffer[:, self.column_lookup[hp]] = transformer.encode(values) # type: ignore + return buffer - return values + def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: + raise NotImplementedError("Cannot decode graph embeddings.") @dataclass -class StandardNormalizer(TensorTransformer): - std_means: dict[str, tuple[float, float]] = field(default_factory=dict) +class TensorEncoder: + transformers: dict[str, TensorTransformer] + column_lookup: dict[str, tuple[int, int]] = field(init=False) - def output_cols(self, space: SearchSpace) -> int: - return len(self.hps) + def __post_init__(self): + transformers = sorted( + self.transformers.items(), key=lambda t: (t[1].output_cols, t[0]) + ) + self.transformers = dict(transformers) + self.column_lookup: dict[str, tuple[int, int]] = {} + offset = 0 + for name, transformer in self.transformers.items(): + self.column_lookup[name] = (offset, offset + transformer.output_cols) + offset += transformer.output_cols + + def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: + if isinstance(hp, str): + return x[:, slice(*self.column_lookup[hp])] + cols = torch.concatenate([torch.arange(*self.column_lookup[h]) for h in hp]) + return x[:, cols] - @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[SearchSpace], *, - dtype: torch.dtype | None = None, device: torch.device | None = None, ) -> torch.Tensor: - if dtype is None: - dtype = torch.float64 - - 
width = len(self.hps) - buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) - std_means: dict[str, tuple[float, float]] = {} - - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, (FloatParameter, IntegerParameter)) - values = torch.tensor( - [config[name].value for config in x], dtype=dtype, device=device + width = sum(t.output_cols for t in self.transformers.values()) + buffer = torch.empty((len(x), width), dtype=torch.float64, device=device) + + for hp_name, transformer in self.transformers.items(): + values = [conf[hp_name] for conf in x] + lookup = self.column_lookup[hp_name] + + # Encode directly into buffer + transformer.encode( + values, + out=buffer[:, slice(*lookup)], + dtype=torch.float64, + device=device, ) - if hp.log_bounds: - values = torch.log(values) - - mean, std = values.mean(), values.std() - std_means[name] = (mean.item(), std.item()) - - buffer[:, i] = (values - mean) / std - self.std_means = std_means return buffer - @override - def value_decode(self, x: torch.Tensor, space: SearchSpace) -> dict[str, list[Any]]: + def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} + for hp_name, transformer in self.transformers.items(): + lookup = self.column_lookup[hp_name] + values[hp_name] = transformer.decode(x[:, slice(*lookup)]) - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, Parameter) - enc = x[:, i] - if isinstance(hp, (FloatParameter, IntegerParameter)): - std, mean = self.std_means[name] - if hp.log_bounds: - enc = torch.exp(enc * std + mean) - else: - enc = enc * std + mean - - if isinstance(hp, IntegerParameter): - enc = torch.round(enc).to(torch.int) - - values[name] = enc.tolist() - else: - raise ValueError(f"Invalid hyperparameter type: {type(hp)}") - - return values + keys = list(values.keys()) + return [dict(zip(keys, vals)) for vals in zip(*values.values())] @dataclass -class OneHotEncoder(TensorTransformer): - def output_cols(self, space: SearchSpace) -> int: - return sum(len(hp.choices) for hp in (space[name] for name in self.hps)) # type: ignore +class DataEncoder: + tensors: TensorEncoder | None = None + graphs: GraphEncoder | None = None - @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[SearchSpace], *, - dtype: torch.dtype | None = None, device: torch.device | None = None, - ) -> torch.Tensor: - if dtype is None: - dtype = torch.bool + ) -> tuple[torch.Tensor | None, npt.NDArray[np.object_] | None]: + tensor = self.tensors.encode(x, device=device) if self.tensors else None + graphs = self.graphs.encode(x) if self.graphs else None + return tensor, graphs - categoricals: dict[str, CategoricalParameter] = {} - for name in self.hps: - hp = space[name] - assert isinstance(hp, CategoricalParameter) - categoricals[name] = hp + @overload + def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: ... - width = sum(len(hp.choices) for hp in categoricals.values()) - buffer = torch.zeros(size=(len(x), width), dtype=dtype, device=device) + @overload + def select( + self, x: npt.NDArray[np.object_], hp: str | Sequence[str] + ) -> npt.NDArray[np.object_]: ... 
- offset = 0 - for name, hp in categoricals.items(): - n_choices = len(hp.choices) - _xs = [config[name]._value_index for config in x] - cat_tensor = torch.tensor(_xs, dtype=torch.int64, device=device).unsqueeze(1) - buffer[:, offset : offset + n_choices].scatter_(1, cat_tensor, 1) - offset += n_choices - - return buffer - - @override - def value_decode( + def select( self, - x: torch.Tensor, - space: SearchSpace, - ) -> dict[str, list[Any]]: - values: dict[str, list[Any]] = {} + x: torch.Tensor | npt.NDArray[np.object_], + hp: str | Sequence[str], + ) -> torch.Tensor | npt.NDArray[np.object_]: + if isinstance(x, torch.Tensor): + assert self.tensors is not None + return self.tensors.select(x, hp) - offset = 0 - for name in self.hps: - hp = space[name] - assert isinstance(hp, CategoricalParameter) - n_choices = len(hp.choices) - enc = x[:, offset : offset + n_choices].argmax(dim=1) + assert self.graphs is not None + return self.graphs.select(x, hp) - values[name] = [hp.choices[i] for i in enc] - offset += n_choices - - return values + def decode_dicts( + self, + x: torch.Tensor + | npt.NDArray[np.object_] + | tuple[torch.Tensor | None, npt.NDArray[np.object_] | None], + ) -> list[dict[str, Any]]: + if isinstance(x, tuple): + tensors, graphs = x + elif isinstance(x, torch.Tensor): + tensors, graphs = x, None + else: + tensors, graphs = None, x + + tensor_values: list[dict[str, Any]] | None = None + if tensors is not None: + assert self.tensors is not None + tensor_values = self.tensors.decode_dicts(tensors) + + graph_values: list[dict[str, Any]] | None = None + if graphs is not None: + assert self.graphs is not None + graph_values = self.graphs.decode_dicts(graphs) + + if tensor_values is not None and graph_values is not None: + assert len(tensor_values) == len(graph_values) + return [{**t, **g} for t, g in zip(tensor_values, graph_values)] + + if tensor_values is not None: + return tensor_values + + assert graph_values is not None + return graph_values @dataclass -class JointTransformer(TensorTransformer): - transforms: tuple[TensorTransformer, ...] - - def output_cols(self, space: SearchSpace) -> int: - return sum(t.output_cols(space) for t in self.transforms) +class DataPack(Sized): + space: SearchSpace + encoder: DataEncoder + numerical: torch.Tensor | None = None + graphs: npt.NDArray[np.object_] | None = None + _len: int = field(init=False) + + def __post_init__(self): + if self.numerical is not None and self.graphs is not None: + assert len(self.numerical) == len(self.graphs) + self._len = len(self.numerical) + elif self.numerical is not None: + self._len = len(self.numerical) + elif self.graphs is not None: + self._len = len(self.graphs) + else: + raise ValueError("At least one of numerical or graphs must be provided") + + def __len__(self) -> int: + return self._len + + def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.object_]: + if isinstance(hp, str): + if self.encoder.tensors and hp in self.encoder.tensors.transformers: + assert self.numerical is not None + return self.encoder.tensors.select(self.numerical, hp) + + if self.encoder.graphs and hp in self.encoder.graphs.transformers: + assert self.graphs is not None + return self.encoder.graphs.select(self.graphs, hp) + + tkeys = ( + None + if self.encoder.tensors is None + else self.encoder.tensors.transformers.keys() + ) + gkeys = ( + None + if self.encoder.graphs is None + else self.encoder.graphs.transformers.keys() + ) + raise KeyError( + f"Unknown hyperparameter {hp}. 
Not in either tensors or graphs" + f"\nTensors: {tkeys}" + f"\nGraphs: {gkeys}" + ) - @classmethod - def join(cls, *transforms: TensorTransformer) -> Self: - hps = tuple(chain.from_iterable(t.hps for t in transforms)) - return cls(hps, transforms) + all_in_tensors = False + all_in_graphs = False + tkeys = None + gkeys = None + if self.encoder.tensors: + all_in_tensors = all(h in self.encoder.tensors.transformers for h in hp) + + if self.encoder.graphs: + all_in_graphs = all(h in self.encoder.graphs.transformers for h in hp) + gkeys = self.encoder.graphs.transformers.keys() + + if not all_in_tensors and not all_in_graphs: + raise ValueError( + "Cannot select from both tensors and graphs!" + f"Got keys: {hp}" + f"\nTensors: {tkeys}" + f"\nGraphs: {gkeys}" + ) - @override - def encode( - self, - x: list[dict[str, Any]], - space: SearchSpace, - *, - dtype: torch.dtype | None = None, - device: torch.device | None = None, - ) -> torch.Tensor: - return torch.cat( - [t.encode(x, space, dtype=dtype, device=device) for t in self.transforms], - dim=1, - ) + if all_in_tensors: + assert self.numerical is not None + assert self.encoder.tensors is not None + return self.encoder.tensors.select(self.numerical, hp) - @override - def value_decode( - self, - x: torch.Tensor, - space: SearchSpace, - ) -> dict[str, list[Any]]: - values: dict[str, list[Any]] = {} - offset = 0 - for t in self.transforms: - width = t.output_cols(space) - t_values = t.value_decode(x[:, offset : offset + width], space) - values.update(t_values) - offset += width + assert self.graphs is not None + assert self.encoder.graphs is not None + return self.encoder.graphs.select(self.graphs, hp) - return values + def decode(self) -> list[SearchSpace]: + return [ + self.space.from_dict(d) + for d in self.encoder.decode_dicts((self.numerical, self.graphs)) + ] diff --git a/neps/search_spaces/neighborhoods.py b/neps/search_spaces/neighborhoods.py new file mode 100644 index 00000000..91c34a6f --- /dev/null +++ b/neps/search_spaces/neighborhoods.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +from typing import TypeVar + +import numpy as np + +from neps.search_spaces.domain import Domain +from neps.utils.types import Arr, f64, i64 + +V = TypeVar("V", f64, i64) + +UNIQUE_NEIGHBOR_GENERATOR_N_RETRIES = 8 +UNIQUE_NEIGHBOR_GENERATOR_SAMPLE_MULTIPLIER = 4 + +NON_UNIQUE_NEIGHBORS_N_RETRIES = 8 +NON_UNIQUE_NEIGHBORS_SAMPLE_MULTIPLIER = 4 + +# Small enough but prevents needing to keep re-allocating temporary memory +# 50 * 8 = 400 bytes +_SMALL = 50 +_SMALL_CACHED_ARANGE = np.arange(_SMALL, dtype=i64) + + +def unorded_finite_neighbors( + pivot: V, + domain: Domain[V], + *, + n: int, + seed: np.random.Generator, +) -> Arr[V]: + N = domain.cardinality + assert N is not None, "Domain must be finite." + if N <= _SMALL: + full_range = _SMALL_CACHED_ARANGE[: domain.cardinality] + else: + full_range = np.arange(N, dtype=i64) + + range_domain = Domain.indices(N) + _pivot = range_domain.cast(pivot, frm=domain) + + left = full_range[:_pivot] + right = full_range[_pivot + 1 :] + _range = np.concatenate((left, right)) + + seed.shuffle(_range) + + return domain.cast(_range[:n], frm=range_domain) + + +def neighbors( + pivot: V, + domain: Domain[V], + *, + n: int, + std: float, + seed: np.random.Generator, + n_retries: int = NON_UNIQUE_NEIGHBORS_N_RETRIES, + sample_multiplier: int = NON_UNIQUE_NEIGHBORS_SAMPLE_MULTIPLIER, +) -> Arr[V]: + """Create a neighborhood of `n` neighbors around `pivot` with a normal distribution. 
+ + If you need unique neighbors, you should use + [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood]. + + !!! tip + + [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood] + is quite expensive in certain situations as it has to repeatedly sample and check + for uniqueness. If you can afford duplicates, use this function instead. + + If [`domain.cardinality == None`][neps.search_spaces.domain.Domain.cardinality], + and you can afford an infentesimally small percentage change of duplicates, + you should use this function instead. + + !!! warning + + It is up to the caller to ensure that the pivot lies within the domain, + including at one of the bins if the domain is quantized. + + Args: + pivot: The center of the neighborhood. + domain: The domain to get neighbors from. + n: The number of neighbors to generate. + std: The standard deviation of the normal distribution. + seed: The random seed to use. + n_retries: + The number of retries to attempt to generate unique neighbors. + Each retry increases the standard deviation of the normal distribution to + prevent rejection sampling from failing. + sample_multiplier: + A multiplier which multiplies by `n` to determine the number of samples to + generate for try. By oversampling, we prevent having to repeated calls to + sampling. This prevents having to do more rounds of sampling when too many + samples are out of bounds, useful for when the `pivot` is near the bounds. + + Tuning this may be beneficial in unique circumstances, however we advise + leaving this as a default. + + Returns: + An array of `n` neighbors around `pivot`. + """ + # Generate batches of n * BUFFER_MULTIPLIER candidates, filling the above + # buffer until we have enough valid candidates. + # We should not overflow as the buffer + offset = 0 + SAMPLE_SIZE = n * sample_multiplier + BUFFER_SIZE = (n + 1) * sample_multiplier + + # We extend the range of stds to try to find neighbors + neighbors: Arr[V] = np.empty(BUFFER_SIZE, dtype=domain.dtype) + stds = np.linspace(std, 1.0, n_retries + 1, endpoint=True) + + lower = domain.lower + upper = domain.upper + range_size = upper - lower + sample_domain = Domain.float(lower, upper) + + for _std in stds: + candidates = seed.normal(pivot, _std * range_size, size=(SAMPLE_SIZE,)) + + bounded_candidates = candidates[(candidates >= lower) & (candidates <= upper)] + maybe_valid = domain.cast(bounded_candidates, frm=sample_domain) + + # High chance of overlap with original point if there's a finite amount of + # possible elements + if domain.cardinality is not None: + valid = maybe_valid[maybe_valid != pivot] + else: + valid = maybe_valid + + n_candidates = len(valid) + neighbors[offset : offset + n_candidates] = valid + offset += n_candidates + + if offset >= n: + return neighbors[:n] + + raise ValueError( + f"Failed to find enough neighbors with {n_retries} retries." + f" Given {n} neighbors, we only found {offset}." 
+        f" The normal distributions used for sampling neighbors were"
+        f" Normal(mu={pivot}, sigma={list(stds)}),"
+        f" which were meant to find neighbors of {pivot},"
+        " which was expected to be in the range"
+        f" ({lower}, {upper}).",
+    )
+
+
+def unique_neighborhood(
+    pivot: V,
+    domain: Domain[V],
+    *,
+    n: int,
+    seed: np.random.Generator,
+    std: float,
+    n_retries: int = UNIQUE_NEIGHBOR_GENERATOR_N_RETRIES,
+    sample_multiplier: int = UNIQUE_NEIGHBOR_GENERATOR_SAMPLE_MULTIPLIER,
+) -> Arr[V]:
+    """Create a neighborhood of `n` unique neighbors around `pivot` with a normal distribution.
+
+    The neighborhood is created by sampling from a normal distribution centered around
+    `pivot` with a standard deviation of `std`. The samples are then quantized into the
+    range `[lower, upper]`, using the domain's bins if it has any. The number of samples
+    is `n`.
+
+    !!! tip
+
+        [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood]
+        is quite expensive in certain situations as it has to repeatedly sample and
+        check for uniqueness. If you can afford duplicates, use
+        [`neighbors`][neps.search_spaces.neighborhoods.neighbors] instead.
+
+        If [`domain.cardinality == None`][neps.search_spaces.domain.Domain.cardinality],
+        and you can afford an infinitesimally small chance of duplicates,
+        you should also use [`neighbors`][neps.search_spaces.neighborhoods.neighbors].
+
+    !!! warning
+
+        If there are not enough unique neighbors to sample from, the function will
+        return fewer than `n` neighbors.
+
+    !!! warning
+
+        It is up to the caller to ensure that the pivot lies within the domain,
+        including at one of the bins if the domain is quantized.
+
+    Args:
+        pivot: The center of the neighborhood.
+        domain: The domain to get neighbors from.
+        n: The number of neighbors to generate.
+        std: The standard deviation of the normal distribution.
+        seed: The random seed to use.
+        n_retries:
+            The number of retries to attempt to generate unique neighbors.
+            Each retry increases the standard deviation of the normal distribution
+            to prevent rejection sampling from failing.
+        sample_multiplier:
+            A multiplier applied to `n` to determine the number of samples to
+            generate per try. By oversampling, we avoid repeated calls to both
+            sampling and uniqueness checking.
+
+            However, oversampling is a tradeoff: when `std` is not high enough to
+            produce `n` unique neighbors, it effectively samples more of the same
+            duplicates.
+
+            Tuning this may be beneficial in unusual circumstances; however, we
+            advise leaving it at the default.
+
+    Returns:
+        An array of `n` neighbors around `pivot`, or fewer than `n` if not enough
+        unique neighbors could be generated.
+    """  # noqa: E501
+    # Different from other neighborhoods as it's unnormalized and
+    # the quantization is directly integers.
+    assert n < 1000000, "Can only generate less than 1 million neighbors."
+    assert 0 < std < 1.0, "Standard deviation must be in the range (0, 1)."
+    lower = domain.lower
+    upper = domain.upper
+
+    # In the easiest case, we have a domain with finite elements and we need
+    # more neighbors than are possible. We then generate all of them.
+    # We can do this simply with a range and removing the pivot.
+ if domain.cardinality is not None and n >= domain.cardinality - 1: + range_domain = Domain.indices(domain.cardinality) + int_pivot = range_domain.cast(pivot, frm=domain) + + if int_pivot == 0: + _range = np.arange(1, domain.cardinality, dtype=i64) + return domain.cast(_range, frm=range_domain) + + if int_pivot == domain.cardinality - 1: + _range = np.arange(0, domain.cardinality - 1, dtype=i64) + return domain.cast(_range, frm=range_domain) + + left = np.arange(0, int_pivot, dtype=i64) + right = np.arange(int_pivot + 1, domain.cardinality, dtype=i64) + _range = np.concatenate((left, right)) + + return domain.cast(_range, frm=range_domain) + + # Otherwise, we use a repeated sampling strategy where we slowly increase the + # std of a normal, centered on `center`, slowly expanding `std` such that + # rejection won't fail. + + # We set up a buffer that can hold the number of neighbors we need, plus some + # extra excess from sampling, preventing us from having to reallocate memory. + # We also include the initial value in the buffer, as we will remove it later. + SAMPLE_SIZE = n * sample_multiplier + BUFFER_SIZE = n * (sample_multiplier + 1) + neighbors = np.empty(BUFFER_SIZE + 1, dtype=domain.dtype) + neighbors[0] = pivot + offset = 1 # Indexes into current progress of filling buffer + stds = np.linspace(std, 1.0, n_retries + 1, endpoint=True) + sample_domain = Domain.float(lower, upper) + + range_size = upper - lower + for _std in stds: + # Generate candidates in vectorized space + candidates = seed.normal(pivot, _std * range_size, size=SAMPLE_SIZE) + valid = (candidates >= lower) & (candidates <= upper) + + candidates = domain.cast(x=candidates[valid], frm=sample_domain) + + # Find new unique neighbors + uniq = np.unique(candidates) + new_uniq = np.setdiff1d(uniq, neighbors[:offset], assume_unique=True) + + n_new_unique = len(new_uniq) + neighbors[offset : offset + n_new_unique] = new_uniq + offset += n_new_unique + + # We have enough neighbors, we can stop + if offset - 1 >= n: + # Ensure we don't include the initial value point + return neighbors[1 : n + 1] + + raise ValueError( + f"Failed to find enough neighbors with {n_retries} retries." + f" Given {n=} neighbors to generate, we only found {offset - 1}." + f" The normal's for sampling neighbors were Normal({pivot}, {list(stds)})" + f" which were meant to find neighbors of {pivot}. 
in the range" + f" ({lower}, {upper}).", + ) diff --git a/neps/search_spaces/samplers/__init__.py b/neps/search_spaces/samplers/__init__.py new file mode 100644 index 00000000..784b5aa4 --- /dev/null +++ b/neps/search_spaces/samplers/__init__.py @@ -0,0 +1,9 @@ +from neps.search_spaces.samplers.prior import PriorSampler +from neps.search_spaces.samplers.sampler import Sampler +from neps.search_spaces.samplers.uniform import UniformSampler + +__all__ = [ + "Sampler", + "UniformSampler", + "PriorSampler", +] diff --git a/neps/search_spaces/samplers/model.py b/neps/search_spaces/samplers/model.py new file mode 100644 index 00000000..c413b6bf --- /dev/null +++ b/neps/search_spaces/samplers/model.py @@ -0,0 +1,186 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Mapping + +import numpy as np + +from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping +from neps.optimizers.bayesian_optimization.acquisition_samplers import ( + AcquisitionSamplerMapping, +) +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels +from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping +from neps.search_spaces.samplers.sampler import Sampler +from neps.search_spaces.samplers.uniform import UniformSampler +from neps.utils.common import instance_from_map + +logger = logging.getLogger(__name__) + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces import SearchSpace + from neps.utils.types import Number + + +class ModelPolicy(Sampler): + """A policy for sampling configuration, i.e. the default for SH / hyperband. + + Args: + SamplingPolicy ([type]): [description] + """ + + def __init__( + self, + *, + space: SearchSpace, + surrogate_model: str | Any = "gp", + surrogate_model_args: Mapping[str, Any] | None = None, + domain_se_kernel: str | None = None, + graph_kernels: list | None = None, + hp_kernels: list | None = None, + acquisition: str | BaseAcquisition | type[BaseAcquisition] = "EI", + acquisition_sampler: ( + str | AcquisitionSampler | type[AcquisitionSampler] + ) = "random", + patience: int = 100, + ): + surrogate_model_args = dict(surrogate_model_args) if surrogate_model_args else {} + + graph_kernels, hp_kernels = get_kernels( + pipeline_space=space, + domain_se_kernel=domain_se_kernel, + graph_kernels=graph_kernels, + hp_kernels=hp_kernels, + optimal_assignment=False, + ) + + if "graph_kernels" not in surrogate_model_args: + surrogate_model_args["graph_kernels"] = None + + if "hp_kernels" not in surrogate_model_args: + surrogate_model_args["hp_kernels"] = hp_kernels + + if not surrogate_model_args["hp_kernels"]: + raise ValueError("No kernels are provided!") + + if "vectorial_features" not in surrogate_model_args: + # TODO: Graph gets ignored? 
+ surrogate_model_args["vectorial_features"] = { + "continuous": len(space.numericals), + "categorical": len(space.categoricals), + } + + # TODO: What the hell type is this + self.surrogate_model: Any = instance_from_map( + SurrogateModelMapping, + surrogate_model, + name="surrogate model", + kwargs=surrogate_model_args, + ) + + self.acquisition: BaseAcquisition = instance_from_map( + AcquisitionMapping, + acquisition, # type: ignore + name="acquisition function", + ) + + self.acquisition_sampler: AcquisitionSampler = instance_from_map( + AcquisitionSamplerMapping, + acquisition_sampler, # type: ignore + name="acquisition sampler function", + kwargs={"patience": patience, "pipeline_space": space}, + ) + self.uniform_sampler = UniformSampler.new(space) + + def _fantasize_pending(self, train_x, train_y, pending_x): + if len(pending_x) == 0: + return train_x, train_y + + self.surrogate_model.fit(train_x, train_y) + # hallucinating: predict for the pending evaluations + _y, _ = self.surrogate_model.predict(pending_x) + _y = _y.detach().numpy().tolist() + # appending to training data + train_x.extend(pending_x) + train_y.extend(_y) + return train_x, train_y + + def update_model(self, train_x, train_y, pending_x, decay_t=None): + if decay_t is None: + decay_t = len(train_x) + train_x, train_y = self._fantasize_pending(train_x, train_y, pending_x) + self.surrogate_model.fit(train_x, train_y) + self.acquisition.set_state(self.surrogate_model, decay_t=decay_t) + # TODO: set_state should generalize to all options + # no needed to set state of sampler when using `random` + # self.acquisition_sampler.set_state(x=train_x, y=train_y) + + def sample( + self, + n: int, + *, + active_max_fidelity: Mapping[str, Number] | None = None, + fidelity: Mapping[str, Number] | None = None, + seed: np.random.Generator, + ) -> SearchSpace: + """Performs the equivalent of optimizing the acquisition function. + + Performs 2 strategies as per the arguments passed: + * If fidelity is not None, triggers the case when the surrogate has been + trained jointly with the fidelity dimension, i.e., all observations ever + recorded. In this case, the EI for random samples is evaluated at the + `fidelity` where the new sample will be evaluated. The top-10 are selected, + and the EI for them is evaluated at the target/mmax fidelity. + * If active_max_fidelity is not None, triggers the case when a surrogate is + trained per fidelity. In this case, all samples have their fidelity + variable set to the same value. This value is same as that of the fidelity + value of the configs in the training data. 
+ """ + logger.info("Acquiring...") + + # sampling random configurations + samples = [ + self.space.sample(user_priors=False, ignore_fidelity=True) + for _ in range(SAMPLE_THRESHOLD) + ] + + if fidelity is not None: + # w/o setting this flag, the AF eval will set all fidelities to max + self.acquisition.optimize_on_max_fidelity = False + _inc_copy = self.acquisition.incumbent + # TODO: better design required, for example, not import torch + # right now this case handles the 2-step acquisition in `sample` + if "incumbent" in kwargs: + # sets the incumbent to the best score at the required fidelity for + # correct computation of EI scores + self.acquisition.incumbent = torch.tensor(kwargs["incumbent"]) + # updating the fidelity of the sampled configurations + samples = list(map(update_fidelity, samples, [fidelity] * len(samples))) + # computing EI at the given `fidelity` + eis = self.acquisition.eval(x=samples, asscalar=True) + # extracting the 10 highest scores + _ids = np.argsort(eis)[-TOP_EI_SAMPLE_COUNT:] + samples = pd.Series(samples).iloc[_ids].values.tolist() + # setting the fidelity to the maximum fidelity + self.acquisition.optimize_on_max_fidelity = True + self.acquisition.incumbent = _inc_copy + + if active_max_fidelity is not None: + # w/o setting this flag, the AF eval will set all fidelities to max + self.acquisition.optimize_on_max_fidelity = False + fidelity = active_max_fidelity + samples = list(map(update_fidelity, samples, [fidelity] * len(samples))) + + # computes the EI for all `samples` + eis = self.acquisition.eval(x=samples, asscalar=True) + # extracting the highest scored sample + return samples[np.argmax(eis)] + # TODO: can generalize s.t. sampler works for all types, currently, + # random sampler in NePS does not do what is required here + # return self.acquisition_sampler.sample(self.acquisition) diff --git a/neps/search_spaces/samplers/prior.py b/neps/search_spaces/samplers/prior.py new file mode 100644 index 00000000..65165cae --- /dev/null +++ b/neps/search_spaces/samplers/prior.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Mapping +from typing_extensions import Self, override + +from neps.search_spaces.config import Config +from neps.search_spaces.distributions.uniform_int import UniformIntDistribution +from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution +from neps.search_spaces.samplers.sampler import Sampler + +if TYPE_CHECKING: + import numpy as np + + from neps.search_spaces.distributions.distribution import Distribution + from neps.search_spaces.search_space import SearchSpace + + +@dataclass +class PriorSampler(Sampler): + search_space: SearchSpace + + _numerical_distributions: Mapping[str, Distribution] + _categorical_distributions: Mapping[str, Distribution] + + @override + def sample_configs( + self, + n: int, + *, + fidelity: Mapping[str, float] | None, + seed: np.random.Generator, + with_constants: bool = True, + ) -> list[Config]: + numerical_samples = {} + for k, dist in self._numerical_distributions.items(): + param = self.search_space.numericals[k] + numerical_samples[k] = dist.sample(n, to=param.domain, seed=seed) + + categorical_samples = {} + for k, dist in self._categorical_distributions.items(): + cat = self.search_space.categoricals[k] + domain = cat.domain + samples = dist.sample(n, to=domain, seed=seed) + choices = cat.lookup(samples) + categorical_samples[k] = choices + + graph_samples = {} + for k, v in 
self.search_space.graphs.items(): + graph_samples[k] = [v.sample() for _ in range(n)] + + _constants = self.search_space.constants if with_constants else {} + + return [ + Config( + values={ + **{k: samples[i] for k, samples in numerical_samples.items()}, + **{k: samples[i] for k, samples in categorical_samples.items()}, + **{k: samples[i] for k, samples in graph_samples.items()}, + **_constants, + }, + fidelity=fidelity, + ) + for i in range(n) + ] + + @classmethod + def new( + cls, + space: SearchSpace, + prior: Mapping[str, tuple[Any, float]], + *, + replace_missing_with_uniform: bool = True, + ) -> Self: + missing = set(space.hyperparameters) - set(prior.keys()) + if not replace_missing_with_uniform and any(missing): + raise ValueError( + "If `replace_missing_with_uniform` is False, the prior must be defined" + f" for all parameters. Missing prior for: {missing}" + ) + + numerical_distributions = { + hp_name: ( + hp.domain.truncnorm_distribution(center=p[0], confidence=p[1]) + if (p := prior.get(hp_name)) + else hp.domain.uniform_distribution() + ) + for hp_name, hp in space.numericals.items() + } + # NOTE: It would be nice to somehow check if the prior given for + # a categorical was an index or a value in the categorical. + # Since it's much more efficient to hold on to the index, we will + # assume that for now. + categorical_distribution = { + hp_name: ( + WeightedIntsDistribution.with_favoured( + n=cat.size, + favoured=cat.index(p[0]), + confidence=p[1], + ) + if (p := prior.get(hp_name)) + else UniformIntDistribution.indices(cat.size) + ) + for hp_name, cat in space.categoricals.items() + } + return cls( + space, + _numerical_distributions=numerical_distributions, + _categorical_distributions=categorical_distribution, + ) diff --git a/neps/search_spaces/samplers/sampler.py b/neps/search_spaces/samplers/sampler.py new file mode 100644 index 00000000..f104a3a5 --- /dev/null +++ b/neps/search_spaces/samplers/sampler.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Mapping +from typing_extensions import Protocol + +if TYPE_CHECKING: + import numpy as np + + from neps.search_spaces.config import Config + from neps.utils.types import Number + + +@dataclass +class Sampler(Protocol): + def sample_configs( + self, + n: int, + *, + fidelity: Mapping[str, Number] | None, + seed: np.random.Generator, + ) -> list[Config]: ... 
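For orientation, here is a minimal usage sketch of the sampler API introduced above (not part of the patch itself). It assumes an already-constructed `SearchSpace` instance named `space`; the hyperparameter names `learning_rate` and `optimizer` are purely illustrative, and whether the categorical prior entry should be a value or an index is left open by the NOTE in prior.py, so a raw value is shown here.

    import numpy as np

    from neps.search_spaces.samplers import PriorSampler, UniformSampler

    rng = np.random.default_rng(0)

    # Uniform sampling over the numerical and categorical hyperparameters of `space`.
    uniform = UniformSampler.new(space)
    configs = uniform.sample_configs(10, fidelity=None, seed=rng)

    # Prior-weighted sampling: each entry maps a hyperparameter name to
    # (default, confidence); parameters without an entry fall back to uniform.
    prior = {"learning_rate": (1e-3, 0.8), "optimizer": ("adam", 0.5)}
    prior_sampler = PriorSampler.new(space, prior, replace_missing_with_uniform=True)
    prior_configs = prior_sampler.sample_configs(10, fidelity=None, seed=rng)
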
diff --git a/neps/search_spaces/samplers/uniform.py b/neps/search_spaces/samplers/uniform.py new file mode 100644 index 00000000..88060932 --- /dev/null +++ b/neps/search_spaces/samplers/uniform.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Mapping +from typing_extensions import Self, override + +from neps.search_spaces.config import Config +from neps.search_spaces.distributions.uniform_int import UniformIntDistribution +from neps.search_spaces.samplers.sampler import Sampler + +if TYPE_CHECKING: + import numpy as np + + from neps.search_spaces.distributions.distribution import Distribution + from neps.search_spaces.search_space import SearchSpace + + +@dataclass +class UniformSampler(Sampler): + search_space: SearchSpace + + _numerical_distributions: Mapping[str, Distribution] + _categorical_distributions: Mapping[str, Distribution] + + @override + def sample_configs( + self, + n: int, + *, + fidelity: Mapping[str, float] | None = None, + seed: np.random.Generator, + with_constants: bool = True, + ) -> list[Config]: + numerical_samples = {} + for k, dist in self._numerical_distributions.items(): + param = self.search_space.numericals[k] + numerical_samples[k] = dist.sample(n, to=param.domain, seed=seed) + + categorical_samples = {} + for k, dist in self._categorical_distributions.items(): + cat = self.search_space.categoricals[k] + domain = cat.domain + samples = dist.sample(n, to=domain, seed=seed) + choices = cat.lookup(samples) + categorical_samples[k] = choices + + graph_samples = {} + for k, v in self.search_space.graphs.items(): + graph_samples[k] = [v.sample() for _ in range(n)] + + _constants = self.search_space.constants if with_constants else {} + + return [ + Config( + { + **{k: samples[i] for k, samples in numerical_samples.items()}, + **{k: samples[i] for k, samples in categorical_samples.items()}, + **{k: samples[i] for k, samples in graph_samples.items()}, + **_constants, + }, + fidelity=fidelity, + ) + for i in range(n) + ] + + @classmethod + def new(cls, space: SearchSpace) -> Self: + numerical_distributions = { + k: p.domain.uniform_distribution() for k, p in space.numericals.items() + } + categorical_distribution = { + k: UniformIntDistribution.indices(p.size) + for k, p in space.categoricals.items() + } + return cls( + space, + _numerical_distributions=numerical_distributions, + _categorical_distributions=categorical_distribution, + ) diff --git a/neps/search_spaces/samplers/weighted_sampler.py b/neps/search_spaces/samplers/weighted_sampler.py new file mode 100644 index 00000000..32e51908 --- /dev/null +++ b/neps/search_spaces/samplers/weighted_sampler.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Mapping +from typing_extensions import Self, override + +import numpy as np + +from neps.search_spaces.samplers.sampler import Sampler +from neps.utils.types import Arr, Number, f64 + +if TYPE_CHECKING: + from neps.search_spaces.config import Config + + +@dataclass +class WeightedSampler(Sampler): + weights: dict[str, float] + samplers: dict[str, Sampler] + + _probabilities: Arr[f64] = field(init=False, repr=False, compare=False) + _samplers: Arr[np.str_] = field(init=False, repr=False, compare=False) + + def __post_init__(self): + probs = np.array(list(self.weights.values()), dtype=f64) + probs /= probs.sum() + self._probabilities = probs + self._samplers = np.asarray(sorted(self.samplers.keys()), 
dtype=np.str_) + + @override + def sample_configs( + self, + n: int, + *, + fidelity: Mapping[str, Number] | None, + seed: np.random.Generator, + ) -> list[Config]: + choices = seed.choice(self._samplers, size=n, p=self._probabilities) + keys, counts = np.unique(choices, return_counts=True) + + configs: list[Config] = [] + for key, count in zip(keys, counts): + sampler = self.samplers[key] + config_samples = sampler.sample_configs(count, fidelity=fidelity, seed=seed) + configs.extend(config_samples) + + return configs + + @classmethod + def equally_weighted(cls, samples: dict[str, Sampler]) -> Self: + return cls(weights={k: 1.0 for k in samples}, samplers=samples) diff --git a/neps/state/__init__.py b/neps/state/__init__.py index 6508dba2..7a85c7d4 100644 --- a/neps/state/__init__.py +++ b/neps/state/__init__.py @@ -5,6 +5,7 @@ VersionedResource, Versioner, ) +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo from neps.state.seed_snapshot import SeedSnapshot from neps.state.trial import Trial @@ -12,6 +13,9 @@ "Locker", "SeedSnapshot", "Synced", + "BudgetInfo", + "OptimizationState", + "OptimizerInfo", "Trial", "ReaderWriter", "Versioner", diff --git a/neps/state/optimizer.py b/neps/state/optimizer.py index f4000b07..bd8cbc2e 100644 --- a/neps/state/optimizer.py +++ b/neps/state/optimizer.py @@ -19,7 +19,6 @@ def remaining_cost_budget(self) -> float: return self.max_cost_budget - self.used_cost_budget def clone(self) -> BudgetInfo: - """Clone the budget info.""" return BudgetInfo( max_cost_budget=self.max_cost_budget, used_cost_budget=self.used_cost_budget, diff --git a/neps/state/trial.py b/neps/state/trial.py index 862e2bbb..3cd9b9c1 100644 --- a/neps/state/trial.py +++ b/neps/state/trial.py @@ -37,6 +37,10 @@ class State(Enum): CORRUPTED = "corrupted" UNKNOWN = "unknown" + def pending(self) -> bool: + """Return True if the trial is pending.""" + return self in (State.PENDING, State.SUBMITTED, State.EVALUATING) + @dataclass class MetaData: @@ -129,7 +133,7 @@ class Trial: MetaData: ClassVar = MetaData NotReportedYetError: ClassVar = NotReportedYetError - config: Mapping[str, Any] + config: dict[str, Any] metadata: MetaData state: State report: Report | None diff --git a/neps/utils/types.py b/neps/utils/types.py index a6b6c540..be1f103b 100644 --- a/neps/utils/types.py +++ b/neps/utils/types.py @@ -15,7 +15,7 @@ # TODO(eddiebergman): We can turn this to an enum at some # point to prevent having to isinstance and str match ERROR: TypeAlias = Literal["error"] -Number: TypeAlias = Union[int, float, np.number] +Number: TypeAlias = Union[int, float] ConfigID: TypeAlias = str RawConfig: TypeAlias = Mapping[str, Any] Metadata: TypeAlias = Dict[str, Any] From 27bb0a3bf67e7d4549cbc4b46e7bac3cbbbc5fe1 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 26 Aug 2024 15:15:52 +0200 Subject: [PATCH 11/63] refactor: Switch to botorch --- .gitignore | 2 +- .../acquisition_functions/ei.py | 17 +- .../grakel_replace/weisfeiler_lehman.py | 293 ++++++----- .../bayesian_optimization/kernels/kernel.py | 161 ------ .../kernels/vectorial_kernels.py | 112 ---- .../kernels/weisfilerlehman.py | 31 +- .../bayesian_optimization/models/__init__.py | 6 +- .../bayesian_optimization/models/gp.py | 484 +++++++++--------- .../bayesian_optimization/optimizer.py | 202 ++++---- .../optimizers/bayesian_optimization/sobol.py | 0 neps/runtime.py | 4 +- neps/search_spaces/domain.py | 45 +- neps/search_spaces/encoding.py | 262 +++++++--- neps/search_spaces/hyperparameters/float.py | 2 + 
neps/search_spaces/hyperparameters/integer.py | 2 + .../hyperparameters/numerical.py | 3 + neps_examples/basic_usage/hyperparameters.py | 17 +- 17 files changed, 768 insertions(+), 875 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/kernel.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py create mode 100644 neps/optimizers/bayesian_optimization/sobol.py diff --git a/.gitignore b/.gitignore index e8be93e7..09a1430c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -# Python +#False Python __pycache__ dist diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py index cc13cc8e..1a4e24d0 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py @@ -68,14 +68,11 @@ def eval( else: _x = x - try: - mu, cov = self.surrogate_model.predict(_x) - except ValueError as e: - raise e - # return -1.0 # in case of error. return ei of -1 + mu, cov = self.surrogate_model.predict(_x) std = torch.sqrt(torch.diag(cov)) mu_star = self.incumbent + gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device)) # u = (mu - mu_star - self.xi) / std # ei = std * updf + (mu - mu_star - self.xi) * ucdf @@ -88,7 +85,15 @@ def eval( ) * gauss.cdf(v - std) else: u = (mu_star - mu - self.xi) / std - ucdf = gauss.cdf(u) + try: + ucdf = gauss.cdf(u) + except ValueError as e: + print(f"u: {u}") # noqa: T201 + print(f"mu_star: {mu_star}") # noqa: T201 + print(f"mu: {mu}") # noqa: T201 + print(f"std: {std}") # noqa: T201 + print(f"diag: {cov.diag()}") # noqa: T201 + raise e updf = torch.exp(gauss.log_prob(u)) ei = std * updf + (mu_star - mu - self.xi) * ucdf if self.augmented_ei: diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py index f10e406f..8c4baf64 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py @@ -1,5 +1,7 @@ """The weisfeiler lehman kernel :cite:`shervashidze2011weisfeiler`.""" +from __future__ import annotations + import collections import collections.abc import logging @@ -42,7 +44,7 @@ class WeisfeilerLehman(Kernel): {'node_name1': weight1, 'node_name2': weight2 ... } Must be of the same length as the number of different node attributes - Attributes + Attributes: ---------- X : dict Holds a dictionary of fitted subkernel modules for all levels. 
@@ -101,9 +103,9 @@ def __init__( self.X = None self._X_diag = None - self.X_fit = dict() - self.K_precomputed = dict() - self.base_graph_kernel_precomputed = dict() + self.X_fit = {} + self.K_precomputed = {} + self.base_graph_kernel_precomputed = {} def initialize(self): """Initialize all transformer arguments, needing initialization.""" @@ -111,26 +113,37 @@ def initialize(self): if not self._initialized["base_graph_kernel"]: base_graph_kernel = self.base_graph_kernel if base_graph_kernel is None: - base_graph_kernel, params = VertexHistogram, dict() + base_graph_kernel, params = VertexHistogram, {} # TODO: make sure we're always passing like this elif type(base_graph_kernel) is type and issubclass( # pylint: disable=C0123 base_graph_kernel, Kernel ): - params = dict() + params = {} else: try: base_graph_kernel, params = base_graph_kernel except Exception as _error: - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError( + "Base kernel was not formulated in " + "the correct way. " + "Check documentation." + ) from _error if not ( - type(base_graph_kernel) - is type # pylint: disable=C0123 + type(base_graph_kernel) is type # pylint: disable=C0123 and issubclass(base_graph_kernel, Kernel) ): - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError( + "The first argument must be a valid " + "grakel.kernel.kernel Object" + ) if not isinstance(params, dict): - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError( + "If the second argument of base " + "kernel exists, it must be a diction" + "ary between parameters names and " + "values" + ) params.pop("normalize", None) params["normalize"] = False @@ -141,7 +154,9 @@ def initialize(self): if not self._initialized["h"]: if not isinstance(self.h, int) or self.h < 0: - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError( + "'h' must be a non-negative integer. Got h:" + str(self.h) + ) self._h = self.h + 1 self._initialized["h"] = True @@ -181,7 +196,7 @@ def parse_input( gp_fit: bool If False use precomputed vals for first N values, else compute them and save them - Returns + Returns: ------- base_graph_kernel : object Returns base_graph_kernel. 
@@ -191,34 +206,34 @@ def parse_input( """ if self._method_calling not in [1, 2]: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError( + "method call must be called either from fit " + "or fit-transform" + ) elif hasattr(self, "_X_diag"): # Clean _X_diag value delattr(self, "_X_diag") # skip kernel computation if we have already computed the corresponding kernel - if self._h in self.K_precomputed.keys() and self.X_fit[self._h] == X: + if self._h in self.K_precomputed and self.X_fit[self._h] == X: K = self.K_precomputed[self._h] base_graph_kernel = self.base_graph_kernel_precomputed[self._h] else: # Input validation and parsing if not isinstance(X, collections.abc.Iterable): - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError("input must be an iterable\n") else: nx = 0 - Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict() + Gs_ed, L, distinct_values, extras = {}, {}, set(), {} for idx, x in enumerate(iter(X)): is_iter = isinstance(x, collections.abc.Iterable) if is_iter: x = list(x) if is_iter and (len(x) == 0 or len(x) >= 2): if len(x) == 0: - warnings.warn( - "Ignoring empty element on index: " + str(idx) - ) + warnings.warn("Ignoring empty element on index: " + str(idx)) continue elif len(x) > 2: - extra = tuple() + extra = () if len(x) > 3: extra = tuple(x[3:]) x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) @@ -228,10 +243,11 @@ def parse_input( label_type="edge", return_none=True, ), - ) + extra + *extra, + ) else: x = Graph(x[0], x[1], {}, graph_format=self._graph_format) - extra = tuple() + extra = () elif isinstance(x, Graph): x.desired_format(self._graph_format) @@ -240,20 +256,22 @@ def parse_input( label_type="edge", return_none=True, ) - if el is None: - extra = tuple() - else: - extra = (el,) + extra = () if el is None else (el,) else: - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError( + "each element of X must be either a " + + "graph object or a list with at least " + + "a graph like object and node labels " + + "dict \n" + ) Gs_ed[nx] = x.get_edge_dictionary() L[nx] = x.get_labels(purpose="dictionary") extras[nx] = extra - NOT_YET_IMPLEMENTED_StmtAugAssign - NOT_YET_IMPLEMENTED_StmtAugAssign + distinct_values |= set(L[nx].values()) + nx += 1 if nx == 0: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("parsed input is empty") # Save the number of "fitted" graphs. 
self._nx = nx @@ -261,70 +279,75 @@ def parse_input( # assign a number to each label label_count = 0 - for dv in sorted(list(distinct_values)): + for dv in sorted(distinct_values): WL_labels_inverse[dv] = label_count - NOT_YET_IMPLEMENTED_StmtAugAssign + label_count += 1 # Initalize an inverse dictionary of labels for all iterations - self._inv_labels = OrderedDict() # Inverse dictionary of labels, in term of the *previous layer* + self._inv_labels = ( + OrderedDict() + ) # Inverse dictionary of labels, in term of the *previous layer* self._inv_labels[0] = deepcopy(WL_labels_inverse) - self.feature_dims.append(len(WL_labels_inverse)) # Update the zeroth iteration feature dim - - self._inv_label_node_attr = OrderedDict() # Inverse dictionary of labels, in term of the *node attribute* - self._label_node_attr = OrderedDict() # Same as above, but with key and value inverted - ( - self._label_node_attr[0], - self._inv_label_node_attr[0], - ) = self.translate_label(WL_labels_inverse, 0) + self.feature_dims.append( + len(WL_labels_inverse) + ) # Update the zeroth iteration feature dim + + self._inv_label_node_attr = ( + OrderedDict() + ) # Inverse dictionary of labels, in term of the *node attribute* + self._label_node_attr = ( + OrderedDict() + ) # Same as above, but with key and value inverted + self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label( + WL_labels_inverse, 0 + ) if self.node_weights is not None: self._feature_weight = OrderedDict() # Ensure the order is the same self._feature_weight[0] = self._compute_feature_weight( self.node_weights, 0, WL_labels_inverse - )[ - 1 - ] + )[1] else: self._feature_weight = None def generate_graphs(label_count: int, WL_labels_inverse): - new_graphs = list() + new_graphs = [] for j in range(self._nx): - new_labels = dict() - for k in L[j].keys(): + new_labels = {} + for k in L[j]: new_labels[k] = WL_labels_inverse[L[j][k]] L[j] = new_labels # add new labels new_graphs.append((Gs_ed[j], new_labels) + extras[j]) - NOT_YET_IMPLEMENTED_ExprYield + yield new_graphs for i in range(1, self._h): - label_set, WL_labels_inverse, L_temp = set(), dict(), dict() + label_set, WL_labels_inverse, L_temp = set(), {}, {} for j in range(nx): # Find unique labels and sort # them for both graphs # Keep for each node the temporary - L_temp[j] = dict() - for v in Gs_ed[j].keys(): - credential = str(L[j][v]) + "," + str( - sorted( - (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []) - ) + L_temp[j] = {} + for v in Gs_ed[j]: + credential = ( + str(L[j][v]) + + "," + + str(sorted(L[j][n] for n in Gs_ed[j][v])) ) L_temp[j][v] = credential label_set.add(credential) - label_list = sorted(list(label_set)) + label_list = sorted(label_set) for dv in label_list: WL_labels_inverse[dv] = label_count - NOT_YET_IMPLEMENTED_StmtAugAssign + label_count += 1 # Recalculate labels - new_graphs = list() + new_graphs = [] for j in range(nx): - new_labels = dict() - for k in L_temp[j].keys(): + new_labels = {} + for k in L_temp[j]: new_labels[k] = WL_labels_inverse[L_temp[j][k]] L[j] = new_labels # relabel @@ -344,11 +367,9 @@ def generate_graphs(label_count: int, WL_labels_inverse): if self.node_weights is not None: self._feature_weight[i] = self._compute_feature_weight( self.node_weights, i, self._inv_label_node_attr[i] - )[ - 1 - ] + )[1] # assert len(self._feature_weight[i] == len(WL_labels_inverse)) - NOT_YET_IMPLEMENTED_ExprYield + yield new_graphs # Initialise the base graph kernel. 
base_graph_kernel = {} @@ -397,6 +418,7 @@ def generate_graphs(label_count: int, WL_labels_inverse): K = torch.stack(K, dim=0).sum(dim=0) return K, base_graph_kernel return np.sum(K, axis=0), base_graph_kernel + return None def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: disable=unused-argument """Fit and transform, on the same dataset. @@ -414,7 +436,7 @@ def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: di y : Object, default=None Ignored argument, added for the pipeline. - Returns + Returns: ------- K : numpy array, shape = [n_targets, n_input_graphs] corresponding to the kernel matrix, a calculation between @@ -428,7 +450,7 @@ def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: di 0, ] # Flush the feature dimensions if X is None: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("transform input cannot be None") else: km, self.X = self.parse_input(X, gp_fit=gp_fit) @@ -450,7 +472,8 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): return_embedding_only: bool Whether to return the embedding of the graphs only, instead of computing the kernel all the way to the end. - Returns + + Returns: ------- K : numpy array, shape = [n_targets, n_input_graphs] corresponding to the kernel matrix, a calculation between @@ -463,13 +486,13 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): # Input validation and parsing if X is None: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("transform input cannot be None") elif not isinstance(X, collections.abc.Iterable): - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("input must be an iterable\n") else: nx = 0 distinct_values = set() - Gs_ed, L = dict(), dict() + Gs_ed, L = {}, {} for i, x in enumerate(iter(X)): is_iter = isinstance(x, collections.abc.Iterable) if is_iter: @@ -484,25 +507,32 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): elif isinstance(x, Graph): x.desired_format("dictionary") else: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError( + "each element of X must have at " + + "least one and at most 3 elements\n" + ) Gs_ed[nx] = x.get_edge_dictionary() L[nx] = x.get_labels(purpose="dictionary") # Hold all the distinct values - NOT_YET_IMPLEMENTED_StmtAugAssign - NOT_YET_IMPLEMENTED_StmtAugAssign + distinct_values |= { + v for v in L[nx].values() if v not in self._inv_labels[0] + } + nx += 1 if nx == 0: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("parsed input is empty") nl = len(self._inv_labels[0]) - WL_labels_inverse = {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict} + WL_labels_inverse = { + dv: idx for (idx, dv) in enumerate(sorted(distinct_values), nl) + } WL_labels_inverse = OrderedDict(WL_labels_inverse) def generate_graphs_transform(WL_labels_inverse, nl): # calculate the kernel matrix for the 0 iteration - new_graphs = list() + new_graphs = [] for j in range(nx): - new_labels = dict() + new_labels = {} for k, v in L[j].items(): if v in self._inv_labels[0]: new_labels[k] = self._inv_labels[0][v] @@ -511,37 +541,35 @@ def generate_graphs_transform(WL_labels_inverse, nl): L[j] = new_labels # produce the new graphs new_graphs.append([Gs_ed[j], new_labels]) - NOT_YET_IMPLEMENTED_ExprYield + yield new_graphs for i in range(1, self._h): - new_graphs = list() - L_temp, label_set = dict(), set() - NOT_YET_IMPLEMENTED_StmtAugAssign + new_graphs = [] + L_temp, label_set = {}, set() + nl += len(self._inv_labels[i]) for j in range(nx): # Find 
unique labels and sort them for both graphs # Keep for each node the temporary - L_temp[j] = dict() - for v in Gs_ed[j].keys(): - credential = str(L[j][v]) + "," + str( - sorted( - (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []) - ) + L_temp[j] = {} + for v in Gs_ed[j]: + credential = ( + str(L[j][v]) + "," + str(sorted(L[j][n] for n in Gs_ed[j][v])) ) L_temp[j][v] = credential if credential not in self._inv_labels[i]: label_set.add(credential) # Calculate the new label_set - WL_labels_inverse = dict() + WL_labels_inverse = {} if len(label_set) > 0: - for dv in sorted(list(label_set)): + for dv in sorted(label_set): idx = len(WL_labels_inverse) + nl WL_labels_inverse[dv] = idx # Recalculate labels - new_graphs = list() + new_graphs = [] for j in range(nx): - new_labels = dict() + new_labels = {} for k, v in L_temp[j].items(): if v in self._inv_labels[i]: new_labels[k] = self._inv_labels[i][v] @@ -550,7 +578,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): L[j] = new_labels # Create the new graphs with the new labels. new_graphs.append([Gs_ed[j], new_labels]) - NOT_YET_IMPLEMENTED_ExprYield + yield new_graphs if return_embedding_only: K = [] @@ -567,11 +595,29 @@ def generate_graphs_transform(WL_labels_inverse, nl): # Calculate the kernel matrix without parallelization if self.as_tensor: - summand = [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] + summand = [ + self.layer_weights[i] + * self.X[i].transform( + g, + label_start_idx=self.feature_dims[i], + label_end_idx=self.feature_dims[i + 1], + ) + for i, g in enumerate(generate_graphs_transform(WL_labels_inverse, nl)) + ] K = torch.stack(summand, dim=0).sum(dim=0) else: K = np.sum( - (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []), + ( + self.layer_weights[i] + * self.X[i].transform( + g, + label_start_idx=self.feature_dims[i], + label_end_idx=self.feature_dims[i + 1], + ) + for (i, g) in enumerate( + generate_graphs_transform(WL_labels_inverse, nl) + ) + ), axis=0, ) @@ -580,7 +626,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): X_diag, Y_diag = self.diagonal() if self.as_tensor: div_ = torch.sqrt(torch.ger(Y_diag, X_diag)) - NOT_YET_IMPLEMENTED_StmtAugAssign + K /= div_ else: old_settings = np.seterr(divide="ignore") K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag)))) @@ -598,7 +644,7 @@ def diagonal(self): ---------- None. - Returns + Returns: ------- X_diag : np.array The diagonal of the kernel matrix, of the fitted data. 
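As an aside on the normalization step above: `np.divide(K, np.sqrt(np.outer(Y_diag, X_diag)))` rescales every cross-kernel entry by the self-similarities of the two graphs involved, so each normalized entry has magnitude at most 1 by Cauchy-Schwarz. A tiny self-contained numpy illustration (not part of the patch):

    import numpy as np

    # Cross-kernel between 2 test graphs (rows) and 2 training graphs (columns).
    K = np.array([[4.0, 2.0], [1.0, 3.0]])
    Y_diag = np.array([4.0, 5.0])  # k(test_i, test_i)
    X_diag = np.array([4.0, 3.0])  # k(train_j, train_j)

    K_norm = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))
    # With these numbers K_norm[0, 0] == 1.0 and every other entry lies
    # strictly between 0 and 1.
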
@@ -616,7 +662,7 @@ def diagonal(self): if self._is_transformed: Y_diag = self.X[0].diagonal()[1] for i in range(1, self._h): - NOT_YET_IMPLEMENTED_StmtAugAssign + Y_diag += self.X[i].diagonal()[1] except NotFittedError: # Calculate diagonal of X if self._is_transformed: @@ -625,8 +671,8 @@ def diagonal(self): X_diag.flags.writeable = True for i in range(1, self._h): x, y = self.X[i].diagonal() - NOT_YET_IMPLEMENTED_StmtAugAssign - NOT_YET_IMPLEMENTED_StmtAugAssign + X_diag += x + Y_diag += y self._X_diag = X_diag # case sub kernel is only fitted @@ -635,7 +681,7 @@ def diagonal(self): X_diag.flags.writeable = True for i in range(1, self._n_iter): x = self.X[i].diagonal() - NOT_YET_IMPLEMENTED_StmtAugAssign + X_diag += x self._X_diag = X_diag if self.as_tensor: @@ -648,42 +694,39 @@ def diagonal(self): return self._X_diag @staticmethod - def translate_label(curr_layer: dict, h: int, prev_layer: dict = None): + def translate_label(curr_layer: dict, h: int, prev_layer: dict | None = None): """Translate the label to be in terms of the node attributes curr_layer: the WL_label_inverse object. A dictionary with element of the format of - {pattern: encoding} + {pattern: encoding}. - return: + Return: label_in_node_attr: in terms of {encoding: pattern}, but pattern is always in term of the node attribute inv_label_in_node_attr: in terms of {pattern: encoding} """ if h == 0: - return ( - {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict}, - curr_layer, - ) + return {v: str(k) for k, v in curr_layer.items()}, curr_layer else: - NOT_YET_IMPLEMENTED_StmtAssert + assert prev_layer is not None label_in_node_attr, inv_label_in_node_attr = OrderedDict(), OrderedDict() for pattern, encoding in curr_layer.items(): # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer root, leaf = literal_eval(pattern) root_ = prev_layer[root] - leaf_ = [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] - label_in_node_attr.update({encoding: "~".join([root_] + leaf_)}) - inv_label_in_node_attr.update({"~".join([root_] + leaf_): encoding}) + leaf_ = [prev_layer[i] for i in leaf] + label_in_node_attr.update({encoding: "~".join([root_, *leaf_])}) + inv_label_in_node_attr.update({"~".join([root_, *leaf_]): encoding}) return label_in_node_attr, inv_label_in_node_attr @staticmethod def _compute_feature_weight( node_weight: OrderedDict, h: int, inv_label_node_attr: OrderedDict ): - """ - Compute the feature weight, based on the average weight of the constituent node attributes. + """Compute the feature weight, based on the average weight of the constituent node attributes. + Return: feature_weights: a dictionary with h layers, each of which is a dictionary of the format of - {tuple1: weight1; tuplr2, weight2 ...} where tuplex is the tuple representation of the learned graph feature + {tuple1: weight1; tuplr2, weight2 ...} where tuplex is the tuple representation of the learned graph feature. feature_weight_flattened: same as above, but in a flattened np format. 
""" @@ -691,29 +734,25 @@ def _compute_feature_weight( feature_weights_flattened = [] if h == 0: feature_weight = OrderedDict( - {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict} + {k: (node_weight[k]) ** 2 for k in inv_label_node_attr} ) - feature_weights_flattened = np.array( - list(feature_weight.values()) - ).flatten() + feature_weights_flattened = np.array(list(feature_weight.values())).flatten() else: for k, _ in inv_label_node_attr.items(): # k is the pattern, v is the encoding k_sep = k.split("~") - average_weight = np.mean( - [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] - ) + average_weight = np.mean([(node_weight[i]) ** 2 for i in k_sep]) feature_weights.update({k: average_weight}) feature_weights_flattened.append(average_weight) feature_weights_flattened = np.array(feature_weights_flattened).flatten() - NOT_YET_IMPLEMENTED_StmtAssert + assert len(feature_weights_flattened) == len(inv_label_node_attr) return feature_weights, feature_weights_flattened def dK_dX(self, X_test: None): - """ - Do additional forward and backward pass, compute the kernel derivative wrt the testing location. - If no test locations are provided, the derivatives are evaluated at the training points - Returns + """Do additional forward and backward pass, compute the kernel derivative wrt the testing location. + If no test locations are provided, the derivatives are evaluated at the training points. + + Returns. ------- """ diff --git a/neps/optimizers/bayesian_optimization/kernels/kernel.py b/neps/optimizers/bayesian_optimization/kernels/kernel.py deleted file mode 100644 index 42382a51..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/kernel.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import annotations - -import copy -import inspect -import math -from abc import ABC, abstractmethod -from typing import Any, ClassVar, Generic, Mapping, Sequence, TypeVar -from typing_extensions import Self - -import torch -from torch import nn - -from neps.utils.types import NotSet - -T = TypeVar("T") - - -class Kernel(ABC, nn.Module, Generic[T]): - suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] - - def __init__(self) -> None: - super().__init__() - - @abstractmethod - def as_optimizable(self) -> Self: ... 
- - @abstractmethod - def forward(self, x: T, x2: T | None = None) -> torch.Tensor: - raise NotImplementedError - - def clone(self) -> Self: - return self.clone_with() - - def clone_with(self, **params: Any) -> Self: - # h ttps://github.com/scikit-learn/scikit-learn/blob/70fdc843a4b8182d97a3508c1a426acc5e87e980/sklearn/base.py#L197 - sig = inspect.signature(self.__init__) - - self_values = {} - for p in sig.parameters.values(): - if p.name == "self": - continue - - attr = getattr(self, p.name, NotSet) - if attr is NotSet: - raise ValueError( - f"Could not clone as the variable {p.name} was not set in" - f" the constructor on the object: {self}" - ) - self_values[p.name] = params.get(p.name, attr) - - new_self_values = copy.deepcopy(self_values) - return self.__class__(**new_self_values) - - def grid_search( - self, - x: T, - y: torch.Tensor, - *, - grid: Sequence[Mapping[str, Any]] | None = None, - noise_variances: Sequence[float] = (1e-6,), - ) -> tuple[Self, float]: - # Returns: (Kernel[T], float) | None if failed - grid = grid or self.suggested_grid - - def _fit_and_eval( - _params: Mapping[str, Any], - ) -> tuple[Kernel[T], float] | Exception: - cloned_kernel = self.clone_with(**_params) - K = cloned_kernel.forward(x) - - best_lml = -float("inf") - for noise_variance in noise_variances: - K.diag().add_(noise_variance) - - K_inv, logDetK = compute_pd_inverse(K) - lml = log_marginal_likelihood(K_inv, logDetK, y).item() - if lml > best_lml: - best_lml = lml - - K.diag().sub_(noise_variance) - - return cloned_kernel, best_lml - - evals = [_fit_and_eval(params) for params in grid] - evals_with_score = [e for e in evals if not isinstance(e, Exception)] - if not any(evals_with_score): - raise evals[-1] # type: ignore - - return max(evals_with_score, key=lambda e: e[1]) # type: ignore - - -class NumericKernel(Kernel[torch.Tensor]): ... - - -TWO_LOG_2_PI = 2 * torch.log(torch.tensor(2 * math.pi)) - - -def log_marginal_likelihood( - K_inv: torch.Tensor, - logDetK: torch.Tensor, - y: torch.Tensor, -) -> torch.Tensor: - # y.T @ K_inv @ y --- Benchmarked to be twice as fast - quad_form = torch.matmul(y, torch.matmul(K_inv, y)) - n = y.shape[0] - - # TODO: We can drop the `n / 2 * TWO_LOG_2_PI` term for the grid - # search above as it's constant between the different kernel grids - # as it's purely data dependant with the `n` - return -0.5 * quad_form + 0.5 * logDetK - n / TWO_LOG_2_PI - - -class _CholeskyError(RuntimeError): - """Raised when the Cholesky decomposition fails.""" - - -# https://github.com/cornellius-gp/linear_operator/blob/eec70f9e1cd9106c32b05a3e774ea29d00d71cea/linear_operator/utils/cholesky.py#L12 -def _cholesky_routine( - K: torch.Tensor, - jitter: float | torch.Tensor = 1e-6, - max_tries: int = 4, -) -> torch.Tensor: - L, info = torch.linalg.cholesky_ex(K) - if not torch.any(info): - return L - - # Clone as we will modify in place, still cheaper - # than creating a new full tensor for identity. - K_prime = K.clone() - jitter_prev = 0 - for i in range(max_tries): - jitter_new = jitter * (10**i) - K_prime.diagonal().add_(jitter_new - jitter_prev) - L, info = torch.linalg.cholesky_ex(K_prime) - if not torch.any(info): - return L - - jitter_prev = jitter_new - - raise _CholeskyError("Failed to compute Cholesky decomposition.") - - -def compute_pd_inverse(K: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - # Adding noise to the diagonal of K helps with numerical stability - # when K is singular or near-singular, (i.e. 
it helps K be more "positive") which - # is required for the decomposition. - - try: - # L @ L.T = K_inv --- solves for L - L = _cholesky_routine(K) - logDetK = 2 * torch.sum(torch.log(torch.diag(L))) - - # K_inv = L_inv @ L_inv.T --- Efficiently solve for K_inv using just L - K_inv = torch.cholesky_inverse(L) - except _CholeskyError: - # If we fail to compute the Cholesky decomposition, - # then just compute the inverse directly. - K_inv = torch.linalg.inv(K) - logDetK = torch.linalg.slogdet(K)[1] - - return K_inv, logDetK diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py deleted file mode 100644 index 07b56333..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -from abc import ABC -from itertools import product -from math import sqrt -from typing import Any, ClassVar, Mapping, Sequence -from typing_extensions import Self, override - -import torch -from torch import nn - -from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel - -# TODO: -# We should try some variations of singular length scales -# (1 scale shared across all dimensions) -# and individual ARD lengthscales (1 for each dimension) -# ARD can overfit if not properly tuned... -LENGTHSCALE_GRID = (1e-2, 1e-1, 1, 1e1, 1e2) -STD_ENCODED_OUTPUT_SCALE = (1e-2, 1e-1, 1, 1e1, 1e2) - - -class NumericKernel(Kernel[torch.Tensor], ABC): - suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ - {"lengthscale": _l, "output_scale": o} - for _l, o in product(LENGTHSCALE_GRID, STD_ENCODED_OUTPUT_SCALE) - ] - - def __init__( - self, - *, - lengthscale: torch.Tensor | None = None, - outputscale: torch.Tensor | None = None, - lengthscale_bounds: tuple[float, float] | None = (1e-2, 1e2), - outputscale_bounds: tuple[float, float] | None = (1e-2, 1e2), - device: torch.device | None = None, - ): - super().__init__() - self.lengthscale = ( - torch.as_tensor(lengthscale, dtype=torch.float64, device=device) - if lengthscale is not None - else torch.tensor(1, dtype=torch.float64, device=device) - ) - self.outputscale = ( - torch.as_tensor(outputscale, dtype=torch.float64, device=device) - if outputscale is not None - else torch.tensor(1, dtype=torch.float64, device=device) - ) - self.lengthscale_bounds = lengthscale_bounds - self.outputscale_bounds = outputscale_bounds - self.device = device - - self.train_: torch.Tensor | None = None - - def as_optimizable(self) -> Self: - return self.clone_with( - lengthscale=nn.Parameter(self.lengthscale), - outputscale=nn.Parameter(self.outputscale), - ) - - def forward(self, x: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: - # NOTE: I don't think this is the right way to do this... - if self.lengthscale_bounds is not None or self.outputscale_bounds is not None: - with torch.no_grad(): - if self.lengthscale_bounds is not None: - self.lengthscale.data.clamp_(*self.lengthscale_bounds) - if self.outputscale_bounds is not None: - self.outputscale.data.clamp_(*self.outputscale_bounds) - - x2 = x if x2 is None else x2 - return self._forward(x, x2) - - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: ... 
- - -class Stationary(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - return self.outputscale * torch.cdist(x1, x2, p=2) / self.lengthscale - - -class RBFKernel(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - dist_sq = torch.cdist(x1, x2, p=2) ** 2 - return self.outputscale * torch.exp(-dist_sq / (2 * self.lengthscale**2)) - - -class Matern32Kernel(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - dist = torch.cdist(x1, x2, p=2) / self.lengthscale - factor = sqrt(3.0) * dist - matern32 = (1 + factor) * torch.exp(-factor) - return self.outputscale * matern32 - - -class HammingKernel(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - dists = (x1.unsqueeze(1) != x2.unsqueeze(0)).float().sum(-1) / x1.shape[-1] - scaled_dists = dists / self.lengthscale - return self.outputscale * torch.exp(-scaled_dists) - - -class Matern52Kernel(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - dist = torch.cdist(x1, x2, p=2) / self.lengthscale - factor = sqrt(5.0) * dist - matern52 = (1 + factor + (factor**2) / 3) * torch.exp(-factor) - return self.outputscale * matern52 diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py index 8c1feb26..44e8b8e1 100644 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py @@ -1,7 +1,6 @@ from __future__ import annotations -from itertools import product -from typing import Any, ClassVar, Mapping, Sequence +from typing import TYPE_CHECKING from typing_extensions import Self import numpy as np @@ -14,20 +13,29 @@ WeisfeilerLehman as _WL, ) from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import NumericKernel + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( + NumericKernel, + ) GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)]) GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) +def normal_prior(param: torch.Tensor, mean: float, std: float) -> torch.Tensor: + return -0.5 * torch.sum(((param - mean) / std) ** 2) - torch.sum( + torch.log(std * torch.sqrt(2 * torch.tensor(np.pi))) + ) + + +def kernel_hp_prior(params: dict[str, nn.Parameter]) -> torch.Tensor: + return normal_prior(params["layer_weights"], mean=0, std=1) + + class WeisfilerLehman(Kernel[npt.NDArray[np.object_]]): """Weisfiler Lehman kernel using grakel functions.""" - suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ - {"h": h, "se_kernel": NumericKernel(lengthscale=l)} - for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) - ] - def __init__( self, *, @@ -46,11 +54,12 @@ def __init__( vector embedding inner products are computed). If None, uses the default linear kernel layer_weights: The weights for each layer of the Weisfeiler-Lehman kernel. - If None, uses uniform + If None, uses uniform 1s oa: whether the optimal assignment variant of the Weisfiler-Lehman kernel should be used node_label: the node_label defining the key node attribute. 
""" + super().__init__(hyperparameter_prior=kernel_hp_prior) if se_kernel is not None and oa: raise ValueError( "Only one or none of se (successive embedding) and oa (optimal assignment) may be true!" @@ -72,7 +81,6 @@ def as_optimizable(self) -> Self: return self.clone_with(layer_weights=nn.Parameter(self.layer_weights)) def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: - self.layer_weights.clamp_(0, 1) self.wl_kernel_ = _WL( h=self.h, base_graph_kernel=( # type: ignore @@ -84,7 +92,7 @@ def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: "requires_ordered_features": True, }, ), - layer_weights=self.layer_weights, + layer_weights=self.layer_weights / self.layer_weights.sum(), normalize=True, ) @@ -93,7 +101,6 @@ def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: def transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: assert self.wl_kernel_ is not None - self.layer_weights.clamp_(0, 1) K = self.wl_kernel_.transform(iter(gr)) return torch.as_tensor(K, dtype=torch.float64) diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 6ce65b61..5e40df9d 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,7 +1,5 @@ from neps.utils.common import MissingDependencyError -from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP - try: from neps.optimizers.models.deepGP import DeepGP except ImportError as e: @@ -14,6 +12,8 @@ SurrogateModelMapping = { "deep_gp": DeepGP, - "gp": ComprehensiveGP, + "gp": MissingDependencyError( + "Removed for now", NotImplementedError("GP is not implemented") + ), "pfn": PFN_SURROGATE, } diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index e63c033f..ab2884f3 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,261 +1,269 @@ from __future__ import annotations import logging -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence +import math +from typing import TYPE_CHECKING, Any, Mapping, TypeVar -import numpy as np +import gpytorch +import gpytorch.constraints import torch -from torch import nn -from torch.optim import SGD, Adam # type: ignore +from botorch.acquisition.analytic import SingleTaskGP +from botorch.models import MixedSingleTaskGP +from botorch.models.gp_regression_mixed import CategoricalKernel +from botorch.models.transforms.outcome import Standardize +from botorch.optim import optimize_acqf, optimize_acqf_mixed +from gpytorch.kernels import MaternKernel, ScaleKernel -from neps.optimizers.bayesian_optimization.kernels.kernel import ( - Kernel, - compute_pd_inverse, - log_marginal_likelihood, -) -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( - HammingKernel, - Matern52Kernel, - NumericKernel, -) -from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import ( - WeisfilerLehman, -) -from neps.search_spaces import SearchSpace from neps.search_spaces.encoding import ( CategoricalToIntegerTransformer, + DataEncoder, DataPack, - MinMaxNormalizer, - OneHotEncoder, - TensorTransformer, - Transformer, - WLInputTransformer, ) -from neps.search_spaces.hyperparameters import FloatParameter, IntegerParameter if TYPE_CHECKING: - from neps.search_spaces.search_space import SearchSpace + 
from botorch.acquisition import AcquisitionFunction logger = logging.getLogger(__name__) -# The optimization we do for the noise is relatively cheap while the matrices -NOISE_VARIANCE_GRID = (1e-6, 1e-4, 1e-2, 1, 1e1, 1e2) - - -@dataclass -class ComprehensiveGP: - space: SearchSpace - kernels: dict[str, tuple[Sequence[str], Kernel]] - - combined_kernel: Literal["sum", "product"] = "sum" - noise_variance: Sequence[float] = NOISE_VARIANCE_GRID - kernel_parameter_grid: Mapping[str, Sequence[Mapping[str, Any]]] | bool = True - - optimizer: Literal["adam", "sgd"] = "adam" - optimizer_kwargs: Mapping[str, Any] = field(default_factory=lambda: {"lr": 0.1}) - optimizer_iters: int = 20 - device: torch.device | None = None - - # Post fit attributes - K_inv_: torch.Tensor | None = None - likelihood_: float | None = None - y_: torch.Tensor | None = None - y_normalized_: torch.Tensor | None = None - y_mean_: float | None = None - y_std_: float | None = None - opt_kernels_: dict[str, tuple[Sequence[str], Kernel]] | None = None - train_x_: DataPack | None = None - - def __post_init__(self): - # TODO: Remove when search space is just definition and does not hold values. - self.space = self.space.clone() - - def fit( - self, - *, - x: DataPack, - train_y: torch.Tensor, - ) -> None: - # Preprocessing - y_ = torch.as_tensor(train_y, device=self.device, dtype=torch.float64) - - # TODO: Dunno if I like this silent hack, setting std to 1 if no std - self.y_std_ = s if (s := torch.std(y_).item()) != 0 else 1 - self.y_mean_ = torch.mean(y_).item() - self.y_normalized_ = (y_ - self.y_mean_) / self.y_std_ - self.y_ = y_ - - # optimized kernel parameters + noise variance - optim_vars: list[nn.Parameter] = [] - opt_kernels: dict[str, tuple[Sequence[str], Kernel]] = {} - - N: int - for _kernel_name, (hps, kernel) in self.kernels.items(): - data = x.select(hps) - opt_kernel, _ = kernel.grid_search( - x=data, # type: ignore - y=self.y_normalized_, - ) - optim_vars.extend(opt_kernel.parameters()) - opt_kernels[_kernel_name] = (hps, opt_kernel) - - # Now that we've optimized the kernels, we convert go convert their - # parameters into a tensor we can further refine with some optimizer iterations - # - Optimize kernel-lengthscales, kernel-outputscale, noise-variance - # and any additional parameters they wish to advertise. - noise_variance = nn.Parameter( - torch.tensor(1e-3, device=self.device, dtype=torch.float64) +T = TypeVar("T") + + +def default_likelihood_with_prior() -> gpytorch.likelihoods.GaussianLikelihood: + # The effect of the likelihood of noise is pretty crucial w.r.t. + # whether we are going to overfit every point by overfitting with + # the lengthscale, or whether we smooth through and assume variation + # is due to noise. Setting it's prior is hard. For a non-noisy + # function, we'd want it looooowww, like 1e-8 kind of low. For + # even a 0.01% noise, we need that all the way up to 1e-2. Hence + # + # If we had 10% noise and we allow the noise to easily optimize towards + # 1e-8, then the lengthscales are forced to beome very small, essentially + # overfitting. If we have 0% noise and we don't allow it to easily get low + # then we will drastically underfit. + # A guiding principle here is that we should allow the noise to be just + # as if not slightly easier to tune than the lengthscales. I.e. we prefer + # smoother functions as it is easier to acquisition over. However once we + # over smooth and underfit, any new observations that inform us otherwise + # could just be attributed to noise. 
+ # + # TODO: We may want to move the likelihood inside the GP and decay the + # amount the GP can attribute to noise (reduce std and mean) relative + # to samples seen, effectively reducing the smoothness of the GP over time + noise_mean = 1e-2 + noise_std = math.sqrt(3) + _noise_prior = gpytorch.priors.LogNormalPrior( + math.log(noise_mean) + noise_std**2, + noise_std, + ) + return gpytorch.likelihoods.GaussianLikelihood( + noise_prior=_noise_prior, + # Going below 1e-6 could introduce a lot of numerical instability in the + # kernels, even if it's a noiseless function + noise_constraint=gpytorch.constraints.Interval( + lower_bound=1e-6, + upper_bound=1, + initial_value=noise_mean, + ), + ) + + +def default_signal_variance_prior() -> gpytorch.priors.NormalPrior: + # The outputscale prior is a bit more tricky. Essentially + # it describes how much we expect the function to move + # around the mean (0 as we normalize the `ys`) + # Based on `Vanilla GP work great in High Dimensions` by Carl Hvarfner, + # where it's fixed to `1.0`, we follow suit but allow some minor deviation + # with a prior. + return gpytorch.priors.NormalPrior(loc=1.0, scale=0.1) + + +def default_lengthscale_prior( + N: int, +) -> tuple[gpytorch.priors.LogNormalPrior, gpytorch.constraints.Interval]: + # Based on `Vanilla GP work great in High Dimensions` by Carl Hvarfner + # TODO: I'm not convinced entirely that the `std` is independent + # of the dimension and number of samples + lengthscale_prior = gpytorch.priors.LogNormalPrior( + loc=math.sqrt(2.0) + math.log(N) / 2, + scale=math.sqrt(3.0), + ) + # NOTE: It's possible to just specify `GreaterThan`, however + # digging through the code, if this ends up at botorch's optimize, + # it will read this and take the bounds and give it to Scipy's + # L-BFGS-B optimizer. Without an upper bound, it defaults to `inf`, + # which can impact gradient estimates. + # tldr; set a bound if you have one, it always helps + lengthscale_constraint = gpytorch.constraints.Interval( + lower_bound=1e-4, + upper_bound=1e3, + initial_value=math.sqrt(2.0) + math.log(N) / 2, + ) + return lengthscale_prior, lengthscale_constraint + + +def default_mean() -> gpytorch.means.ConstantMean: + return gpytorch.means.ConstantMean( + constant_prior=gpytorch.priors.NormalPrior(0, 0.2), + constant_constraint=gpytorch.constraints.Interval( + lower_bound=-1e6, + upper_bound=1e6, + initial_value=0.0, + ), + ) + + +def default_matern_kernel( + N: int, # noqa: N803 + active_dims: tuple[int, ...] | None = None, +) -> ScaleKernel: + lengthscale_prior, lengthscale_constraint = default_lengthscale_prior(N) + + return ScaleKernel( + MaternKernel( + nu=2.5, + ard_num_dims=N, + active_dims=active_dims, + lengthscale_prior=lengthscale_prior, + lengthscale_constraint=lengthscale_constraint, + ), + ) + + +def default_categorical_kernel( + N: int, # noqa: N803 + active_dims: tuple[int, ...]
| None = None, +) -> ScaleKernel: + # Following BoTorches implementation of the MixedSingleTaskGP + return ScaleKernel( + CategoricalKernel( + ard_num_dims=N, + active_dims=active_dims, + lengthscale_constraint=gpytorch.constraints.GreaterThan(1e-6), ) - optim_vars.append(noise_variance) - - if self.optimizer == "adam": - optim = Adam(optim_vars, **self.optimizer_kwargs) # type: ignore - elif self.optimizer == "sgd": - optim = SGD(optim_vars, **self.optimizer_kwargs) # type: ignore + ) + + +def default_single_obj_gp(x: DataPack, y: torch.Tensor) -> SingleTaskGP: + encoder = x.encoder + assert x.tensor is not None + assert encoder.tensors is not None + # Here, we will collect all graph encoded hyperparameters and assign each + # to its own individual WL kernel. + if encoder.graphs is not None: + raise NotImplementedError("Graphs are not yet supported.") + + numerics: list[str] = [] + categoricals: list[str] = [] + for hp_name, transformer in encoder.tensors.transformers.items(): + if isinstance(transformer, CategoricalToIntegerTransformer): + categoricals.append(hp_name) else: - raise ValueError(f"Invalid optimizer {self.optimizer}") - - K_inv: torch.Tensor | None = None - _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - N = len(x) - K = _init((N, N), device=self.device, dtype=torch.float64) - for _i in range(self.optimizer_iters): - optim.zero_grad() - - for _kernel_name, (hps, opt_kernel) in opt_kernels.items(): - data = x.select(hps) - k = opt_kernel.forward(data) - K.add_(k) if self.combined_kernel == "sum" else K.mul_(k) - - K.diag().add_(noise_variance) - K_inv, logDetK = compute_pd_inverse(K) - nlml = -log_marginal_likelihood(K_inv, logDetK, y=self.y_normalized_) - - # TODO: Could early stop here... - nlml.backward() - optim.step() - - with torch.no_grad(): - noise_variance.clamp_(1e-6, np.inf) - - # Apply the optimal hyperparameters - assert K_inv is not None - self.K_inv_ = K_inv.clone() - self.noise_variance_ = noise_variance.item() - self.opt_kernels_ = opt_kernels - self.train_x_ = x - - def predict( - self, - *, - x: DataPack, - ) -> tuple[torch.Tensor, torch.Tensor]: - """Kriging predictions.""" - if ( - self.K_inv_ is None - or self.train_x_ is None - or self.y_normalized_ is None - or self.y_std_ is None - or self.opt_kernels_ is None - ): - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize " - "function to fit on the training data first!" 
- ) - - _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - n_test = len(x) - - K_train_test = _init( - len(self.train_x_), n_test, device=self.device, dtype=torch.float64 + numerics.append(hp_name) + + categorical_indices = encoder.indices(categoricals) + numeric_indices = encoder.indices(numerics) + + # Purely vectorial + if len(categorical_indices) == 0: + return SingleTaskGP( + train_X=x.tensor, + train_Y=y, + mean_module=default_mean(), + likelihood=default_likelihood_with_prior(), + # Only matern kernel + covar_module=default_matern_kernel(len(numerics)), + outcome_transform=Standardize(m=1), + ) + + # Purely categorical + if len(numeric_indices) == 0: + return SingleTaskGP( + train_X=x.tensor, + train_Y=y, + mean_module=default_mean(), + likelihood=default_likelihood_with_prior(), + # Only categorical kernel + covar_module=default_categorical_kernel(len(categoricals)), + outcome_transform=Standardize(m=1), ) - for _kernel_name, (hps, opt_kernel) in self.opt_kernels_.items(): - train = self.train_x_.select(hps) - test = x.select(hps) - k = opt_kernel.forward(train, test) - if self.combined_kernel == "sum": - K_train_test.add_(k) - else: - K_train_test.mul_(k) - - K_test_test = _init(n_test, n_test, device=self.device, dtype=torch.float64) - for _kernel_name, (hps, opt_kernel) in self.opt_kernels_.items(): - test = x.select(hps) - k = opt_kernel.forward(test, test) - if self.combined_kernel == "sum": - K_test_test.add_(k) - else: - K_test_test.mul_(k) - - # Compute the predictive mean - - # Scale by the standard deviation and mean - mu_s = K_train_test.t() @ self.K_inv_ @ self.y_normalized_ - mu_s = mu_s * self.y_std_ + self.y_mean_ - - cov_s = K_test_test - K_train_test.t() @ self.K_inv_ @ K_train_test - cov_s *= self.y_std_**2 - - return mu_s, cov_s - - @classmethod - def get_default( - cls, space: SearchSpace, *, include_fidelities: bool = False - ) -> ComprehensiveGP: - kernels = get_default_kernels(space=space, include_fidelities=include_fidelities) - return cls(space=space, kernels=kernels) - - -def get_default_kernels( - *, - space: SearchSpace, - include_fidelities: bool = False, -) -> dict[str, tuple[Kernel, Transformer]]: - kernels: dict[str, tuple[Kernel, Transformer]] = {} - - # We will always need to use a graph kernel for graphs and there's no - # possibility to embed them into a tensor. 
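As a side note on how a `SingleTaskGP` like the ones constructed above is typically used: the model is fitted by maximizing its exact marginal log likelihood and then queried through its posterior. The sketch below uses only standard BoTorch/GPyTorch calls; the tensors, shapes, and fitting step are placeholders for illustration and are not taken from this patch.

import torch
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_x = torch.rand(20, 3, dtype=torch.float64)   # 20 configs, 3 encoded dimensions
train_y = torch.rand(20, 1, dtype=torch.float64)   # observed losses, shape (n, 1)

model = SingleTaskGP(train_X=train_x, train_Y=train_y)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_mll(mll)  # maximizes the marginal log likelihood of the training data

test_x = torch.rand(5, 3, dtype=torch.float64)
posterior = model.posterior(test_x)
mean, variance = posterior.mean, posterior.variance  # predictive mean and variance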
- if any(space.graphs): - for hp_name in space.graphs: - kernels[f"graph_{hp_name}"] = ( - WeisfilerLehman(h=2, oa=True), - WLInputTransformer((hp_name,)), - ) - - assert all( - isinstance(f, (IntegerParameter, FloatParameter)) for f in space.fidelities - ), "Assumption for numeric represetnation of fidelity broken" - - any_numerical = any(space.numerical) or (include_fidelities and any(space.fidelities)) - if any_numerical: - # At least one numerical, fuse numeric + categoricals into one tensor encoding - transformers: list[TensorTransformer] = [] - if any(space.categoricals): - transformers.append(OneHotEncoder(tuple(space.categoricals))) - - if include_fidelities: - min_max_normalizer = MinMaxNormalizer( - tuple(space.numerical) + tuple(space.fidelities) - ) - else: - min_max_normalizer = MinMaxNormalizer(tuple(space.numerical)) - transformers.append(min_max_normalizer) - kernels["vectorial"] = (Matern52Kernel(), JointTransformer.join(*transformers)) - else: - # At this point, we assume only categoricals and maybe fidelities - assert any(space.categoricals) + # Mixed + def cont_kernel_factory( + batch_shape: torch.Size, + ard_num_dims: int, + active_dims: list[int], + ) -> ScaleKernel: + lengthscale_prior, lengthscale_constraint = default_lengthscale_prior( + ard_num_dims + ) + return ScaleKernel( + MaternKernel( + nu=2.5, + batch_shape=batch_shape, + ard_num_dims=ard_num_dims, + active_dims=active_dims, + lengthscale_prior=lengthscale_prior, + lengthscale_constraint=lengthscale_constraint, + ), + ) - if include_fidelities and any(space.fidelities): - fid_normalizer = MinMaxNormalizer(tuple(space.fidelities)) - one_hot_encoder = OneHotEncoder(tuple(space.categoricals)) + return MixedSingleTaskGP( + train_X=x.tensor, + train_Y=y, + cat_dims=list(categorical_indices), + likelihood=default_likelihood_with_prior(), + cont_kernel_factory=cont_kernel_factory, + outcome_transform=Standardize(m=1), + ) - transformer = JointTransformer.join(one_hot_encoder, fid_normalizer) - kernels["vectorial"] = (Matern52Kernel(), transformer) - else: - transformer = CategoricalToIntegerTransformer(tuple(space.categoricals)) - kernels["categorical"] = (HammingKernel(), transformer) - return kernels +def optimize_acq( + acq_fn: AcquisitionFunction, + encoder: DataEncoder, + *, + q: int, + num_restarts: int, + raw_samples: int, + acq_options: Mapping[str, Any] | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + acq_options = acq_options or {} + if encoder.has_graphs(): + raise NotImplementedError("Graphs are not yet supported.") + + assert encoder.tensors is not None + lower = [t.domain.lower for t in encoder.tensors.transformers.values()] + upper = [t.domain.upper for t in encoder.tensors.transformers.values()] + bounds = torch.tensor([lower, upper], dtype=torch.float) + + fixed_categoricals = encoder.categorical_product_indices() + + if not any(fixed_categoricals): + return optimize_acqf( + acq_function=acq_fn, + bounds=bounds, + q=q, + num_restarts=num_restarts, + raw_samples=raw_samples, + **acq_options, + ) + + if len(fixed_categoricals) > 30: + raise ValueError( + "The number of fixed categorical dimensions is too high. " + "This will lead to an explosion in the number of possible " + "combinations. Please reduce the number of fixed categorical " + "dimensions or consider encoding your categoricals in some other format." + ) + + # TODO: we should deterministicall shuffle the fixed_categoricals as the + # underlying function does not. 
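For context on the mixed branch handled next: BoTorch's `optimize_acqf_mixed` expects `fixed_features_list`, a list of dictionaries mapping a tensor column index to a fixed, integer-encoded category value, with one dictionary per categorical combination. The guard above exists because this list grows as the product of all category counts. A minimal sketch of building such a list follows; the column indices and choice counts are made up for illustration.

from itertools import product

# column index in the feature tensor -> encoded values of that categorical (assumed)
cat_columns = {1: [0, 1, 2], 3: [0, 1]}
fixed_features_list = [
    dict(zip(cat_columns.keys(), combo)) for combo in product(*cat_columns.values())
]
# -> [{1: 0, 3: 0}, {1: 0, 3: 1}, {1: 1, 3: 0}, ...], 6 combinations in total.
# optimize_acqf_mixed runs one continuous optimization per combination and keeps the best.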
+ return optimize_acqf_mixed( + acq_function=acq_fn, + bounds=bounds, + num_restarts=num_restarts, + raw_samples=raw_samples, + q=q, + fixed_features_list=fixed_categoricals, # type: ignore + **acq_options, + ) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 19efa6b6..a89a15aa 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,21 +1,22 @@ from __future__ import annotations import random -from itertools import chain -from typing import TYPE_CHECKING, Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping import torch +from botorch.acquisition import ( + LinearMCObjective, + qLogExpectedImprovement, +) from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.acquisition_functions import ( - AcquisitionMapping, DecayingPriorWeightedAcquisition, ) -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, +from neps.optimizers.bayesian_optimization.models.gp import ( + default_single_obj_gp, + optimize_acq, ) -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping -from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP from neps.search_spaces import ( CategoricalParameter, ConstantParameter, @@ -23,16 +24,13 @@ IntegerParameter, SearchSpace, ) -from neps.search_spaces.encoding import Encoder -from neps.utils.common import instance_from_map +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN +from neps.search_spaces.encoding import DataEncoder if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) + from botorch.models.model import Model + + from neps.search_spaces.encoding import DataPack from neps.state import BudgetInfo, Trial # TODO(eddiebergman): Why not just include in the definition of the parameters. @@ -53,10 +51,8 @@ def __init__( pipeline_space: SearchSpace, *, initial_design_size: int = 10, - surrogate_model: str | Any = "gp", - acquisition: str | BaseAcquisition = "EI", + surrogate_model: Literal["gp"] | Callable[[DataPack, torch.Tensor], Model] = "gp", log_prior_weighted: bool = False, - acquisition_sampler: str | AcquisitionSampler = "mutation", random_interleave_prob: float = 0.0, patience: int = 100, budget: None | int | float = None, @@ -67,6 +63,8 @@ def __init__( disable_priors: bool = False, prior_confidence: Literal["low", "medium", "high"] | None = None, sample_default_first: bool = False, + device: torch.device | None = None, + **kwargs: Any, # TODO: Remove ): """Initialise the BO loop. @@ -128,44 +126,20 @@ def __init__( self._initial_design_size = initial_design_size self._random_interleave_prob = random_interleave_prob self._num_error_evaluations: int = 0 + self.device = device self.sample_default_first = sample_default_first + self.encoder: DataEncoder | None = None - if isinstance(surrogate_model, str): - if surrogate_model == "gp": - self.surrogate_model = ComprehensiveGP.get_default( - space=pipeline_space, - include_fidelities=False, - ) - self._encoder = Encoder.default(self.pipeline_space) - else: - raise NotImplementedError( - "Only 'gp' is supported as a surrogate model for now." 
- ) - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) + if surrogate_model == "gp": + self._get_fitted_model = default_single_obj_gp else: - self.surrogate_model = surrogate_model + self._get_fitted_model = surrogate_model - self.acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - ) if self.pipeline_space.has_prior: self.acquisition = DecayingPriorWeightedAcquisition( self.acquisition, log=log_prior_weighted ) - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, - ) if self.pipeline_space.has_prior: for k, v in self.pipeline_space.items(): if v.is_fidelity or isinstance(v, ConstantParameter): @@ -179,6 +153,8 @@ def __init__( ] self.pipeline_space[k].default_confidence_score = confidence + self._cached_sobol_configs: list[dict[str, Any]] | None = None + def ask( self, trials: Mapping[str, Trial], @@ -192,74 +168,102 @@ def ask( for t in trials.values() if t.report is not None and t.report.loss is not None ] - train_x = [t.config for t in completed] - train_y: torch.Tensor = torch.as_tensor([t.report.loss for t in completed]) # type: ignore + x_configs = [t.config for t in completed] + y: torch.Tensor = torch.as_tensor( + [t.report.loss for t in completed], + dtype=torch.float64, + ) # type: ignore + + # We only do single objective for now but may as well include this for when we have MO + if y.ndim == 1: + y = y.unsqueeze(1) pending = [t.config for t in trials.values() if t.state.pending()] + if self.encoder is None: + self.encoder = DataEncoder.default_encoder( + self.pipeline_space, + include_fidelities=False, + ) space = self.pipeline_space - # TODO: This would be better if we could serialize these - # in their encoded form. later... 
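The initial-design branch a little further down draws scrambled Sobol points in the unit hypercube and casts each column into the corresponding hyperparameter's domain. A standalone sketch of that casting step follows; the two concrete domains are invented for illustration, and only the `NumberDomain`/`UNIT_FLOAT_DOMAIN` API from this repository is assumed.

import torch
from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, NumberDomain

domains = [
    NumberDomain.float(1e-4, 1e-1, log=True),  # e.g. a log-scaled learning rate
    NumberDomain.int(1, 8),                    # e.g. a layer count
]
sobol = torch.quasirandom.SobolEngine(dimension=len(domains), scramble=True, seed=5)
unit_samples = sobol.draw(16, dtype=torch.float64)  # shape (16, 2), all values in [0, 1]

columns = [
    d.cast(unit_samples[:, i], frm=UNIT_FLOAT_DOMAIN) for i, d in enumerate(domains)
]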
- for name, hp in space.categoricals.items(): - for config in chain(train_x, pending): - config[name] = hp.choices.index(config[name]) - for name, hp in space.graphs.items(): - for config in chain(train_x, pending): - config[name] = hp.clone().load_from(config[name]) - if len(trials) == 0 and self.sample_default_first and space.has_prior: config = space.sample_default_configuration( patience=self.patience, ignore_fidelity=False - ) + ).hp_values() + elif len(trials) <= self._initial_design_size: - config = space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) + if self._cached_sobol_configs is None: + assert self.encoder.tensors is not None + ndim = len(self.encoder.tensors.transformers) + sobol = torch.quasirandom.SobolEngine( + dimension=ndim, + scramble=True, + seed=5, + ) + + # TODO: Need a better encapsulation of this + x = sobol.draw(self._initial_design_size * ndim, dtype=torch.float64) + hp_normalized_values = [] + for i, (_k, v) in enumerate(self.encoder.tensors.transformers.items()): + tensor = v.domain.cast(x[:, i], frm=UNIT_FLOAT_DOMAIN) + tensor = tensor.unsqueeze(1) if tensor.ndim == 1 else tensor + hp_normalized_values.append(tensor) + + tensor = torch.cat(hp_normalized_values, dim=1) + uniq = torch.unique(tensor, dim=0) + self._cached_sobol_configs = self.encoder.tensors.decode_dicts(uniq) + + if len(trials) <= len(self._cached_sobol_configs): + config = self._cached_sobol_configs[len(trials) - 1] + else: + # The case where sobol sampling couldn't generate enough unique configs + config = space.sample( + patience=self.patience, ignore_fidelity=False, user_priors=False + ).hp_values() + elif random.random() < self._random_interleave_prob: config = space.sample( patience=self.patience, user_priors=False, ignore_fidelity=False - ) + ).hp_values() else: - try: - if len(pending) > 0: - # We want to use hallucinated results for the evaluations that have - # not finished yet. For this we fit a model on the finished - # evaluations and add these to the other results to fit another model. - self.surrogate_model.fit(train_x, train_y) - ys, _ = self.surrogate_model.predict(pending) - train_x += pending - train_y += list(ys.detach().numpy()) - - # TODO: When using a GP, if we've already fit the - # model due to the if stamet above, we only - # need to update the model with the new points. - # fit on all the data again, only the new points... - self.surrogate_model.fit(train_x, train_y) - self.acquisition.set_state(self.surrogate_model) - self.acquisition_sampler.set_state(x=train_x, y=train_y) - for _ in range(self.patience): - config = self.acquisition_sampler.sample(self.acquisition) - if config not in pending: - break - else: - config = space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - - except RuntimeError as e: - self.logger.exception( - "Model could not be updated due to below error. 
Sampling will not use" - " the model.", - exc_info=e, - ) - config = space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) + assert self.encoder is not None + x = self.encoder.encode(x_configs, device=self.device) + if any(pending): + x_pending = self.encoder.encode(pending, device=self.device) + x_pending = x_pending.tensor + assert x_pending is not None + else: + x_pending = None + + model = self._get_fitted_model(x, y) + + N_CANDIDATES_REQUIRED = 1 + N_INITIAL_RANDOM_SAMPLES = 512 + N_RESTARTS = 20 + + candidates, _eis = optimize_acq( + # TODO: We should evaluate whether LogNoisyEI is better than LogEI + acq_fn=qLogExpectedImprovement( + model, + best_f=y.min(), + X_pending=x_pending, + # Unfortunatly, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 + objective=LinearMCObjective(weights=torch.tensor([-1.0])), + ), + encoder=self.encoder, + q=N_CANDIDATES_REQUIRED, + raw_samples=N_INITIAL_RANDOM_SAMPLES, + num_restarts=N_RESTARTS, + acq_options={}, # options to underlying optim function of botorch + ) + config = self.encoder.decode_dicts(candidates)[0] config_id = str(len(trials) + 1) return SampledConfig( id=config_id, - config=config.hp_values(), + config=config, previous_config_id=None, ), optimizer_state diff --git a/neps/optimizers/bayesian_optimization/sobol.py b/neps/optimizers/bayesian_optimization/sobol.py new file mode 100644 index 00000000..e69de29b diff --git a/neps/runtime.py b/neps/runtime.py index 5cf0f29f..7d1cd60f 100644 --- a/neps/runtime.py +++ b/neps/runtime.py @@ -46,7 +46,7 @@ def _default_worker_name() -> str: return f"{os.getpid()}-{isoformat}" -N_FAILED_GET_NEXT_PENDING_ATTEMPTS_BEFORE_ERROR = 10 +N_FAILED_GET_NEXT_PENDING_ATTEMPTS_BEFORE_ERROR = 0 N_FAILED_TO_SET_TRIAL_STATE = 10 Loc = TypeVar("Loc") @@ -388,7 +388,7 @@ def run(self) -> None: # noqa: C901, PLR0915 _repeated_fail_get_next_trial_count = 0 except Exception as e: _repeated_fail_get_next_trial_count += 1 - logger.error( + logger.debug( "Error while trying to get the next trial to evaluate.", exc_info=True ) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 06814862..e3e297de 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -33,18 +33,22 @@ class NumberDomain(Generic[V]): bins: int | None = None dtype: torch.dtype = field(init=False, repr=False) - is_unit: bool = field(init=False, repr=False) + is_unit_float: bool = field(init=False, repr=False) midpoint: V = field(init=False, repr=False) is_log: bool = field(init=False, repr=False) length: V = field(init=False, repr=False) cardinality: int | None = field(init=False, repr=False) + bounds: tuple[V, V] = field(init=False, repr=False) def __post_init__(self): assert isinstance(self.lower, type(self.upper)) - object.__setattr__(self, "is_unit", self.lower == 0 and self.upper == 1) + is_int = isinstance(self.lower, int) object.__setattr__(self, "is_log", self.log_bounds is not None) + object.__setattr__(self, "dtype", torch.int64 if is_int else torch.float64) object.__setattr__( - self, "dtype", torch.int64 if isinstance(self.lower, int) else torch.float64 + self, + "is_unit_float", + self.lower == 0 and self.upper == 1 and is_int and not self.round, ) object.__setattr__(self, "length", self.upper - self.lower) @@ -60,6 +64,7 @@ def __post_init__(self): if self.dtype == torch.int64: mid = int(round(mid)) object.__setattr__(self, 
"midpoint", mid) + object.__setattr__(self, "bounds", (self.lower, self.upper)) @classmethod def float( @@ -107,7 +112,7 @@ def indices(cls, n: int) -> NumberDomain[int]: return NumberDomain.int(0, n - 1) def to_unit(self, x: Tensor) -> Tensor: - if self.is_unit: + if self.is_unit_float: return x # type: ignore if self.log_bounds is not None: @@ -119,7 +124,7 @@ def to_unit(self, x: Tensor) -> Tensor: return (x - lower) / (upper - lower) def from_unit(self, x: Tensor) -> Tensor: - if self.is_unit: + if self.is_unit_float: return x bins = self.bins @@ -146,10 +151,6 @@ def cast( x: Tensor, frm: Domain, ) -> Tensor: - if isinstance(frm, OneHotDomain): - x = torch.argmax(x, dim=1) - frm = frm.int_domain - # NOTE: In general, we should always be able to go through the unit interval # [0, 1] to be able to transform between domains. However sometimes we can # bypass some steps, dependant on the domains, hence the ugliness... @@ -168,7 +169,7 @@ def cast( # Shortcut 2. (From normalized) # The domain we are coming from is already normalized, we only need to lift - if frm.is_unit: + if frm.is_unit_float: return self.from_unit(x) # type: ignore # Shortcut 3. (Log lift) @@ -289,28 +290,6 @@ def unit_float(cls) -> NumberDomain[float]: return UNIT_FLOAT_DOMAIN -@dataclass(frozen=True) -class OneHotDomain: - cardinality: int - int_domain: NumberDomain[int] = field(init=False, repr=False) - - def __post_init__(self): - object.__setattr__( - self, - "int_domain", - NumberDomain.indices(self.cardinality), - ) - - def cast(self, x: Tensor, frm: NumberDomain[int]) -> Tensor: - # Convert to integers first - x = self.int_domain.cast(x, frm) - - # Then one hot encode - buffer = torch.zeros((len(x), self.cardinality)) - buffer.scatter_(1, x.unsqueeze(1), 1) - return buffer - - UNIT_FLOAT_DOMAIN = NumberDomain.float(0.0, 1.0) -Domain: TypeAlias = NumberDomain | OneHotDomain +Domain: TypeAlias = NumberDomain diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index adcaa121..3d9d2928 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,10 +1,12 @@ from __future__ import annotations from dataclasses import dataclass, field +from itertools import chain from typing import ( TYPE_CHECKING, Any, Generic, + Mapping, Sequence, Sized, TypeAlias, @@ -17,13 +19,16 @@ import numpy.typing as npt import torch from grakel.utils import graph_from_networkx +from torch._dynamo.utils import product +from neps.search_spaces.architecture.graph_grammar import GraphParameter from neps.search_spaces.domain import ( UNIT_FLOAT_DOMAIN, Domain, NumberDomain, - OneHotDomain, ) +from neps.search_spaces.hyperparameters.float import FloatParameter +from neps.search_spaces.hyperparameters.integer import IntegerParameter if TYPE_CHECKING: import networkx as nx @@ -47,7 +52,7 @@ class TensorTransformer(Transformer[torch.Tensor], Protocol): def encode( self, - x: list[Any], + x: Sequence[Any], *, out: torch.Tensor | None = None, dtype: torch.dtype | None = None, @@ -57,7 +62,7 @@ def encode( @dataclass class CategoricalToIntegerTransformer(TensorTransformer): - choices: list[Any] + choices: Sequence[Any] domain: NumberDomain = field(init=False) output_cols: int = field(init=False) @@ -68,6 +73,7 @@ def __post_init__(self): self.domain = NumberDomain.indices(len(self.choices)) self.output_cols = 1 + self._lookup = None if len(self.choices) > 3: try: self._lookup = {c: i for i, c in enumerate(self.choices)} @@ -77,7 +83,7 @@ def __post_init__(self): @override def encode( self, - x: 
list[Any], + x: Sequence[Any], *, out: torch.Tensor | None = None, dtype: torch.dtype | None = None, @@ -92,16 +98,16 @@ def encode( else [self.choices.index(c) for c in x] ) + tensor = torch.tensor(values, dtype=torch.int64, device=device) if out is None: - return torch.tensor(values, dtype=dtype, device=device) + return tensor.to(dtype) - assert out.shape == (len(x),), f"{out.shape} != {(len(x),)}" - out[:] = torch.tensor(values, dtype=out.dtype, device=out.device) + out.copy_(tensor.to(out.dtype)).round_() return out @override def decode(self, x: torch.Tensor) -> list[Any]: - return [self.choices[i] for i in x] + return [self.choices[int(i)] for i in torch.round(x).tolist()] # TODO: Maybe add a shift argument, could be useful to have `0` as midpoint @@ -137,8 +143,7 @@ def encode( if out is None: return values - assert out.shape == (len(x),), f"{out.shape} != {(len(x),)}" - out[:] = values + out.copy_(values) return out @override @@ -147,51 +152,6 @@ def decode(self, x: torch.Tensor) -> list[V]: return values.tolist() -@dataclass -class OneHotEncoder(TensorTransformer): - choices: list[Any] - - domain: OneHotDomain = field(init=False) - output_cols: int = field(init=False) - categorical_to_integer: CategoricalToIntegerTransformer = field(init=False) - - def __post_init__(self): - self.categorical_to_integer = CategoricalToIntegerTransformer(self.choices) - self.output_cols = len(self.choices) - - @override - def encode( - self, - x: list[Any], - *, - out: torch.Tensor | None = None, - dtype: torch.dtype | None = None, - device: torch.device | None = None, - ) -> torch.Tensor: - if out is not None: - dtype = out.dtype - device = out.device - else: - dtype = torch.float64 if dtype is None else dtype - - ints = self.categorical_to_integer.encode(x, dtype=torch.int64, device=device) - shape = (len(x), self.output_cols) - if out is None: - buffer = torch.zeros(size=shape, dtype=dtype, device=device) - else: - assert out.shape == shape, f"{out.shape} != {shape}" - buffer = out - - cat_tensor = torch.tensor(ints, dtype=torch.int64, device=device).unsqueeze(1) - buffer.scatter_(1, cat_tensor, 1) - return buffer - - @override - def decode(self, x: torch.Tensor) -> list[Any]: - ints = torch.argmax(x, dim=1) - return self.categorical_to_integer.decode(ints) - - @dataclass class WLInputTransformer(Transformer[WLInput]): hp: str @@ -199,7 +159,7 @@ class WLInputTransformer(Transformer[WLInput]): def encode(self, x: Sequence[nx.Graph]) -> list[WLInput]: return [graph_from_networkx(g) for g in x] # type: ignore - def decode(self, x: dict[str, list[WLInput]]) -> dict[str, list[Any]]: + def decode(self, x: Mapping[str, Sequence[WLInput]]) -> dict[str, list[Any]]: raise NotImplementedError("Cannot decode WLInput to values.") @@ -224,10 +184,10 @@ def select( return x[:, [self.column_lookup[h] for h in hp]] - def encode(self, x: list[SearchSpace]) -> npt.NDArray[np.object_]: + def encode(self, x: Sequence[Any]) -> npt.NDArray[np.object_]: buffer = np.empty((len(x), len(self.transformers)), dtype=np.object_) for hp, transformer in self.transformers.items(): - values = [conf[hp].value for conf in x] + values = [conf[hp] for conf in x] buffer[:, self.column_lookup[hp]] = transformer.encode(values) # type: ignore return buffer @@ -259,7 +219,7 @@ def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: def encode( self, - x: list[SearchSpace], + x: Sequence[Mapping[str, Any]], *, device: torch.device | None = None, ) -> torch.Tensor: @@ -269,11 +229,12 @@ def encode( for hp_name, 
transformer in self.transformers.items(): values = [conf[hp_name] for conf in x] lookup = self.column_lookup[hp_name] + lookup = lookup[0] if lookup[1] - lookup[0] == 1 else slice(*lookup) # Encode directly into buffer transformer.encode( values, - out=buffer[:, slice(*lookup)], + out=buffer[:, lookup], dtype=torch.float64, device=device, ) @@ -284,7 +245,12 @@ def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): lookup = self.column_lookup[hp_name] - values[hp_name] = transformer.decode(x[:, slice(*lookup)]) + if lookup[1] == lookup[0] + 1: + tensor = x[:, lookup[0]] + else: + tensor = x[:, slice(*lookup)] + + values[hp_name] = transformer.decode(tensor) keys = list(values.keys()) return [dict(zip(keys, vals)) for vals in zip(*values.values())] @@ -297,13 +263,13 @@ class DataEncoder: def encode( self, - x: list[SearchSpace], + x: Sequence[Mapping[str, Any]], *, device: torch.device | None = None, - ) -> tuple[torch.Tensor | None, npt.NDArray[np.object_] | None]: + ) -> DataPack: tensor = self.tensors.encode(x, device=device) if self.tensors else None graphs = self.graphs.encode(x) if self.graphs else None - return tensor, graphs + return DataPack(encoder=self, tensor=tensor, graphs=graphs) @overload def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: ... @@ -358,21 +324,110 @@ def decode_dicts( assert graph_values is not None return graph_values + def indices(self, hp: str | Sequence[str]) -> tuple[int, ...]: + if isinstance(hp, str): + if self.tensors and hp in self.tensors.transformers: + lower, upper = self.tensors.column_lookup[hp] + return tuple(torch.arange(lower, upper).tolist()) + + if self.graphs and hp in self.graphs.transformers: + raise ValueError("Cannot select indices from graphs.") + + tkeys = None if self.tensors is None else self.tensors.transformers.keys() + gkeys = None if self.graphs is None else self.graphs.transformers.keys() + raise KeyError( + f"Unknown hyperparameter {hp}. 
Not in either tensors or graphs" + f"\nTensors: {tkeys}" + f"\nGraphs: {gkeys}" + ) + + return tuple(sorted(chain.from_iterable(self.indices(h) for h in hp))) + + @classmethod + def default_encoder( + cls, + space: SearchSpace, + *, + include_fidelities: bool | list[str] = False, + ) -> DataEncoder: + tensor_transformers: dict[str, TensorTransformer] = {} + graph_transformers: dict[str, WLInputTransformer] = {} + + for hp_name, hp in space.categoricals.items(): + tensor_transformers[hp_name] = CategoricalToIntegerTransformer(hp.choices) + + for hp_name, hp in space.numerical.items(): + assert isinstance(hp, (FloatParameter, IntegerParameter)) + tensor_transformers[hp_name] = MinMaxNormalizer(hp.domain) + + for hp_name, hp in space.graphs.items(): + assert isinstance(hp, GraphParameter) + graph_transformers[hp_name] = WLInputTransformer(hp_name) + + if include_fidelities is True: + include_fidelities = list(space.fidelities.keys()) + + if include_fidelities: + for fid_name in include_fidelities: + hp = space.fidelities[fid_name] + assert isinstance(hp, (FloatParameter, IntegerParameter)) + tensor_transformers[fid_name] = MinMaxNormalizer(hp.domain) + + tensor_encoder = ( + TensorEncoder(tensor_transformers) if any(tensor_transformers) else None + ) + graph_encoder = ( + GraphEncoder(graph_transformers) if any(graph_transformers) else None + ) + return DataEncoder(tensors=tensor_encoder, graphs=graph_encoder) + + def has_categoricals(self) -> bool: + return self.tensors is not None and any( + isinstance(t, CategoricalToIntegerTransformer) + for t in self.tensors.transformers.values() + ) + + def has_graphs(self) -> bool: + return self.graphs is not None + + def has_numericals(self) -> bool: + return self.tensors is not None and any( + not isinstance(t, CategoricalToIntegerTransformer) + for t in self.tensors.transformers.values() + ) + + def categorical_product_indices(self) -> list[dict[int, int]]: + cats: dict[int, list[int]] = {} + if self.tensors is None: + return [] + + for i, (_hp_name, transformer) in enumerate(self.tensors.transformers.items()): + if isinstance(transformer, CategoricalToIntegerTransformer): + cats[i] = list(range(len(transformer.choices))) + + if len(cats) == 0: + return [] + + if len(cats) == 1: + key, values = cats.popitem() + return [{key: v} for v in values] + + return [dict(zip(cats.keys(), vs)) for vs in product(*cats.values())] + @dataclass class DataPack(Sized): - space: SearchSpace encoder: DataEncoder - numerical: torch.Tensor | None = None + tensor: torch.Tensor | None = None graphs: npt.NDArray[np.object_] | None = None _len: int = field(init=False) def __post_init__(self): - if self.numerical is not None and self.graphs is not None: - assert len(self.numerical) == len(self.graphs) - self._len = len(self.numerical) - elif self.numerical is not None: - self._len = len(self.numerical) + if self.tensor is not None and self.graphs is not None: + assert len(self.tensor) == len(self.graphs) + self._len = len(self.tensor) + elif self.tensor is not None: + self._len = len(self.tensor) elif self.graphs is not None: self._len = len(self.graphs) else: @@ -384,8 +439,8 @@ def __len__(self) -> int: def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.object_]: if isinstance(hp, str): if self.encoder.tensors and hp in self.encoder.tensors.transformers: - assert self.numerical is not None - return self.encoder.tensors.select(self.numerical, hp) + assert self.tensor is not None + return self.encoder.tensors.select(self.tensor, hp) if 
self.encoder.graphs and hp in self.encoder.graphs.transformers: assert self.graphs is not None @@ -427,16 +482,67 @@ def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.objec ) if all_in_tensors: - assert self.numerical is not None + assert self.tensor is not None assert self.encoder.tensors is not None - return self.encoder.tensors.select(self.numerical, hp) + return self.encoder.tensors.select(self.tensor, hp) assert self.graphs is not None assert self.encoder.graphs is not None return self.encoder.graphs.select(self.graphs, hp) - def decode(self) -> list[SearchSpace]: + def decode(self, space: SearchSpace) -> list[SearchSpace]: return [ - self.space.from_dict(d) - for d in self.encoder.decode_dicts((self.numerical, self.graphs)) + space.from_dict(d) + for d in self.encoder.decode_dicts((self.tensor, self.graphs)) ] + + def split(self, index: int) -> tuple[DataPack, DataPack]: + if self.tensor is not None: + numerical_left = self.tensor[:index] + numerical_right = self.tensor[index:] + else: + numerical_left = None + numerical_right = None + + if self.graphs is not None: + graphs_left = self.graphs[:index] + graphs_right = self.graphs[:index] + else: + graphs_left = None + graphs_right = None + + return ( + DataPack( + self.encoder, + tensor=numerical_left, + graphs=graphs_left, + ), + DataPack( + self.encoder, + tensor=numerical_right, + graphs=graphs_right, + ), + ) + + def join(self, *other: DataPack) -> DataPack: + assert all(o.encoder == self.encoder for o in other) + + if self.tensor is not None: + other_numericals = [] + for o in other: + assert o.tensor is not None + other_numericals.append(o.tensor) + numerical = torch.cat([self.tensor, *other_numericals], dim=0) + else: + numerical = None + + if self.graphs is not None: + other_graphs = [] + for o in other: + assert o.graphs is not None + other_graphs.append(o.graphs) + graphs = np.concatenate([self.graphs, *other_graphs], axis=0) + else: + graphs = None + + return DataPack(self.encoder, tensor=numerical, graphs=graphs) diff --git a/neps/search_spaces/hyperparameters/float.py b/neps/search_spaces/hyperparameters/float.py index b780f3ff..6086e3b7 100644 --- a/neps/search_spaces/hyperparameters/float.py +++ b/neps/search_spaces/hyperparameters/float.py @@ -8,6 +8,7 @@ import numpy as np +from neps.search_spaces.domain import NumberDomain from neps.search_spaces.hyperparameters.numerical import NumericalParameter if TYPE_CHECKING: @@ -70,6 +71,7 @@ def __init__( default=float(default) if default is not None else None, default_confidence=default_confidence, is_fidelity=is_fidelity, + domain=NumberDomain.float(lower, upper, log=log), ) @override diff --git a/neps/search_spaces/hyperparameters/integer.py b/neps/search_spaces/hyperparameters/integer.py index 6462cc63..da3bbd71 100644 --- a/neps/search_spaces/hyperparameters/integer.py +++ b/neps/search_spaces/hyperparameters/integer.py @@ -7,6 +7,7 @@ import numpy as np +from neps.search_spaces.domain import NumberDomain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.numerical import NumericalParameter @@ -75,6 +76,7 @@ def __init__( is_fidelity=is_fidelity, default=int(np.rint(default)) if default is not None else None, default_confidence=default_confidence, + domain=NumberDomain.int(lower, upper, log=log), ) # We subtract/add 0.499999 from lower/upper bounds respectively, such that diff --git a/neps/search_spaces/hyperparameters/numerical.py b/neps/search_spaces/hyperparameters/numerical.py index 
9aaaf6d1..f00b590c 100644 --- a/neps/search_spaces/hyperparameters/numerical.py +++ b/neps/search_spaces/hyperparameters/numerical.py @@ -32,6 +32,7 @@ from neps.search_spaces.parameter import MutatableParameter, ParameterWithPrior if TYPE_CHECKING: + from neps.search_spaces.domain import NumberDomain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.utils.types import TruncNorm @@ -81,6 +82,7 @@ def __init__( log: bool = False, default: T | None, is_fidelity: bool, + domain: NumberDomain[T], default_confidence: Literal["low", "medium", "high"] = "low", ): """Initialize the numerical hyperparameter. @@ -133,6 +135,7 @@ def __init__( self.lower: T = lower self.upper: T = upper self.log: bool = log + self.domain = domain self.log_value: float | None = None self.log_bounds: tuple[float, float] | None = None self.log_default: float | None = None diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index 2a20399d..a89c9bcc 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -2,19 +2,30 @@ import time import numpy as np +import math +import random import neps -def run_pipeline(float1, float2, categorical, integer1, integer2): - loss = -float(np.sum([float1, float2, int(categorical), integer1, integer2])) +def run_pipeline(float1, float2, float3, categorical, integer1, integer2): + loss = -float( + np.sum( + [ + (float1 * float2 / (float3 + 1)) * int(categorical), + integer1, + math.log(integer2), + ] + ) + ) # Random noise # time.sleep(0.7) # For demonstration purposes return loss pipeline_space = dict( float1=neps.FloatParameter(lower=0, upper=1), - float2=neps.FloatParameter(lower=-10, upper=10), + float2=neps.FloatParameter(lower=0, upper=20), + float3=neps.FloatParameter(lower=0, upper=5), categorical=neps.CategoricalParameter(choices=[0, 1]), integer1=neps.IntegerParameter(lower=0, upper=1), integer2=neps.IntegerParameter(lower=1, upper=1000, log=True), From cf43821cef6cd38cb53b4cfbdb50b3ca7f776f68 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 18:03:25 +0200 Subject: [PATCH 12/63] refactor: Prior distributions --- .../acquisition_functions/__init__.py | 9 +- .../acquisition_functions/prior_weighted.py | 56 +++- .../bayesian_optimization/cost_cooling.py | 49 ++-- .../bayesian_optimization/models/gp.py | 19 +- .../bayesian_optimization/optimizer.py | 273 +++++++----------- neps/runtime.py | 7 +- neps/search_spaces/domain.py | 204 ++++++------- neps/search_spaces/encoding.py | 73 +++-- neps/search_spaces/hyperparameters/float.py | 4 +- neps/search_spaces/hyperparameters/integer.py | 4 +- .../hyperparameters/numerical.py | 4 +- neps/state/neps_state.py | 140 ++++----- neps/state/optimizer.py | 12 +- pyproject.toml | 1 + 14 files changed, 423 insertions(+), 432 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py index 89cfb4fb..0d0893a5 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py @@ -7,14 +7,13 @@ ComprehensiveExpectedImprovement, ) from neps.optimizers.bayesian_optimization.acquisition_functions.mf_ei import MFEI -from neps.optimizers.bayesian_optimization.acquisition_functions.ucb import ( - UpperConfidenceBound, - MF_UCB, -) from 
neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) - +from neps.optimizers.bayesian_optimization.acquisition_functions.ucb import ( + MF_UCB, + UpperConfidenceBound, +) AcquisitionMapping: dict[str, Callable] = { "EI": partial( diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py index ca3a3f5b..7b0d4318 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py @@ -1,9 +1,59 @@ -from typing import Iterable, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterable +from typing_extensions import override import numpy as np import torch +from botorch.acquisition import MCAcquisitionFunction + +from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, +) + +if TYPE_CHECKING: + from neps.priors import Prior + + +class PiboAcquisition(MCAcquisitionFunction): + """Compute a prior weighted acquisition function according to PiBO. + + * https://arxiv.org/pdf/2204.11051 + """ + + def __init__( + self, + acq_fn: MCAcquisitionFunction, + prior: Prior, + beta: float, + n: int, + ): + """Initialize the acquisition function. + + Args: + acq_fn: The acquisition function to be weighted. + prior: The prior distribution to be used for weighting. + beta: The beta parameter for weighting. + n: The denominator for the beta parameter. + """ + self._log = self.acq_fn._log + self.acq_fn = acq_fn + + self.beta = beta + self.n = n + self.prior = prior + + @override + def forward(self, X: torch.Tensor) -> torch.Tensor: + weight = self.beta / self.n + acq = self.acq_fn(X) + + # The weight is shown as being applied to the pdf and not the log_pdf + values = acq * self.prior.prob(X) * weight -from .base_acquisition import BaseAcquisition + # However, if the base acq function advertises as being log, + # i.e. 
self._log, then we should return the log of the values + return torch.log(values) if self._log else values class DecayingPriorWeightedAcquisition(BaseAcquisition): @@ -23,7 +73,7 @@ def eval( self, x: Iterable, **base_acquisition_kwargs, - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: acquisition = self.base_acquisition(x, **base_acquisition_kwargs) if self.log: diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py index 5a8926c7..eb3ee28e 100644 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/cost_cooling.py @@ -1,35 +1,35 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING, Any from typing_extensions import override -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult -from neps.utils.common import instance_from_map +from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( CostCooler, ) -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( - DecayingPriorWeightedAcquisition, -) from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization +from neps.utils.common import instance_from_map + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult class CostCooling(BayesianOptimization): """Implements a basic cost-cooling as described in - "Cost-aware Bayesian Optimization" (https://arxiv.org/abs/2003.10870) by Lee et al.""" + "Cost-aware Bayesian Optimization" (https://arxiv.org/abs/2003.10870) by Lee et al. 
+ """ def __init__( self, @@ -37,12 +37,12 @@ def __init__( initial_design_size: int = 10, surrogate_model: str | Any = "gp", cost_model: str | Any = "gp", - surrogate_model_args: dict = None, - cost_model_args: dict = None, + surrogate_model_args: dict | None = None, + cost_model_args: dict | None = None, optimal_assignment: bool = False, - domain_se_kernel: str = None, - graph_kernels: list = None, - hp_kernels: list = None, + domain_se_kernel: str | None = None, + graph_kernels: list | None = None, + hp_kernels: list | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "mutation", @@ -181,11 +181,6 @@ def __init__( self.acquisition = CostCooler(orig_acquisition) - if self.pipeline_space.has_prior: - self.acquisition = DecayingPriorWeightedAcquisition( - self.acquisition, log=log_prior_weighted - ) - self.acquisition_sampler = instance_from_map( AcquisitionSamplerMapping, acquisition_sampler, @@ -214,7 +209,7 @@ def load_optimization_state( train_y = [self.get_loss(el.result) for el in previous_results.values()] train_cost = [self.get_cost(el.result) for el in previous_results.values()] self._num_train_x = len(train_x) - self._pending_evaluations = [el for el in pending_evaluations.values()] + self._pending_evaluations = list(pending_evaluations.values()) if self._num_train_x >= self._initial_design_size: try: if len(self._pending_evaluations) > 0: diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index ab2884f3..b302edcd 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -144,7 +144,10 @@ def default_categorical_kernel( ) -def default_single_obj_gp(x: DataPack, y: torch.Tensor) -> SingleTaskGP: +def default_single_obj_gp( + x: DataPack, + y: torch.Tensor, +) -> SingleTaskGP: encoder = x.encoder assert x.tensor is not None assert encoder.tensors is not None @@ -222,9 +225,9 @@ def optimize_acq( acq_fn: AcquisitionFunction, encoder: DataEncoder, *, - q: int, - num_restarts: int, - raw_samples: int, + n_candidates_required: int = 1, + num_restarts: int = 20, + n_intial_start_points: int = 512, acq_options: Mapping[str, Any] | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: acq_options = acq_options or {} @@ -242,9 +245,9 @@ def optimize_acq( return optimize_acqf( acq_function=acq_fn, bounds=bounds, - q=q, + q=n_candidates_required, num_restarts=num_restarts, - raw_samples=raw_samples, + raw_samples=n_intial_start_points, **acq_options, ) @@ -262,8 +265,8 @@ def optimize_acq( acq_function=acq_fn, bounds=bounds, num_restarts=num_restarts, - raw_samples=raw_samples, - q=q, + raw_samples=n_intial_start_points, + q=n_candidates_required, fixed_features_list=fixed_categoricals, # type: ignore **acq_options, ) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index a89a15aa..a08578d6 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,6 +1,6 @@ from __future__ import annotations -import random +import math from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping import torch @@ -10,38 +10,25 @@ ) from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.acquisition_functions import ( - DecayingPriorWeightedAcquisition, +from 
neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( + PiboAcquisition, ) from neps.optimizers.bayesian_optimization.models.gp import ( default_single_obj_gp, optimize_acq, ) -from neps.search_spaces import ( - CategoricalParameter, - ConstantParameter, - FloatParameter, - IntegerParameter, - SearchSpace, -) -from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN +from neps.optimizers.initial_design import Sobol from neps.search_spaces.encoding import DataEncoder if TYPE_CHECKING: from botorch.models.model import Model + from neps.search_spaces import ( + SearchSpace, + ) from neps.search_spaces.encoding import DataPack from neps.state import BudgetInfo, Trial -# TODO(eddiebergman): Why not just include in the definition of the parameters. -CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) -CUSTOM_FLOAT_CONFIDENCE_SCORES.update({"ultra": 0.05}) - -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES = dict( - CategoricalParameter.DEFAULT_CONFIDENCE_SCORES -) -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES.update({"ultra": 8}) - class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" @@ -50,18 +37,9 @@ def __init__( self, pipeline_space: SearchSpace, *, - initial_design_size: int = 10, + initial_design_size: int | None = None, surrogate_model: Literal["gp"] | Callable[[DataPack, torch.Tensor], Model] = "gp", - log_prior_weighted: bool = False, - random_interleave_prob: float = 0.0, - patience: int = 100, - budget: None | int | float = None, - ignore_errors: bool = False, - loss_value_on_error: None | float = None, - cost_value_on_error: None | float = None, - logger=None, - disable_priors: bool = False, - prior_confidence: Literal["low", "medium", "high"] | None = None, + use_priors: bool = False, sample_default_first: bool = False, device: torch.device | None = None, **kwargs: Any, # TODO: Remove @@ -70,95 +48,49 @@ def __init__( Args: pipeline_space: Space in which to search - initial_design_size: Number of 'x' samples that need to be evaluated before - selecting a sample using a strategy instead of randomly. + initial_design_size: Number of samples used before using the surrogate model. + If None, it will take `int(log(N) ** 2)` samples where `N` is the number + of parameters in the search space. surrogate_model: Surrogate model - acquisition: Acquisition strategy - log_prior_weighted: if to use log for prior - acquisition_sampler: Acquisition function fetching strategy - random_interleave_prob: Frequency at which random configurations are sampled - instead of configurations from the acquisition strategy. - patience: How many times we try something that fails before giving up. - budget: Maximum budget - ignore_errors: Ignore hyperparameter settings that threw an error and do not - raise an error. Error configs still count towards max_evaluations_total. - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error during bayesian optimization and will use given loss - value instead. default: None - cost_value_on_error: Setting this and loss_value_on_error to any float will - supress any error during bayesian optimization and will use given cost - value instead. 
default: None - logger: logger object, or None to use the neps logger - disable_priors: allows to choose between BO and piBO regardless the search - space definition - sample_default_first: if True and a default prior exists, the first sampel is - the default configuration + use_priors: Whether to use priors set on the hyperparameters during search. Raises: - ValueError: if patience < 1 ValueError: if initial_design_size < 1 - ValueError: if random_interleave_prob is not between 0.0 and 1.0 ValueError: if no kernel is provided """ - if disable_priors: - pipeline_space.has_prior = False - self.prior_confidence = None - else: - self.prior_confidence = prior_confidence - - super().__init__( - pipeline_space=pipeline_space, - patience=patience, - logger=logger, - budget=budget, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - ) - - if initial_design_size < 1: + if initial_design_size is None: + N = len(pipeline_space.hyperparameters) + initial_design_size = int(max(1, math.log(N) ** 2)) + elif initial_design_size < 1: raise ValueError( "BayesianOptimization needs initial_design_size to be at least 1" ) - if not 0 <= random_interleave_prob <= 1: - raise ValueError("random_interleave_prob should be between 0.0 and 1.0") - self._initial_design_size = initial_design_size - self._random_interleave_prob = random_interleave_prob - self._num_error_evaluations: int = 0 + super().__init__(pipeline_space=pipeline_space) + + self.use_priors = use_priors + + # TODO: This needs to be moved to the search space class, however to not break + # the current prior based APIs, we will create this manually here + if use_priors: + self._prior_confidences = {} + self.device = device self.sample_default_first = sample_default_first - self.encoder: DataEncoder | None = None + self.n_initial_design = initial_design_size if surrogate_model == "gp": self._get_fitted_model = default_single_obj_gp else: self._get_fitted_model = surrogate_model - if self.pipeline_space.has_prior: - self.acquisition = DecayingPriorWeightedAcquisition( - self.acquisition, log=log_prior_weighted - ) - - if self.pipeline_space.has_prior: - for k, v in self.pipeline_space.items(): - if v.is_fidelity or isinstance(v, ConstantParameter): - continue - elif isinstance(v, (FloatParameter, IntegerParameter)): - confidence = CUSTOM_FLOAT_CONFIDENCE_SCORES[self.prior_confidence] - self.pipeline_space[k].default_confidence_score = confidence - elif isinstance(v, CategoricalParameter): - confidence = CUSTOM_CATEGORICAL_CONFIDENCE_SCORES[ - self.prior_confidence - ] - self.pipeline_space[k].default_confidence_score = confidence - - self._cached_sobol_configs: list[dict[str, Any]] | None = None + self.encoder_: DataEncoder | None = None + self.initial_design_: list[dict[str, Any]] | None = None def ask( self, trials: Mapping[str, Trial], - budget_info: BudgetInfo | None, + budget_info: BudgetInfo, optimizer_state: dict[str, Any], ) -> tuple[SampledConfig, dict[str, Any]]: # TODO: Lift this into runtime, let the @@ -174,96 +106,91 @@ def ask( dtype=torch.float64, ) # type: ignore - # We only do single objective for now but may as well include this for when we have MO + # We only do single objective for now but may as well include this + # for when we have MO if y.ndim == 1: y = y.unsqueeze(1) pending = [t.config for t in trials.values() if t.state.pending()] - if self.encoder is None: - self.encoder = DataEncoder.default_encoder( + if self.encoder_ is None: + self.encoder_ = 
DataEncoder.default_encoder( self.pipeline_space, include_fidelities=False, ) space = self.pipeline_space - if len(trials) == 0 and self.sample_default_first and space.has_prior: - config = space.sample_default_configuration( - patience=self.patience, ignore_fidelity=False - ).hp_values() - - elif len(trials) <= self._initial_design_size: - if self._cached_sobol_configs is None: - assert self.encoder.tensors is not None - ndim = len(self.encoder.tensors.transformers) - sobol = torch.quasirandom.SobolEngine( - dimension=ndim, - scramble=True, - seed=5, - ) + if self.initial_design_ is None: + size = self.n_initial_design + self.initial_design_ = [] - # TODO: Need a better encapsulation of this - x = sobol.draw(self._initial_design_size * ndim, dtype=torch.float64) - hp_normalized_values = [] - for i, (_k, v) in enumerate(self.encoder.tensors.transformers.items()): - tensor = v.domain.cast(x[:, i], frm=UNIT_FLOAT_DOMAIN) - tensor = tensor.unsqueeze(1) if tensor.ndim == 1 else tensor - hp_normalized_values.append(tensor) + if self.sample_default_first: + config = space.sample_default_configuration() + self.initial_design_.append(config.hp_values()) - tensor = torch.cat(hp_normalized_values, dim=1) - uniq = torch.unique(tensor, dim=0) - self._cached_sobol_configs = self.encoder.tensors.decode_dicts(uniq) + assert self.encoder_.tensors is not None + sobol = Sobol(seed=0, encoder=self.encoder_, allow_undersampling=True) + sobol_configs = sobol.sample(size - len(self.initial_design_)) + self.initial_design_.extend(sobol_configs) + else: + self.initial_design_ = [] - if len(trials) <= len(self._cached_sobol_configs): - config = self._cached_sobol_configs[len(trials) - 1] - else: - # The case where sobol sampling couldn't generate enough unique configs - config = space.sample( - patience=self.patience, ignore_fidelity=False, user_priors=False - ).hp_values() + config_id = str(len(trials) + 1) + if len(trials) < len(self.initial_design_): + config = self.initial_design_[len(trials)] + return ( + SampledConfig(id=config_id, config=config, previous_config_id=None), + optimizer_state, + ) - elif random.random() < self._random_interleave_prob: - config = space.sample( - patience=self.patience, user_priors=False, ignore_fidelity=False - ).hp_values() + assert self.encoder_ is not None + x = self.encoder_.encode(x_configs, device=self.device) + if any(pending): + x_pending = self.encoder_.encode(pending, device=self.device) + x_pending = x_pending.tensor + assert x_pending is not None else: - assert self.encoder is not None - x = self.encoder.encode(x_configs, device=self.device) - if any(pending): - x_pending = self.encoder.encode(pending, device=self.device) - x_pending = x_pending.tensor - assert x_pending is not None - else: - x_pending = None + x_pending = None - model = self._get_fitted_model(x, y) + model = self._get_fitted_model(x, y) - N_CANDIDATES_REQUIRED = 1 - N_INITIAL_RANDOM_SAMPLES = 512 - N_RESTARTS = 20 - - candidates, _eis = optimize_acq( - # TODO: We should evaluate whether LogNoisyEI is better than LogEI - acq_fn=qLogExpectedImprovement( - model, - best_f=y.min(), - X_pending=x_pending, - # Unfortunatly, there's no option to indicate that we minimize - # the AcqFunction so we need to do some kind of transformation. 
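On the minimization workaround referenced in the comments here: BoTorch acquisition functions assume maximization, so minimizing a loss is commonly expressed by negating the outcomes through a `LinearMCObjective`. A minimal sketch of that pattern follows, with placeholder tensors and an unfitted model for brevity; in this sketch `best_f` is given on the transformed (negated) scale.

import torch
from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement
from botorch.models import SingleTaskGP

train_x = torch.rand(20, 3, dtype=torch.float64)
train_y = torch.rand(20, 1, dtype=torch.float64)        # losses, lower is better
model = SingleTaskGP(train_X=train_x, train_Y=train_y)  # fitting omitted for brevity

acq = qLogExpectedImprovement(
    model,
    best_f=(-train_y).max(),  # incumbent under objective(y) = -y
    objective=LinearMCObjective(weights=torch.tensor([-1.0])),
)
value = acq(torch.rand(1, 1, 3, dtype=torch.float64))   # one candidate batch of q=1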
- # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 - objective=LinearMCObjective(weights=torch.tensor([-1.0])), - ), - encoder=self.encoder, - q=N_CANDIDATES_REQUIRED, - raw_samples=N_INITIAL_RANDOM_SAMPLES, - num_restarts=N_RESTARTS, - acq_options={}, # options to underlying optim function of botorch - ) - config = self.encoder.decode_dicts(candidates)[0] + acq = qLogExpectedImprovement( + model, + best_f=y.min(), + X_pending=x_pending, + objective=LinearMCObjective(weights=torch.tensor([-1.0])), + ) - config_id = str(len(trials) + 1) - return SampledConfig( - id=config_id, - config=config, - previous_config_id=None, - ), optimizer_state + if self.use_priors: + # From the PIBO paper (Section 4.1) + # https://arxiv.org/pdf/2204.11051 + if budget_info.max_evaluations is not None: + beta = budget_info.max_evaluations / 10 + n = budget_info.used_evaluations + elif budget_info.max_cost_budget is not None: + # This might not work well if cost number is high + # early on, but it will start to normalize. + beta = budget_info.max_cost_budget / 10 + n = budget_info.used_cost_budget + + acq = PiboAcquisition(acq, n=n, beta=beta) + + candidates, _eis = optimize_acq( + # TODO: We should evaluate whether LogNoisyEI is better than LogEI + acq_fn=qLogExpectedImprovement( + model, + best_f=y.min(), + X_pending=x_pending, + # Unfortunatly, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 + objective=LinearMCObjective(weights=torch.tensor([-1.0])), + ), + encoder=self.encoder_, + acq_options={}, # options to underlying optim function of botorch + ) + config = self.encoder_.decode_dicts(candidates)[0] + return ( + SampledConfig(id=config_id, config=config, previous_config_id=None), + optimizer_state, + ) diff --git a/neps/runtime.py b/neps/runtime.py index 7d1cd60f..b102b153 100644 --- a/neps/runtime.py +++ b/neps/runtime.py @@ -513,7 +513,12 @@ def _launch_runtime( # noqa: PLR0913 optimizer_info=OptimizerInfo(optimizer_info), optimizer_state=OptimizationState( budget=( - BudgetInfo(max_cost_budget=max_cost_total, used_cost_budget=0) + BudgetInfo( + max_cost_budget=max_cost_total, + used_cost_budget=0, + max_evaluations=max_evaluations_total, + used_evaluations=0, + ) if max_cost_total is not None else None ), diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index e3e297de..2081bf33 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -1,31 +1,58 @@ +"""A class representing a domain, a range for a value + properties. + +Some properties include: + +* The lower and upper bounds of the domain. +* Whether the domain is a log domain. +* Whether the domain is float/int. +* The midpoint of the domain. +* Whether the domain is split into bins. + +With that, the primary method of a domain is to be able to cast +values from one to domain to another, +e.g. `values_a = domain_a.cast(values_b, frm=domain_b)`. + +This can be used to convert float samples to integers, integers +to log space, etc. + +The core method to do so is to be able to cast `to_unit` which takes +values to a unit interval [0, 1], and then to be able to cast values in [0, 1] +to the new domain with `from_unit`. + +There are some shortcuts implemented in `cast`, such as skipping going through +the unit interval if the domains are the same, as no transformation is needed. 
+ +The primary methods for creating a domain are + +* `Domain.float(l, u, ...)` - Used for modelling float ranges +* `Domain.int(l, u, ...)` - Used for modelling integer ranges +* `Domain.indices(n)` - Primarly used to model categorical choices + +If you have a tensor of values, where each column corresponds to a different domain, +you can take a look at `Domain.cast_many` to cast all the values in one go. + +If you need a unit-interval domain, please use the `Domain.unit_float()` or +`UNIT_FLOAT_DOMAIN` constant. +""" + # TODO: Could theoretically implement dtype,device,out for all methods here but # would need to be careful not to accidentally send to and from GPU. from __future__ import annotations import math from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Generic, TypeVar -from typing_extensions import TypeAlias +from typing import Generic, Sequence, TypeVar import torch from torch import Tensor -if TYPE_CHECKING: - from neps.search_spaces.distributions.truncnorm import TruncNormDistribution - from neps.search_spaces.distributions.uniform_float import ( - UniformFloatDistribution, - ) - from neps.search_spaces.distributions.uniform_int import UniformIntDistribution - from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution - - Number = int | float V = TypeVar("V", int, float) V2 = TypeVar("V2", int, float) @dataclass(frozen=True) -class NumberDomain(Generic[V]): +class Domain(Generic[V]): lower: V upper: V round: bool @@ -74,8 +101,8 @@ def float( *, log: bool = False, bins: int | None = None, - ) -> NumberDomain[float]: - return NumberDomain( + ) -> Domain[float]: + return Domain( lower=float(lower), upper=float(upper), log_bounds=(math.log(lower), math.log(upper)) if log else None, @@ -91,8 +118,8 @@ def int( *, log: bool = False, bins: int | None = None, - ) -> NumberDomain[int]: - return NumberDomain( + ) -> Domain[int]: + return Domain( lower=int(round(lower)), upper=int(round(upper)), log_bounds=(math.log(lower), math.log(upper)) if log else None, @@ -101,7 +128,7 @@ def int( ) @classmethod - def indices(cls, n: int) -> NumberDomain[int]: + def indices(cls, n: int) -> Domain[int]: """Create a domain for a range of indices. Like range based functions this domain is inclusive of the lower bound @@ -109,7 +136,7 @@ def indices(cls, n: int) -> NumberDomain[int]: Use this method to create a domain for indices """ - return NumberDomain.int(0, n - 1) + return Domain.int(0, n - 1) def to_unit(self, x: Tensor) -> Tensor: if self.is_unit_float: @@ -187,109 +214,64 @@ def cast( lift = self.from_unit(norm) return lift # noqa: RET504 - def uniform_distribution(self) -> UniformFloatDistribution | UniformIntDistribution: - from neps.search_spaces.distributions import ( - UNIT_UNIFORM, - UniformFloatDistribution, - UniformIntDistribution, - ) - - # (Log Lift) - sample on it's log domain - if self.log_bounds is not None: - return UniformFloatDistribution.new(*self.log_bounds) - - # (Same Domain) - Just sample integers - if self.dtype == torch.int64 and self.bins is None: - return UniformIntDistribution.new(self.lower, self.upper) - - # NOTE: There's a possibility where you could use an integer distribution for - # binned domains, however the cost of sampling integers and casting is likely - # higher than just casting from normalized domain. 
Would need to verify this - # In any case, Normalized Uniform Float is a safe choice - - # (From Normalized) - return UNIT_UNIFORM - - def unit_uniform_distribution(self) -> UniformFloatDistribution: - from neps.search_spaces.distributions import UNIT_UNIFORM - - return UNIT_UNIFORM - - def truncnorm_distribution( - self, - center: Number, - *, - confidence: float | None = None, - std: float | None = None, - ) -> TruncNormDistribution: - from neps.search_spaces.distributions import TruncNormDistribution + @classmethod + def unit_float(cls) -> Domain[float]: + return UNIT_FLOAT_DOMAIN - # If you need a unit one, create this and then call `normalize()` on it. - if std is None and confidence is None: + @classmethod + def cast_many( + cls, x: Tensor, frm: Domain | Sequence[Domain], to: Domain | Sequence[Domain] + ) -> Tensor: + """Cast a tensor of mixed domains to a new set of mixed domains. + + Args: + x: Tensor of shape (n_samples, n_dims) with each dim `i` corresponding + to the domain `frm[i]`. + frm: List of domains to cast from. If list, must be length of `n_dims`, + otherwise we assume the single domain provided is the one to be used + across all dimensions. + to: List of domains to cast to. If list, must be length as `n_dims`, + otherwise we assume the single domain provided is the one to be used + across all dimensions. + + Returns: + Tensor of shape (n_samples, n_dims) with each dim `i` transformed + from the domain `frm[i]` to the domain `to[i]`. + """ + if x.ndim == 1: raise ValueError( - "Must specify either `std` in (lower, upper) or `confidence` in (0, 1)" + "Expected a 2D tensor of shape (n_samples, n_dims), got a 1D tensor." ) - if std is None: - assert 0 <= confidence <= 1 # type: ignore - _std = float(1 - confidence) # type: ignore - _is_normalized = True - else: - _std = float(std) - _is_normalized = False - - # (Log Lift) - sample on it's log domain - if self.log_bounds is not None: - return TruncNormDistribution.new( - lower=self.log_bounds[0], - center=math.log(center), - upper=self.log_bounds[1], - std=_std, - std_is_normalized=_is_normalized, - ) - - # NOTE: There's a possibility where you could use an integer distribution for - # binned domains, however the cost of sampling integers and casting is likely - # higher than just casting from normalized domain. Would need to verify this - # In any case, Normalized Uniform Float is a safe choice - - # (From Normalized) - truncnorm = TruncNormDistribution.new( - lower=self.lower, - center=math.log(center), - upper=self.upper, - std=_std, - std_is_normalized=_is_normalized, - ) - return truncnorm.normalize() - - def weighted_indices_distribution( - self, center_index: int, *, confidence: float - ) -> WeightedIntsDistribution: - from neps.search_spaces.distributions import WeightedIntsDistribution - - if self.cardinality is None: + if isinstance(frm, Sequence) and len(frm) != x.shape[1]: raise ValueError( - "Cannot create a weighted distribution for a continuous domain!" + "The number of domains in `frm` must match the number of tensors" + " if provided as a list." + f" Expected {x.shape[1]}, got {len(frm)}." ) - if not isinstance(center_index, int): + + if isinstance(to, Sequence) and len(to) != x.shape[1]: raise ValueError( - f"Center index must be an integer of type {self.dtype} to" - " create a weighted distribution!" + "The number of domains in `to` must match the number of tensors" + " if provided as a list." + f" Expected {x.shape[1]}, got {len(to)}." 
) - assert 0 <= confidence <= 1 - return WeightedIntsDistribution.with_favoured( - n=self.cardinality, - favoured=int(round(center_index)), - confidence=confidence, - ) + # If both are not a list, we can just cast the whole tensor + if not isinstance(frm, Sequence) and not isinstance(to, Sequence): + return to.cast(x, frm=frm) - @classmethod - def unit_float(cls) -> NumberDomain[float]: - return UNIT_FLOAT_DOMAIN + # Otherwise, we need to go column by column + if isinstance(frm, Domain): + frm = [frm] * x.shape[1] + if isinstance(to, Domain): + to = [to] * x.shape[1] + + buffer = torch.empty_like(x) + for i, (f, t) in enumerate(zip(frm, to)): + buffer[:, i] = t.cast(x[:, i], frm=f) + return buffer -UNIT_FLOAT_DOMAIN = NumberDomain.float(0.0, 1.0) -Domain: TypeAlias = NumberDomain +UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 3d9d2928..b4035297 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -25,7 +25,6 @@ from neps.search_spaces.domain import ( UNIT_FLOAT_DOMAIN, Domain, - NumberDomain, ) from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter @@ -48,7 +47,6 @@ def decode(self, x: T) -> list[Any]: ... class TensorTransformer(Transformer[torch.Tensor], Protocol): domain: Domain - output_cols: int def encode( self, @@ -64,15 +62,13 @@ def encode( class CategoricalToIntegerTransformer(TensorTransformer): choices: Sequence[Any] - domain: NumberDomain = field(init=False) - output_cols: int = field(init=False) + domain: Domain = field(init=False) _lookup: dict[Any, int] | None = field(init=False) def __post_init__(self): assert len(self.choices) > 0 - self.domain = NumberDomain.indices(len(self.choices)) - self.output_cols = 1 + self.domain = Domain.indices(len(self.choices)) self._lookup = None if len(self.choices) > 3: try: @@ -114,14 +110,12 @@ def decode(self, x: torch.Tensor) -> list[Any]: # and `-0.5` as lower bound with `0.5` as upper bound. 
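For reference, a small usage sketch of the `Domain.cast_many` helper introduced above; the column domains and sample values here are purely hypothetical:

    import torch
    from neps.search_spaces.domain import Domain, UNIT_FLOAT_DOMAIN

    # A batch of unit-cube samples, e.g. as drawn from a Sobol sequence.
    unit_x = torch.rand(8, 3, dtype=torch.float64)

    # Per-column targets: a log-scaled float, an integer range and categorical indices.
    targets = [Domain.float(1e-5, 1e-1, log=True), Domain.int(1, 64), Domain.indices(3)]

    # `frm` may be a single domain applied to every column, while `to` lists one per column.
    x = Domain.cast_many(unit_x, frm=UNIT_FLOAT_DOMAIN, to=targets)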
@dataclass class MinMaxNormalizer(TensorTransformer, Generic[V]): - original_domain: NumberDomain[V] + original_domain: Domain[V] - domain: NumberDomain[float] = field(init=False) - output_cols: int = field(init=False) + domain: Domain[float] = field(init=False) def __post_init__(self): self.domain = UNIT_FLOAT_DOMAIN - self.output_cols = 1 @override def encode( @@ -198,22 +192,30 @@ def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: @dataclass class TensorEncoder: transformers: dict[str, TensorTransformer] - column_lookup: dict[str, tuple[int, int]] = field(init=False) + column_lookup: dict[str, int] = field(init=False) + n_numerical: int = field(init=False) + n_categorical: int = field(init=False) def __post_init__(self): - transformers = sorted( - self.transformers.items(), key=lambda t: (t[1].output_cols, t[0]) - ) + transformers = sorted(self.transformers.items(), key=lambda t: t[0]) self.transformers = dict(transformers) - self.column_lookup: dict[str, tuple[int, int]] = {} - offset = 0 - for name, transformer in self.transformers.items(): - self.column_lookup[name] = (offset, offset + transformer.output_cols) - offset += transformer.output_cols + self.column_lookup: dict[str, int] = {} + n_numerical = 0 + n_categorical = 0 + for i, (name, transformer) in enumerate(self.transformers.items()): + self.column_lookup[name] = i + if isinstance(transformer, CategoricalToIntegerTransformer): + n_categorical += 1 + else: + n_numerical += 1 + + self.n_numerical = n_numerical + self.n_categorical = n_categorical def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: if isinstance(hp, str): - return x[:, slice(*self.column_lookup[hp])] + return x[:, self.column_lookup[hp]] + cols = torch.concatenate([torch.arange(*self.column_lookup[h]) for h in hp]) return x[:, cols] @@ -223,13 +225,12 @@ def encode( *, device: torch.device | None = None, ) -> torch.Tensor: - width = sum(t.output_cols for t in self.transformers.values()) + width = len(self.transformers) buffer = torch.empty((len(x), width), dtype=torch.float64, device=device) for hp_name, transformer in self.transformers.items(): values = [conf[hp_name] for conf in x] lookup = self.column_lookup[hp_name] - lookup = lookup[0] if lookup[1] - lookup[0] == 1 else slice(*lookup) # Encode directly into buffer transformer.encode( @@ -245,21 +246,39 @@ def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): lookup = self.column_lookup[hp_name] - if lookup[1] == lookup[0] + 1: - tensor = x[:, lookup[0]] - else: - tensor = x[:, slice(*lookup)] - + tensor = x[:, lookup] values[hp_name] = transformer.decode(tensor) keys = list(values.keys()) return [dict(zip(keys, vals)) for vals in zip(*values.values())] + def from_unit_tensor( + self, + x: torch.Tensor, + device: torch.device | None = None, + ) -> torch.Tensor: + buffer = torch.empty_like(x, dtype=torch.float64, device=device) + + for i, transformer in enumerate(self.transformers.values()): + buffer[:, i] = transformer.domain.cast(x[:, i], frm=UNIT_FLOAT_DOMAIN) + + return buffer + @dataclass class DataEncoder: tensors: TensorEncoder | None = None graphs: GraphEncoder | None = None + device: torch.device = field(default_factory=lambda: torch.device("cpu")) + + n_numerical: int = field(init=False) + n_categorical: int = field(init=False) + n_graphs: int = field(init=False) + + def __post_init__(self): + self.n_numerical = 0 if self.tensors is None else 
self.tensors.n_numerical + self.n_categorical = 0 if self.tensors is None else self.tensors.n_categorical + self.n_graphs = 0 if self.graphs is None else len(self.graphs.transformers) def encode( self, diff --git a/neps/search_spaces/hyperparameters/float.py b/neps/search_spaces/hyperparameters/float.py index 6086e3b7..f8808bfe 100644 --- a/neps/search_spaces/hyperparameters/float.py +++ b/neps/search_spaces/hyperparameters/float.py @@ -8,7 +8,7 @@ import numpy as np -from neps.search_spaces.domain import NumberDomain +from neps.search_spaces.domain import Domain from neps.search_spaces.hyperparameters.numerical import NumericalParameter if TYPE_CHECKING: @@ -71,7 +71,7 @@ def __init__( default=float(default) if default is not None else None, default_confidence=default_confidence, is_fidelity=is_fidelity, - domain=NumberDomain.float(lower, upper, log=log), + domain=Domain.float(lower, upper, log=log), ) @override diff --git a/neps/search_spaces/hyperparameters/integer.py b/neps/search_spaces/hyperparameters/integer.py index da3bbd71..b481ffc1 100644 --- a/neps/search_spaces/hyperparameters/integer.py +++ b/neps/search_spaces/hyperparameters/integer.py @@ -7,7 +7,7 @@ import numpy as np -from neps.search_spaces.domain import NumberDomain +from neps.search_spaces.domain import Domain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.numerical import NumericalParameter @@ -76,7 +76,7 @@ def __init__( is_fidelity=is_fidelity, default=int(np.rint(default)) if default is not None else None, default_confidence=default_confidence, - domain=NumberDomain.int(lower, upper, log=log), + domain=Domain.int(lower, upper, log=log), ) # We subtract/add 0.499999 from lower/upper bounds respectively, such that diff --git a/neps/search_spaces/hyperparameters/numerical.py b/neps/search_spaces/hyperparameters/numerical.py index f00b590c..8cca8309 100644 --- a/neps/search_spaces/hyperparameters/numerical.py +++ b/neps/search_spaces/hyperparameters/numerical.py @@ -32,7 +32,7 @@ from neps.search_spaces.parameter import MutatableParameter, ParameterWithPrior if TYPE_CHECKING: - from neps.search_spaces.domain import NumberDomain + from neps.search_spaces.domain import Domain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.utils.types import TruncNorm @@ -82,7 +82,7 @@ def __init__( log: bool = False, default: T | None, is_fidelity: bool, - domain: NumberDomain[T], + domain: Domain[T], default_confidence: Literal["low", "medium", "high"] = "low", ): """Initialize the numerical hyperparameter. diff --git a/neps/state/neps_state.py b/neps/state/neps_state.py index 8afaee62..163679d8 100644 --- a/neps/state/neps_state.py +++ b/neps/state/neps_state.py @@ -32,6 +32,75 @@ Loc = TypeVar("Loc") T = TypeVar("T") +def sample_trial( + neps_state, + optimizer: BaseOptimizer, + *, + worker_id: str, + _sample_hooks: list[Callable] | None = None, +) -> Trial: + """Sample a new trial from the optimizer. + + Args: + optimizer: The optimizer to sample the trial from. + worker_id: The worker that is sampling the trial. + _sample_hooks: A list of hooks to apply to the optimizer before sampling. + + Returns: + The new trial. 
+ """ + with neps_state._optimizer_state.acquire() as ( + opt_state, + put_opt, + ), neps_state._seed_state.acquire() as (seed_state, put_seed_state): + trials: dict[Trial.ID, Trial] = {} + for trial_id, shared_trial in neps_state._trials.all().items(): + trial = shared_trial.synced() + trials[trial_id] = trial + + seed_state.set_as_global_seed_state() + + # TODO: Not sure if any existing pre_load hooks required + # it to be done after `load_results`... I hope not. + if _sample_hooks is not None: + for hook in _sample_hooks: + optimizer = hook(optimizer) + + # NOTE: We don't want optimizers mutating this before serialization + budget = opt_state.budget.clone() if opt_state.budget is not None else None + sampled_config, new_opt_state = optimizer.ask( + trials=trials, + budget_info=budget, + optimizer_state=opt_state.shared_state, + ) + + if sampled_config.previous_config_id is not None: + previous_trial = trials.get(sampled_config.previous_config_id) + if previous_trial is None: + raise ValueError( + f"Previous trial '{sampled_config.previous_config_id}' not found." + ) + previous_trial_location = previous_trial.metadata.location + else: + previous_trial_location = None + + trial = Trial.new( + trial_id=sampled_config.id, + location="", # HACK: This will be set by the `TrialRepo` + config=sampled_config.config, + previous_trial=sampled_config.previous_config_id, + previous_trial_location=previous_trial_location, + time_sampled=time.time(), + worker_id=worker_id, + ) + shared_trial = neps_state._trials.put_new(trial) + seed_state.recapture() + put_seed_state(seed_state) + put_opt( + OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) + ) + + return trial @dataclass class NePSState(Generic[Loc]): @@ -71,75 +140,10 @@ def get_trials_by_ids(self, trial_ids: list[str], /) -> dict[str, Trial | None]: for _id, shared_trial in self._trials.get_by_ids(trial_ids).items() } - def sample_trial( - self, - optimizer: BaseOptimizer, - *, - worker_id: str, - _sample_hooks: list[Callable] | None = None, - ) -> Trial: - """Sample a new trial from the optimizer. - - Args: - optimizer: The optimizer to sample the trial from. - worker_id: The worker that is sampling the trial. - _sample_hooks: A list of hooks to apply to the optimizer before sampling. + def get_optimizer_instance(self) -> BaseOptimizer: + """Get the optimizer instance.""" + raise NotImplementedError - Returns: - The new trial. - """ - with self._optimizer_state.acquire() as ( - opt_state, - put_opt, - ), self._seed_state.acquire() as (seed_state, put_seed_state): - trials: dict[Trial.ID, Trial] = {} - for trial_id, shared_trial in self._trials.all().items(): - trial = shared_trial.synced() - trials[trial_id] = trial - - seed_state.set_as_global_seed_state() - - # TODO: Not sure if any existing pre_load hooks required - # it to be done after `load_results`... I hope not. - if _sample_hooks is not None: - for hook in _sample_hooks: - optimizer = hook(optimizer) - - # NOTE: We don't want optimizers mutating this before serialization - budget = opt_state.budget.clone() if opt_state.budget is not None else None - sampled_config, new_opt_state = optimizer.ask( - trials=trials, - budget_info=budget, - optimizer_state=opt_state.shared_state, - ) - - if sampled_config.previous_config_id is not None: - previous_trial = trials.get(sampled_config.previous_config_id) - if previous_trial is None: - raise ValueError( - f"Previous trial '{sampled_config.previous_config_id}' not found." 
- ) - previous_trial_location = previous_trial.metadata.location - else: - previous_trial_location = None - - trial = Trial.new( - trial_id=sampled_config.id, - location="", # HACK: This will be set by the `TrialRepo` - config=sampled_config.config, - previous_trial=sampled_config.previous_config_id, - previous_trial_location=previous_trial_location, - time_sampled=time.time(), - worker_id=worker_id, - ) - shared_trial = self._trials.put_new(trial) - seed_state.recapture() - put_seed_state(seed_state) - put_opt( - OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) - ) - - return trial def report_trial_evaluation( self, diff --git a/neps/state/optimizer.py b/neps/state/optimizer.py index bd8cbc2e..11bd3eb6 100644 --- a/neps/state/optimizer.py +++ b/neps/state/optimizer.py @@ -10,18 +10,24 @@ class BudgetInfo: """Information about the budget of an optimizer.""" - max_cost_budget: float - used_cost_budget: float + max_cost_budget: float | None = None + used_cost_budget: float = 0.0 + max_evaluations: int | None = None + used_evaluations: int = 0 @property - def remaining_cost_budget(self) -> float: + def remaining_cost_budget(self) -> float | None: """The remaining budget.""" + if self.max_cost_budget is None: + return None return self.max_cost_budget - self.used_cost_budget def clone(self) -> BudgetInfo: return BudgetInfo( max_cost_budget=self.max_cost_budget, used_cost_budget=self.used_cost_budget, + max_evaluations=self.max_evaluations, + used_evaluations=self.used_evaluations, ) diff --git a/pyproject.toml b/pyproject.toml index b5be06c2..27e49fa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -210,6 +210,7 @@ ignore = [ "PLR2004", # No magic numbers inline "N817", # CamelCase import as (ignore for ConfigSpace) "NPY002", # Replace legacy `np.random.choice` call with `np.random.Generator` + "N803", # Arguments should start with a lower case letter. ] From 6db710ecd2ca1d62749e13815c90814be34c76b3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 18:11:43 +0200 Subject: [PATCH 13/63] refactor: Add in the priors --- neps/distributions.py | 230 +++++++++++++++++++ neps/optimizers/initial_design.py | 74 ++++++ neps/priors.py | 366 ++++++++++++++++++++++++++++++ 3 files changed, 670 insertions(+) create mode 100644 neps/distributions.py create mode 100644 neps/optimizers/initial_design.py create mode 100644 neps/priors.py diff --git a/neps/distributions.py b/neps/distributions.py new file mode 100644 index 00000000..2361e191 --- /dev/null +++ b/neps/distributions.py @@ -0,0 +1,230 @@ +"""Custom distributions for NEPS.""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from numbers import Number +from typing import TYPE_CHECKING, ClassVar, Mapping +from typing_extensions import override + +import torch +from torch.distributions import Distribution, constraints +from torch.distributions.utils import broadcast_all + +if TYPE_CHECKING: + from neps.search_spaces.architecture.cfg_variants.constrained_cfg import Constraint + from neps.search_spaces.domain import Domain + +CONST_SQRT_2 = math.sqrt(2) +CONST_INV_SQRT_2PI = 1 / math.sqrt(2 * math.pi) +CONST_INV_SQRT_2 = 1 / math.sqrt(2) +CONST_LOG_INV_SQRT_2PI = math.log(CONST_INV_SQRT_2PI) +CONST_LOG_SQRT_2PI_E = 0.5 * math.log(2 * math.pi * math.e) + +# from https://github.com/toshas/torch_truncnorm + + +class TruncatedStandardNormal(Distribution): + """Truncated Standard Normal distribution. 
+ + Source: https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + """ + + arg_constraints: ClassVar[Mapping[str, Constraint]] = { + "a": constraints.real, + "b": constraints.real, + } # type: ignore + has_rsample: ClassVar[bool] = True + eps: ClassVar[float] = 1e-6 + + def __init__( + self, + a: torch.Tensor, + b: torch.Tensor, + validate_args: bool | None = None, + device: torch.device | None = None, + ): + """Initialize a truncated standard normal distribution. + + Args: + a: Lower truncation bound. + b: Upper truncation bound. + validate_args: Whether to validate input. + device: Device to use. + """ + self.a, self.b = broadcast_all(a, b) + self.a = self.a.to(device) + self.b = self.b.to(device) + + if isinstance(a, Number) and isinstance(b, Number): + batch_shape = torch.Size() + else: + batch_shape = self.a.size() + + super().__init__(batch_shape, validate_args=validate_args) + + if self.a.dtype != self.b.dtype: + raise ValueError("Truncation bounds types are different") + + if any((self.a >= self.b).view(-1).tolist()): + raise ValueError("Incorrect truncation range") + + eps = self.eps + self._dtype_min_gt_0 = eps + self._dtype_max_lt_1 = 1 - eps + self._little_phi_a = self._little_phi(self.a) + self._little_phi_b = self._little_phi(self.b) + self._big_phi_a = self._big_phi(self.a) + self._big_phi_b = self._big_phi(self.b) + self._Z = (self._big_phi_b - self._big_phi_a).clamp(eps, 1 - eps) + self._log_Z = self._Z.log() + little_phi_coeff_a = torch.nan_to_num(self.a, nan=math.nan) + little_phi_coeff_b = torch.nan_to_num(self.b, nan=math.nan) + self._lpbb_m_lpaa_d_Z = ( + self._little_phi_b * little_phi_coeff_b + - self._little_phi_a * little_phi_coeff_a + ) / self._Z + self._mean = -(self._little_phi_b - self._little_phi_a) / self._Z + self._variance = ( + 1 + - self._lpbb_m_lpaa_d_Z + - ((self._little_phi_b - self._little_phi_a) / self._Z) ** 2 + ) + self._entropy = CONST_LOG_SQRT_2PI_E + self._log_Z - 0.5 * self._lpbb_m_lpaa_d_Z + + @constraints.dependent_property + @override + def support(self) -> constraints._Interval: + return constraints.interval(self.a, self.b) + + @property + @override + def mean(self) -> torch.Tensor: + return self._mean + + @property + @override + def variance(self) -> torch.Tensor: + return self._variance + + @override + def entropy(self) -> torch.Tensor: + return self._entropy + + @staticmethod + def _little_phi(x: torch.Tensor) -> torch.Tensor: + return (-(x**2) * 0.5).exp() * CONST_INV_SQRT_2PI + + def _big_phi(self, x: torch.Tensor) -> torch.Tensor: + phi = 0.5 * (1 + (x * CONST_INV_SQRT_2).erf()) + return phi.clamp(self.eps, 1 - self.eps) + + @staticmethod + def _inv_big_phi(x: torch.Tensor) -> torch.Tensor: + return CONST_SQRT_2 * (2 * x - 1).erfinv() + + @override + def cdf(self, value: torch.Tensor) -> torch.Tensor: + if self._validate_args: + self._validate_sample(value) + return ((self._big_phi(value) - self._big_phi_a) / self._Z).clamp(0, 1) + + @override + def icdf(self, value: torch.Tensor) -> torch.Tensor: + y = self._big_phi_a + value * self._Z + y = y.clamp(self.eps, 1 - self.eps) + return self._inv_big_phi(y) + + @override + def log_prob(self, value: torch.Tensor) -> torch.Tensor: + if self._validate_args: + self._validate_sample(value) + return CONST_LOG_INV_SQRT_2PI - self._log_Z - (value**2) * 0.5 + + @override + def rsample(self, sample_shape: torch.Size | None = None) -> torch.Tensor: + if sample_shape is None: + sample_shape = torch.Size([]) + shape = self._extended_shape(sample_shape) + p = torch.empty(shape, 
device=self.a.device).uniform_( + self._dtype_min_gt_0, self._dtype_max_lt_1 + ) + return self.icdf(p) + + +class TruncatedNormal(TruncatedStandardNormal): + """Truncated Normal distribution. + + https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + """ + + def __init__( + self, + loc: float | torch.Tensor, + scale: float | torch.Tensor, + a: float | torch.Tensor, + b: float | torch.Tensor, + validate_args: bool | None = None, + device: torch.device | None = None, + ): + """Initialize a truncated standard normal distribution. + + Args: + loc: The mean of the distribution. + scale: The std of the distribution. + a: The lower bound of the distribution. + b: The upper bound of the distribution. + validate_args: Whether to validate input. + device: Device to use. + """ + scale = torch.as_tensor(scale, device=device) + scale = scale.clamp_min(self.eps) + + self.loc, self.scale, a, b = broadcast_all(loc, scale, a, b) + a = a.to(device) # type: ignore + b = b.to(device) # type: ignore + self._non_std_a = a + self._non_std_b = b + a = (a - self.loc) / self.scale + b = (b - self.loc) / self.scale + super().__init__(a, b, validate_args=validate_args) # type: ignore + self._log_scale = self.scale.log() + self._mean = self._mean * self.scale + self.loc + self._variance = self._variance * self.scale**2 + self._entropy += self._log_scale + + def _to_std_rv(self, value): + return (value - self.loc) / self.scale + + def _from_std_rv(self, value): + return value * self.scale + self.loc + + @override + def cdf(self, value): + return super().cdf(self._to_std_rv(value)) + + @override + def icdf(self, value): + sample = self._from_std_rv(super().icdf(value)) + + # clamp data but keep gradients + sample_clip = torch.stack( + [sample.detach(), self._non_std_a.detach().expand_as(sample)], 0 + ).max(0)[0] + sample_clip = torch.stack( + [sample_clip, self._non_std_b.detach().expand_as(sample)], 0 + ).min(0)[0] + sample.data.copy_(sample_clip) + return sample + + @override + def log_prob(self, value): + value = self._to_std_rv(value) + return super().log_prob(value) - self._log_scale + + +@dataclass +class DistributionOverDomain: + distribution: Distribution + domain: Domain diff --git a/neps/optimizers/initial_design.py b/neps/optimizers/initial_design.py new file mode 100644 index 00000000..05553ab1 --- /dev/null +++ b/neps/optimizers/initial_design.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Protocol + +import torch + +if TYPE_CHECKING: + from neps.search_spaces.encoding import DataEncoder + + +class InitialDesign(Protocol): + def sample(self, n: int) -> list[dict[str, Any]]: ... + + +@dataclass +class Sobol(InitialDesign): + seed: int + """The seed for the Sobol sequence.""" + + encoder: DataEncoder + """The encoding used to encode the samples.""" + + scramble: bool = True + """Whether to scramble the Sobol sequence.""" + + buffer_sample_multiplier: int = 2 + """How many samples to generate in the buffer before checking for uniqueness.""" + + allow_undersampling: bool = False + """If True, will allow undersampling if we can't generate `n` unique samples.""" + + def sample(self, n: int) -> list[dict[str, Any]]: + """Sample `n` points from the Sobol sequence. + + !!! warning + + If `self.allow_undersampling` is False, this method will raise a ValueError if + it cannot generate `n` unique samples. + + Args: + n: The number of points to sample. 
+ + Returns: + A list of `n` points sampled from the Sobol sequence. + """ + assert self.encoder.tensors is not None + + if self.encoder.has_graphs(): + # TODO: Won't work on graphs + raise NotImplementedError("Graphs are not yet supported.") + + if self.encoder.n_numerical == 0 and self.encoder.n_categorical > 0: + # TODO: We need to do something else if we have only categoricals + # as we are going to get a lot of duplicates + raise NotImplementedError("Only categorical variables are not yet supported.") + + ndim = self.encoder.n_numerical + self.encoder.n_categorical + sobol = torch.quasirandom.SobolEngine(dimension=ndim, scramble=True, seed=5) + + SAMPLE_SIZE = self.buffer_sample_multiplier * n + unit_x = sobol.draw(SAMPLE_SIZE, dtype=torch.float64) + + x = self.encoder.tensors.from_unit_tensor(unit_x) + + # NOTE: We have to check uniqueness after conversion from unit cube space + # as we could have multiple unit floats mapping to the same categories or integers + unique_x = torch.unique(x, dim=0) + if len(unique_x) < n and not self.allow_undersampling: + raise ValueError( + f"Could not generate {n} unique samples, got {len(unique_x)}\n{self=}" + ) + + return self.encoder.decode_dicts(unique_x[:n]) diff --git a/neps/priors.py b/neps/priors.py new file mode 100644 index 00000000..471718f0 --- /dev/null +++ b/neps/priors.py @@ -0,0 +1,366 @@ +"""Priors for search spaces. + +Loosely speaking, they are joint distributions over multiple independent +variables, i.e. each column of a tensor is assumed to be independent and +can be acted on independently. + +They are not a `torch.distributions.Distribution` subclass as methods like +`entropy` and `kl_divergence` are just more difficult to implement +(not impossible, just more difficult and not needed right now). + +See the class doc description of [`Prior`][neps.priors.Prior] for more details. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Container, Mapping, Protocol +from typing_extensions import override + +import torch + +from neps.distributions import DistributionOverDomain, TruncatedNormal +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain + +if TYPE_CHECKING: + from torch.distributions import Distribution + + +class Prior(Protocol): + """A protocol for priors over search spaces. + + At it's core, the two methods that need to be implemented are + `log_prob` and `sample`. The `log_prob` method should return the + log probability of a given tensor of samples under its distribution. + The `sample` method should return a tensor of samples from distribution. + + All values given to the `log_prob` and the ones returned from the + `sample` method are assumed to be in the value domain of the prior, + i.e. the [`.domains`][neps.priors.Prior] attribute. + + !!! warning + + The domain in which samples are actually drawn from not necessarily + need to match that of the value domain. For example, the + [`UniformPrior`][neps.priors.UniformPrior] class uses a unit uniform + distribution to sample from the unit interval before converting + samples to the value domain. + + **As a result, the `log_prob` and `prob` method may not give the same + values as you might expect for a distribution over the value domain.** + + For example, consider a value domain `[0, 1e9]`. You might expect + the `pdf` to be `1e-9` (1 / 1e9) for any given value inside the domain. 
+ However, since the `UniformPrior` samples from the unit interval, the `pdf` will + actually be `1` (1 / 1) for any value inside the domain. + """ + + domains: list[Domain] + """Domain of values which this prior acts upon. + + Each domain corresponds to the corresponding `ndim` in a tensor + (n_samples, ndim). + """ + + device: torch.device | None + """Device to place the tensors on.""" + + def log_prob(self, x: torch.Tensor) -> torch.Tensor: + """Compute the log probability of values in `x` under a prior. + + All columns of `x` are assumed to be independent, such that the + log probability of the entire tensor is the sum of the log + probabilities of each column. + + Args: + x: Tensor of shape (n_samples, n_dims) + In the case of a 1D tensor, the shape is assumed to be (n_dims,) + + Returns: + Tensor of shape (n_samples,) with the log probabilities of each. In the + case that only single dimensional tensor is passed, the returns value + is a scalar. + """ + ... + + def sample(self, n: int) -> torch.Tensor: + """Sample from the prior. + + Args: + n: Number of samples to draw. + + Returns: + Tensor of shape (n, n_dims) with the samples. + """ + ... + + def prob(self, x: torch.Tensor) -> torch.Tensor: + """Compute the probability of values in `x` under a prior. + + See [`log_prob()`][neps.priors.Prior.log_prob] for details on shapes. + """ + return torch.exp(self.log_prob(x)) + + @classmethod + def uniform( + cls, + domains: Mapping[str, Domain] | list[Domain], + *, + device: torch.device | None = None, + ) -> UniformPrior: + """Create a uniform prior for a given list of domains. + + Args: + domains: domains over which to have a uniform prior. + device: Device to place the tensors on. + """ + domains = domains if isinstance(domains, list) else list(domains.values()) + return UniformPrior(domains=domains, device=device) + + @classmethod + def make_centered( # noqa: C901 + cls, + domains: Mapping[str, Domain], + centers: Mapping[str, tuple[Any, float]], + *, + categoricals: Container[str] = (), + device: torch.device | None = None, + ) -> CenteredPrior: + """Create a prior for a given list of domains. + + Will use a `TruncatedNormal` distribution for all parameters, + except those contained within `categoricals`, which will + use a `Categorical` instead. If no center is given for a domain, + a uniform prior will be used. + + For non-categoricals, this will be interpreted as the mean and + std `(1 - confidence)` for a truncnorm. For categorical values, + the _center_ will contain a probability mass of `confidence` with + the remaining `(1 - confidence)` probability mass distributed uniformly + amongest the other choices. + + The order of the items in `domains` matters and should align + with any tensors that you will use to evaluate from the prior. + I.e. the first domain in `domains` will be the first column + of a tensor that this prior can be used on. + + Args: + domains: domains over which to have a centered prior. + centers: centers for the priors. Should be a mapping + from the domain name to the center value and confidence level. + If no center is given, a uniform prior will be used. + + !!! warning + + The values contained in centers should be contained within the + domain. All confidence levels should be within the `[0, 1]` range. + + categoricals: The names of the domains that are categorical and which + a `Categorical` distribution will be used, rather than a + `TruncatedNormal`. + + !!! warning + + Categoricals require that the corresponding domain has a + `.cardinality`, i.e. 
it is not a float/continuous domain. + + device: Device to place the tensors on. + + + Returns: + A prior for the search space. + """ + for name, (_, confidence) in centers.items(): + if not 0 <= confidence <= 1: + raise ValueError( + f"Confidence level for {name} must be in the range [0, 1]." + f" Got {confidence}." + ) + + for name in domains: + if name not in centers: + raise ValueError( + f"Center for {name} is missing. " + f"Please provide a center for all domains." + ) + + distributions: list[DistributionOverDomain] = [] + for name, domain in domains.items(): + center_confidence = centers.get(name) + if center_confidence is None: + dist = DistributionOverDomain( + distribution=torch.distributions.Uniform(domain.lower, domain.upper), + domain=domain, + ) + continue + + center, confidence = center_confidence + if name in categoricals: + if domain.cardinality is None: + raise ValueError( + f"{name} is not a finite domain and cannot be used as a" + " categorical. Please remove it from the categoricals list." + ) + + if not isinstance(center, int): + raise ValueError( + f"{name} is a categorical domain and should have an integer" + f" center. Got {center} of type {type(center)}." + ) + + remaining_weight = 1 - confidence + distributed_weight = remaining_weight / (domain.cardinality - 1) + weights = torch.full( + (domain.cardinality,), + distributed_weight, + device=device, + dtype=torch.float64, + ) + + weights[center] = confidence + + dist = DistributionOverDomain( + distribution=torch.distributions.Categorical(probs=weights), + domain=domain, + ) + distributions.append(dist) + continue + + # We place a truncnorm over a unitnorm + if domain.log_bounds is not None: + domain.to_unit(torch.tensor(center, device=device, dtype=torch.float64)) + torch.tensor(1 - confidence, device=device, dtype=torch.float64) + + dist = DistributionOverDomain( + distribution=TruncatedNormal( + loc=center, + scale=(1 - confidence), + a=domain.lower, + b=domain.upper, + device=device, + ), + domain=UNIT_FLOAT_DOMAIN, + ) + distributions.append(dist) + + return CenteredPrior( + domains=list(domains.values()), distributions=distributions, device=device + ) + + +@dataclass +class CenteredPrior(Prior): + """A prior that is centered around a given value with a given confidence. + + This prior is useful for creating priors for search spaces where the + values are centered around a given value with a given confidence level. + + You can use a `torch.distribution.Uniform` for any values which do + not have a center and confidence level, i.e. no prior information. + + You can create this class more easily using + [`Prior.make_centered()`][neps.priors.Prior.make_centered]. + """ + + domains: list[Domain] + """Domain of values.""" + + device: torch.device | None + """Device to place the tensors on.""" + + distributions: list[DistributionOverDomain] + """Distributions along with the corresponding domains they sample from.""" + + _distribution_domains: list[Domain] = field(init=False, repr=False) + + def __post_init__(self): + self._distribution_domains = [dist.domain for dist in self.distributions] + + @override + def log_prob(self, x: torch.Tensor) -> torch.Tensor: + # Cast all values from the value domains to the domain of the sampler. + sample_domain_tensor = Domain.cast_many( + x, frm=self.domains, to=self._distribution_domains + ) + + # Calculate the log probabilities of the sample domain tensors under their + # respective distributions. 
+ log_probs = torch.cat( + [ + dist.distribution.log_prob(sample_domain_tensor[:, i]) + for i, dist in enumerate(self.distributions) + ], + dim=1, + ) + return torch.sum(log_probs, dim=1) + + @override + def sample(self, n: int) -> torch.Tensor: + buffer = torch.empty( + n, + len(self.distributions), + device=self.device, + dtype=torch.float64, + ) + + size = torch.Size((n,)) + for i, (value_domain, frm) in enumerate(zip(self.domains, self.distributions)): + samples = frm.distribution.sample(size) + buffer[:, i] = value_domain.cast(samples, frm=frm.domain) + + return buffer + + +@dataclass +class UniformPrior(Prior): + """A prior that is uniform over a given domain. + + Uses a UnitUniform under the hood before converting to the value domain. + """ + + domains: list[Domain] + """Domain of values.""" + + device: torch.device | None + """Device to place the tensors on.""" + + _unit_uniform: Distribution = field(init=False, repr=False) + + def __post_init__(self): + self._unit_uniform = torch.distributions.Uniform(0.0, 1.0) + + def log_prob(self, x: torch.Tensor) -> torch.Tensor: + """Compute the log probability of values in `x` under a prior. + + All columns of `x` are assumed to be independent, such that the + log probability of the entire tensor is the sum of the log + probabilities of each column. + + Args: + x: Tensor of shape (n_samples, n_dims) + In the case of a 1D tensor, the shape is assumed to be (n_dims,) + + Returns: + Tensor of shape (n_samples,) with the log probabilities of each. In the + case that only single dimensional tensor is passed, the returns value + is a scalar. + """ + sample_domain_tensor = Domain.cast_many(x, frm=self.domains, to=UNIT_FLOAT_DOMAIN) + return torch.sum(self._unit_uniform.log_prob(sample_domain_tensor), dim=1) + + def sample(self, n: int) -> torch.Tensor: + """Sample from the prior. + + Args: + n: Number of samples to draw. + + Returns: + Tensor of shape (n, n_dims) with the samples. + """ + samples = torch.rand( + n, + len(self.domains), + device=self.device, + dtype=torch.float64, + ) + return Domain.cast_many(samples, frm=UNIT_FLOAT_DOMAIN, to=self.domains) From a013a13ab99efd0ab65945438e7c4d4f088a7794 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 18:54:59 +0200 Subject: [PATCH 14/63] refactor: Clean up the BO --- .../acquisition_functions/prior_weighted.py | 2 +- .../bayesian_optimization/optimizer.py | 138 ++++++++++++------ neps/optimizers/initial_design.py | 8 +- neps/search_spaces/encoding.py | 20 +-- 4 files changed, 113 insertions(+), 55 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py index 7b0d4318..8a735d58 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py @@ -26,7 +26,7 @@ def __init__( acq_fn: MCAcquisitionFunction, prior: Prior, beta: float, - n: int, + n: float, ): """Initialize the acquisition function. 
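To make the intended use of these priors concrete, here is a minimal sketch of building and querying a centered prior; the hyperparameter names, centers and confidence values are hypothetical:

    from neps.priors import Prior
    from neps.search_spaces.domain import Domain

    # A two-dimensional space: a log-scaled learning rate and a categorical with 3 choices.
    domains = {"lr": Domain.float(1e-5, 1e-1, log=True), "opt": Domain.indices(3)}

    # Centers are (value, confidence) pairs; the categorical center is a choice index.
    prior = Prior.make_centered(
        domains=domains,
        centers={"lr": (1e-3, 0.75), "opt": (0, 0.5)},
        categoricals={"opt"},
    )

    x = prior.sample(16)      # (16, 2) tensor in the value domains
    lp = prior.log_prob(x)    # (16,) log densities, treating columns as independent

In the Bayesian optimization changes below, such a prior is combined with the acquisition function through `PiboAcquisition`, which, following the PiBO scheme referenced there, decays the prior's influence as more evaluations are used.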
diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index a08578d6..5a1314db 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -18,7 +18,9 @@ optimize_acq, ) from neps.optimizers.initial_design import Sobol +from neps.priors import Prior from neps.search_spaces.encoding import DataEncoder +from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: from botorch.models.model import Model @@ -26,6 +28,7 @@ from neps.search_spaces import ( SearchSpace, ) + from neps.search_spaces.domain import Domain from neps.search_spaces.encoding import DataPack from neps.state import BudgetInfo, Trial @@ -58,6 +61,11 @@ def __init__( ValueError: if initial_design_size < 1 ValueError: if no kernel is provided """ + if any(pipeline_space.graphs): + raise ValueError( + "BayesianOptimization currently only supports flat search spaces" + ) + if initial_design_size is None: N = len(pipeline_space.hyperparameters) initial_design_size = int(max(1, math.log(N) ** 2)) @@ -68,23 +76,61 @@ def __init__( super().__init__(pipeline_space=pipeline_space) - self.use_priors = use_priors - - # TODO: This needs to be moved to the search space class, however to not break - # the current prior based APIs, we will create this manually here + self.encoder = DataEncoder.default_encoder( + pipeline_space, + include_fidelities=False, + ) + # We should only be acting on tensor'able hyperparameters for now + assert self.encoder.tensors is not None + + # TODO: This needs to be moved to the search space class, however + # to not break the current prior based APIs used elsewhere, we can + # just manually create this here. + # We use confidence here where `0` means no confidence and `1` means + # absolute confidence. 
This gets translated in to std's and weights + # accordingly in a `CenteredPrior` + self.prior: Prior | None = None if use_priors: - self._prior_confidences = {} + _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} + + domains: dict[str, Domain] = {} + centers: dict[str, tuple[Any, float]] = {} + categoricals: set[str] = set() + for name in self.encoder.tensors.names(): + hp = self.pipeline_space.hyperparameters[name] + domains[name] = hp.domain # type: ignore + + if isinstance(hp, CategoricalParameter): + categoricals.add(name) + + if hp.default is None: + continue + + confidence_score: float = hp.default_confidence_choice # type: ignore + if isinstance(hp, CategoricalParameter): + center = hp._default_index + else: + center = hp.default + + centers[name] = (center, confidence_score) + + # Uses truncnorms for numerical and weighted choices categoricals + self.prior = Prior.make_centered( + domains=domains, + centers=centers, + categoricals=categoricals, + ) + else: + self.prior = None self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - if surrogate_model == "gp": self._get_fitted_model = default_single_obj_gp else: self._get_fitted_model = surrogate_model - self.encoder_: DataEncoder | None = None self.initial_design_: list[dict[str, Any]] | None = None def ask( @@ -101,41 +147,38 @@ def ask( if t.report is not None and t.report.loss is not None ] x_configs = [t.config for t in completed] - y: torch.Tensor = torch.as_tensor( + y = torch.as_tensor( [t.report.loss for t in completed], dtype=torch.float64, + device=self.device, ) # type: ignore - # We only do single objective for now but may as well include this - # for when we have MO if y.ndim == 1: y = y.unsqueeze(1) pending = [t.config for t in trials.values() if t.state.pending()] - if self.encoder_ is None: - self.encoder_ = DataEncoder.default_encoder( - self.pipeline_space, - include_fidelities=False, - ) space = self.pipeline_space + config_id = str(len(trials) + 1) + assert self.encoder.tensors is not None + # Fill intitial design data if we don't have any... if self.initial_design_ is None: size = self.n_initial_design self.initial_design_ = [] + # Add the default configuration first (maybe) if self.sample_default_first: config = space.sample_default_configuration() self.initial_design_.append(config.hp_values()) - assert self.encoder_.tensors is not None - sobol = Sobol(seed=0, encoder=self.encoder_, allow_undersampling=True) + # Fill remaining with Sobol sequence samples + sobol = Sobol(seed=0, encoder=self.encoder, allow_undersampling=True) sobol_configs = sobol.sample(size - len(self.initial_design_)) self.initial_design_.extend(sobol_configs) - else: - self.initial_design_ = [] - config_id = str(len(trials) + 1) + # If we havn't passed the intial design phase, just return + # the next one. if len(trials) < len(self.initial_design_): config = self.initial_design_[len(trials)] return ( @@ -143,53 +186,66 @@ def ask( optimizer_state, ) - assert self.encoder_ is not None - x = self.encoder_.encode(x_configs, device=self.device) + # Now we actually do the BO loop, start by encoding the data + x = self.encoder.encode(x_configs, device=self.device) if any(pending): - x_pending = self.encoder_.encode(pending, device=self.device) + x_pending = self.encoder.encode(pending, device=self.device) x_pending = x_pending.tensor assert x_pending is not None else: x_pending = None + # Get our fitted model model = self._get_fitted_model(x, y) + # Build our acquisition function. 
This takes care of pending + # configs through x_pending. + # TODO: We should evaluate whether LogNoisyEI is better than LogEI acq = qLogExpectedImprovement( model, best_f=y.min(), X_pending=x_pending, + # Unfortunatly, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 objective=LinearMCObjective(weights=torch.tensor([-1.0])), ) - if self.use_priors: - # From the PIBO paper (Section 4.1) - # https://arxiv.org/pdf/2204.11051 + # If we have a prior, then we use it with PiBO + if self.prior: if budget_info.max_evaluations is not None: - beta = budget_info.max_evaluations / 10 + # From the PIBO paper (Section 4.1) + # https://arxiv.org/pdf/2204.11051 n = budget_info.used_evaluations + beta = budget_info.max_evaluations / 10 + elif budget_info.max_cost_budget is not None: # This might not work well if cost number is high # early on, but it will start to normalize. - beta = budget_info.max_cost_budget / 10 n = budget_info.used_cost_budget + beta = budget_info.max_cost_budget / 10 + + else: + # Otherwise, just some random heuristic based on the number + # of trials and dimensionality of the search space + # TODO: Think about and evaluate this more. + ndim = x.tensor.shape[1] # type: ignore + n = len(x_configs) + beta = ndim**2 / 10 - acq = PiboAcquisition(acq, n=n, beta=beta) + acq = PiboAcquisition(acq, prior=self.prior, n=n, beta=beta) + # Optimize it candidates, _eis = optimize_acq( - # TODO: We should evaluate whether LogNoisyEI is better than LogEI - acq_fn=qLogExpectedImprovement( - model, - best_f=y.min(), - X_pending=x_pending, - # Unfortunatly, there's no option to indicate that we minimize - # the AcqFunction so we need to do some kind of transformation. - # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 - objective=LinearMCObjective(weights=torch.tensor([-1.0])), - ), - encoder=self.encoder_, + acq_fn=acq, + encoder=self.encoder, acq_options={}, # options to underlying optim function of botorch ) - config = self.encoder_.decode_dicts(candidates)[0] + + # Take the first (and only?) 
candidate + assert len(candidates) == 1 + config = self.encoder.decode_dicts(candidates)[0] + return ( SampledConfig(id=config_id, config=config, previous_config_id=None), optimizer_state, diff --git a/neps/optimizers/initial_design.py b/neps/optimizers/initial_design.py index 05553ab1..e80dfe33 100644 --- a/neps/optimizers/initial_design.py +++ b/neps/optimizers/initial_design.py @@ -5,6 +5,8 @@ import torch +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain + if TYPE_CHECKING: from neps.search_spaces.encoding import DataEncoder @@ -61,7 +63,11 @@ def sample(self, n: int) -> list[dict[str, Any]]: SAMPLE_SIZE = self.buffer_sample_multiplier * n unit_x = sobol.draw(SAMPLE_SIZE, dtype=torch.float64) - x = self.encoder.tensors.from_unit_tensor(unit_x) + x = Domain.cast_many( + unit_x, + to=list(self.encoder.tensors.domains().values()), + frm=UNIT_FLOAT_DOMAIN, + ) # NOTE: We have to check uniqueness after conversion from unit cube space # as we could have multiple unit floats mapping to the same categories or integers diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index b4035297..21d7acd4 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -212,6 +212,14 @@ def __post_init__(self): self.n_numerical = n_numerical self.n_categorical = n_categorical + def domains(self) -> dict[str, Domain]: + return { + name: transformer.domain for name, transformer in self.transformers.items() + } + + def names(self) -> list[str]: + return list(self.transformers.keys()) + def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: if isinstance(hp, str): return x[:, self.column_lookup[hp]] @@ -252,18 +260,6 @@ def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: keys = list(values.keys()) return [dict(zip(keys, vals)) for vals in zip(*values.values())] - def from_unit_tensor( - self, - x: torch.Tensor, - device: torch.device | None = None, - ) -> torch.Tensor: - buffer = torch.empty_like(x, dtype=torch.float64, device=device) - - for i, transformer in enumerate(self.transformers.values()): - buffer[:, i] = transformer.domain.cast(x[:, i], frm=UNIT_FLOAT_DOMAIN) - - return buffer - @dataclass class DataEncoder: From d4a11a2192efad6ba3e66e5779a1e38c926aeeda Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 19:00:17 +0200 Subject: [PATCH 15/63] refactor: Delete unused files --- .../acquisition_functions/aq_functions.py | 88 ------ .../acquisition_sampler_2/__init__.py | 0 .../acquisition_sampler_2/aq_samplers.py | 22 -- .../acquisition_sampler_2/mutation_sampler.py | 163 ----------- .../acquisition_sampler_2/random_sampler.py | 15 -- .../bayesian_optimization/cost_cooling.py | 252 ------------------ 6 files changed, 540 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py delete mode 100644 neps/optimizers/bayesian_optimization/cost_cooling.py diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py b/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py deleted file mode 100644 index 70b6b4e6..00000000 --- 
a/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import annotations - -import math - -import torch - - -def ei( - mu: torch.Tensor, - cov: torch.Tensor, - optimum: float | torch.Tensor, - *, - augmented_ei_regularizer: float | None = None, # 0.01 - xi: float = 0.0, - log_ei: bool = False, - log_ei_epsilon: float = 1e-6, -) -> torch.Tensor: - improvement = optimum - mu - xi - - sigma_sq = torch.diag(cov) - sigma = torch.sqrt(sigma_sq) - - Z = improvement / sigma - - # If we calculate it ourselves, we save some computation as mu = 0 - # and sigma = 1 cancel a few terms out - # https://en.wikipedia.org/wiki/Normal_distribution - Z_cdf = 0.5 * (1 + torch.erf(Z / math.sqrt(2))) - Z_pdf = 1 / (math.sqrt(2 * math.pi)) * torch.exp(-0.5 * Z**2) - ei = improvement * Z_cdf + sigma * Z_pdf - - if augmented_ei_regularizer is not None: - regularization_term = 1 + sigma_sq / augmented_ei_regularizer - ei = ei / regularization_term - - if log_ei: - ei = torch.log(ei + log_ei_epsilon) - - return ei - - -def acq_by_confidence( - mu: torch.Tensor, - cov: torch.Tensor, - *, - confidence_scale: float = 1.0, -) -> torch.Tensor: - # Assumes we are trying to minimize our objective but - # this acquisition function will be maximized, i.e. optimize - # this function to find the point which is most likely to be - # the minimum of the objective. - - # **** - # * / \** - # ***** / \- **** - # * / \ *** - # * / \ | * *** - # ---/ \ | +** - # -/ \ | / \ - # \|/ --- - # - <- lcb = mu - c * sigma - # ______________________________ - lcb = mu - confidence_scale * torch.sqrt(torch.diag(cov)) - - return -lcb # Negate to make maximization - - -def weight_by_cost( - acquisition_scores: torch.Tensor, -) -> torch.Tensor: - # Assumes we are trying to minimize our objective but - # this acquisition function will be maximized, i.e. optimize - # this function to find the point which is most likely to be - # the minimum of the objective. - - # **** - # * / \** - # ***** / \- **** - # * / \ *** - # * / \ | * *** - # ---/ \ | +** - # -/ \ | / \ - # \|/ --- - # - <- lcb = mu - c * sigma - # ______________________________ - lcb = mu - cost_scale * torch.sqrt(torch.diag(cov)) - - return -lcb # Negate to make maximization diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py deleted file mode 100644 index f799252b..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import torch - - from neps.search_spaces import SearchSpace - - -def random_sample(search_space: SearchSpace, *, seed: torch.Generator) -> SearchSpace: - """Sample a random value from a search space. - - Args: - search_space: The search space to sample from. - user_priors: Whether to sample from user priors. - seed: The seed to use for sampling. - - Returns: - A search space with a sampled value. 
- """ - return search_space.sample_value(user_priors=user_priors) diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py deleted file mode 100644 index 972ad6c3..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Callable, Sequence - -import numpy as np -import torch -from more_itertools import first -from typing_extensions import override - -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.random_sampler import ( - RandomSampler, -) - -if TYPE_CHECKING: - from neps.search_spaces.search_space import SearchSpace - - -def _propose_location( - acquisition_function: Callable, - candidates: list[SearchSpace], - top_n: int = 5, - return_distinct: bool = True, -) -> tuple[list[SearchSpace], np.ndarray | torch.Tensor, np.ndarray]: - """top_n: return the top n candidates wrt the acquisition function.""" - if return_distinct: - eis = acquisition_function(candidates, asscalar=True) # faster - eis_, unique_idx = np.unique(eis, return_index=True) - try: - i = np.argpartition(eis_, -top_n)[-top_n:] - indices = np.array([unique_idx[j] for j in i]) - except ValueError: - eis = torch.tensor([acquisition_function(c) for c in candidates]) - _, indices = eis.topk(top_n) - else: - eis = torch.tensor([acquisition_function(c) for c in candidates]) - _, indices = eis.topk(top_n) - - xs = [candidates[int(i)] for i in indices] - return xs, eis, indices - - -class MutationSampler(AcquisitionSampler): - def __init__( - self, - pipeline_space, - pool_size: int = 250, - n_best: int = 10, - mutate_size: float | int = 0.5, - allow_isomorphism: bool = False, - check_isomorphism_history: bool = True, - patience: int = 50, - ): - super().__init__(pipeline_space=pipeline_space, patience=patience) - self.pool_size = pool_size - self.n_best = n_best - self.mutate_size = mutate_size - if isinstance(mutate_size, int): - assert ( - pool_size >= mutate_size - ), " pool_size must be larger or equal to mutate_size" - - self.allow_isomorphism = allow_isomorphism - self.check_isomorphism_history = ( - check_isomorphism_history # check for isomorphisms also in previous graphs - ) - self.random_sampling = RandomSampler( - pipeline_space=pipeline_space, patience=patience - ) - - @override - def set_state( - self, x: list[SearchSpace], y: Sequence[float] | np.ndarray | torch.Tensor - ) -> None: - super().set_state(x, y) - self.random_sampling.set_state(x, y) - - @override - def sample(self, acquisition_function: Callable) -> SearchSpace: - return first(self.sample_batch(acquisition_function, batch=1)) - - @override - def sample_batch( - self, - acquisition_function: Callable, - batch: int, - ) -> list[SearchSpace]: - pool = self.create_pool( - x=self.x, - y=self.y, - acquisition_function=acquisition_function, - pool_size=self.pool_size, - ) - - samples, _, _ = _propose_location( - acquisition_function=acquisition_function, - top_n=batch, - candidates=pool, - ) - return samples - - def create_pool( - self, - x: list[SearchSpace], - y: Sequence[float] | np.ndarray | torch.Tensor, - acquisition_function: Callable, - pool_size: int, - ) -> list[SearchSpace]: - if len(x) == 0: - return self.random_sampling.sample_batch(acquisition_function, pool_size) 
- - if isinstance(self.mutate_size, int): - mutate_size = self.mutate_size - else: - mutate_size = int(self.mutate_size * pool_size) - - n_best = len(x) if len(x) < self.n_best else self.n_best - best_configs = [x for (_, x) in sorted(zip(y, x), key=lambda pair: pair[0])][ - :n_best - ] - - seen: set[int] = set() - - def _hash(_config: SearchSpace) -> int: - return hash(_config.hp_values().values()) - - evaluation_pool = [] - per_arch = mutate_size // n_best - - for config in best_configs: - remaining_patience = self.patience - for _ in range(per_arch): - while remaining_patience: - try: - # needs to throw an Exception if config is not valid, e.g., empty graph etc.! - child = config.mutate() - except Exception: - remaining_patience -= 1 - continue - hash_child = _hash(child) - - if not self.allow_isomorphism: - # if disallow isomorphism, we enforce that each time, we mutate n distinct graphs. - # For now we do not check the isomorphism in all of the previous graphs though - if child == config or hash_child in seen: - remaining_patience -= 1 - continue - - evaluation_pool.append(child) - seen.add(hash_child) - break - - # Fill missing pool with random samples - nrandom_archs = max(pool_size - len(evaluation_pool), 0) - if nrandom_archs: - random_evaluation_pool = self.random_sampling.sample_batch( - acquisition_function, nrandom_archs - ) - evaluation_pool += random_evaluation_pool - - return evaluation_pool diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py deleted file mode 100644 index f7a4da76..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py +++ /dev/null @@ -1,15 +0,0 @@ -from __future__ import annotations - -import torch -from neps.search_spaces import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) - - -class RandomSampler(AcquisitionSampler): - - def sample(self, n: int, space: SearchSpace) -> torch.Tensor: - return self.pipeline_space.sample( - patience=self.patience, user_priors=False, ignore_fidelity=False - ) diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py deleted file mode 100644 index eb3ee28e..00000000 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ /dev/null @@ -1,252 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any -from typing_extensions import override - -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( - CostCooler, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, -) -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping -from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization -from neps.utils.common import instance_from_map - -if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) - from neps.search_spaces.search_space import SearchSpace - from neps.state.optimizer import BudgetInfo - from neps.utils.types import ConfigResult - - -class CostCooling(BayesianOptimization): - 
"""Implements a basic cost-cooling as described in - "Cost-aware Bayesian Optimization" (https://arxiv.org/abs/2003.10870) by Lee et al. - """ - - def __init__( - self, - pipeline_space: SearchSpace, - initial_design_size: int = 10, - surrogate_model: str | Any = "gp", - cost_model: str | Any = "gp", - surrogate_model_args: dict | None = None, - cost_model_args: dict | None = None, - optimal_assignment: bool = False, - domain_se_kernel: str | None = None, - graph_kernels: list | None = None, - hp_kernels: list | None = None, - acquisition: str | BaseAcquisition = "EI", - log_prior_weighted: bool = False, - acquisition_sampler: str | AcquisitionSampler = "mutation", - random_interleave_prob: float = 0.0, - patience: int = 100, - budget: None | int | float = None, - ignore_errors: bool = False, - loss_value_on_error: None | float = None, - cost_value_on_error: None | float = None, - logger=None, - ): - """Initialise the BO loop. - - Args: - pipeline_space: Space in which to search - initial_design_size: Number of 'x' samples that need to be evaluated before - selecting a sample using a strategy instead of randomly. - surrogate_model: Surrogate model - cost_model: Cost model - surrogate_model_args: Arguments that will be given to the surrogate model - (the Gaussian processes model). - cost_model_args: Arguments that will be given to the cost model - (the Gaussian processes model). - optimal_assignment: whether the optimal assignment kernel should be used. - domain_se_kernel: Stationary kernel name - graph_kernels: Kernels for NAS - hp_kernels: Kernels for HPO - acquisition: Acquisition strategy - log_prior_weighted: if to use log for prior - acquisition_sampler: Acquisition function fetching strategy - random_interleave_prob: Frequency at which random configurations are sampled - instead of configurations from the acquisition strategy. - patience: How many times we try something that fails before giving up. - budget: Maximum budget - ignore_errors: Ignore hyperparameter settings that threw an error and do not - raise an error. Error configs still count towards max_evaluations_total. - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error during bayesian optimization and will use given loss - value instead. default: None - cost_value_on_error: Setting this and loss_value_on_error to any float will - supress any error during bayesian optimization and will use given cost - value instead. 
default: None - logger: logger object, or None to use the neps logger - - Raises: - ValueError: if patience < 1 - ValueError: if initial_design_size < 1 - ValueError: if random_interleave_prob is not between 0.0 and 1.0 - ValueError: if no kernel is provided - """ - super().__init__( - pipeline_space=pipeline_space, - patience=patience, - logger=logger, - budget=budget, - ignore_errors=ignore_errors, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ) - - if initial_design_size < 1: - raise ValueError( - "BayesianOptimization needs initial_design_size to be at least 1" - ) - if not 0 <= random_interleave_prob <= 1: - raise ValueError("random_interleave_prob should be between 0.0 and 1.0") - - self._initial_design_size = initial_design_size - self._random_interleave_prob = random_interleave_prob - self._num_train_x: int = 0 - self._pending_evaluations: list = [] - self._model_update_failed: bool = False - - if ignore_errors: - self.logger.warning( - "ignore_errors was set, but this optimizer does not support it" - ) - - surrogate_model_args = surrogate_model_args or {} - cost_model_args = cost_model_args or {} - graph_kernels, hp_kernels = get_default_kernels( - self.pipeline_space, - domain_se_kernel, - graph_kernels, - hp_kernels, - optimal_assignment, - ) - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = graph_kernels - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - - if ( - not surrogate_model_args["graph_kernels"] - and not surrogate_model_args["hp_kernels"] - ): - raise ValueError("No kernels are provided!") - - if "vectorial_features" not in surrogate_model_args: - surrogate_model_args["vectorial_features"] = ( - self.pipeline_space.get_vectorial_dim() - ) - - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) - - if "graph_kernels" not in cost_model_args: - cost_model_args["graph_kernels"] = graph_kernels - if "hp_kernels" not in cost_model_args: - cost_model_args["hp_kernels"] = hp_kernels - - if not cost_model_args["graph_kernels"] and not cost_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - - if "vectorial_features" not in cost_model_args: - cost_model_args["vectorial_features"] = ( - self.pipeline_space.get_vectorial_dim() - ) - - self.cost_model = instance_from_map( - SurrogateModelMapping, - cost_model, - name="cost model", # does changing this string work? - kwargs=cost_model_args, - ) - - orig_acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - ) - - self.acquisition = CostCooler(orig_acquisition) - - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, - ) - - @override - def load_optimization_state( - self, - previous_results: dict[str, ConfigResult], - pending_evaluations: dict[str, SearchSpace], - budget_info: BudgetInfo | None, - optimizer_state: dict[str, Any], - ) -> None: - # TODO(Jan): read out cost and fit cost model - if budget_info is None: - raise ValueError( - "Used budget is not set in the optimizer state but is required" - " for cost cooling, please return a `'cost'` when you return results" - " and/or a `max_cost_budget` when running NePS!" 
- ) - self.used_budget = budget_info.used_cost_budget - - train_x = [el.config for el in previous_results.values()] - train_y = [self.get_loss(el.result) for el in previous_results.values()] - train_cost = [self.get_cost(el.result) for el in previous_results.values()] - self._num_train_x = len(train_x) - self._pending_evaluations = list(pending_evaluations.values()) - if self._num_train_x >= self._initial_design_size: - try: - if len(self._pending_evaluations) > 0: - # We want to use hallucinated results for the evaluations that have - # not finished yet. For this we fit a model on the finished - # evaluations and add these to the other results to fit another model. - self.surrogate_model.fit(train_x, train_y) - self.cost_model.fit(train_x, train_cost) - ys, _ = self.surrogate_model.predict(self._pending_evaluations) - zs, _ = self.cost_model.predict(self._pending_evaluations) - train_x += self._pending_evaluations - train_y += list(ys.detach().numpy()) - train_cost += list(zs.detach().numpy()) - - self.surrogate_model.fit(train_x, train_y) - self.cost_model.fit(train_x, train_cost) - # TODO: set acquisition state - self.acquisition.set_state( - self.surrogate_model, - alpha=1 - - (budget_info.used_cost_budget / budget_info.max_cost_budget), - cost_model=self.cost_model, - ) - self.acquisition_sampler.set_state(x=train_x, y=train_y) - - self._model_update_failed = False - except RuntimeError as runtime_error: - self.logger.exception( - "Model could not be updated due to below error. Sampling will not use" - " the model." - ) - if self.loss_value_on_error is None or self.cost_value_on_error is None: - raise ValueError( - "A RuntimeError happened and " - "loss_value_on_error or cost_value_on_error " - "value is not provided, please fix the error or " - "provide the values to continue without " - "updating the model" - ) from runtime_error - self._model_update_failed = True From 0338a99909491c96002223d88989c1d04beebbeb Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 19:12:47 +0200 Subject: [PATCH 16/63] fix: CenteredPrior prefers [0, 1] sample domain --- neps/priors.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/neps/priors.py b/neps/priors.py index 471718f0..116b31f9 100644 --- a/neps/priors.py +++ b/neps/priors.py @@ -118,7 +118,7 @@ def uniform( return UniformPrior(domains=domains, device=device) @classmethod - def make_centered( # noqa: C901 + def make_centered( cls, domains: Mapping[str, Domain], centers: Mapping[str, tuple[Any, float]], @@ -189,8 +189,8 @@ def make_centered( # noqa: C901 center_confidence = centers.get(name) if center_confidence is None: dist = DistributionOverDomain( - distribution=torch.distributions.Uniform(domain.lower, domain.upper), - domain=domain, + distribution=torch.distributions.Uniform(0.0, 1.0), + domain=UNIT_FLOAT_DOMAIN, ) continue @@ -227,16 +227,15 @@ def make_centered( # noqa: C901 continue # We place a truncnorm over a unitnorm - if domain.log_bounds is not None: - domain.to_unit(torch.tensor(center, device=device, dtype=torch.float64)) - torch.tensor(1 - confidence, device=device, dtype=torch.float64) - + unit_center = domain.to_unit( + torch.tensor(center, device=device, dtype=torch.float64) + ) dist = DistributionOverDomain( distribution=TruncatedNormal( - loc=center, + loc=unit_center, scale=(1 - confidence), - a=domain.lower, - b=domain.upper, + a=0.0, + b=1.0, device=device, ), domain=UNIT_FLOAT_DOMAIN, From 058ab2b5419f72aed93c6693a9b0b333808e4cc5 Mon Sep 17 00:00:00 2001 From: 
eddiebergman Date: Wed, 28 Aug 2024 13:41:42 +0200 Subject: [PATCH 17/63] refactor: Revamp Sampler and Prior --- .../bayesian_optimization/models/gp.py | 85 ++-- .../bayesian_optimization/optimizer.py | 149 ++++--- neps/optimizers/initial_design.py | 88 +---- neps/sampling/__init__.py | 0 neps/{ => sampling}/priors.py | 270 ++++++++----- neps/sampling/samplers.py | 120 ++++++ neps/search_spaces/domain.py | 171 +++++--- neps/search_spaces/encoding.py | 364 ++++-------------- neps/search_spaces/search_space.py | 9 +- 9 files changed, 652 insertions(+), 604 deletions(-) create mode 100644 neps/sampling/__init__.py rename neps/{ => sampling}/priors.py (58%) create mode 100644 neps/sampling/samplers.py diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index b302edcd..206078cb 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -2,6 +2,7 @@ import logging import math +from functools import reduce from typing import TYPE_CHECKING, Any, Mapping, TypeVar import gpytorch @@ -13,11 +14,12 @@ from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import MaternKernel, ScaleKernel +from torch._dynamo.utils import product from neps.search_spaces.encoding import ( CategoricalToIntegerTransformer, - DataEncoder, - DataPack, + TensorEncoder, + TensorPack, ) if TYPE_CHECKING: @@ -114,7 +116,7 @@ def default_mean() -> gpytorch.means.ConstantMean: def default_matern_kernel( - N: int, # noqa: N803 + N: int, active_dims: tuple[int, ...] | None = None, ) -> ScaleKernel: lengthscale_prior, lengthscale_constraint = default_lengthscale_prior(N) @@ -131,7 +133,7 @@ def default_matern_kernel( def default_categorical_kernel( - N: int, # noqa: N803 + N: int, active_dims: tuple[int, ...] | None = None, ) -> ScaleKernel: # Following BoTorches implementation of the MixedSingleTaskGP @@ -145,30 +147,20 @@ def default_categorical_kernel( def default_single_obj_gp( - x: DataPack, + x: TensorPack, y: torch.Tensor, ) -> SingleTaskGP: encoder = x.encoder - assert x.tensor is not None - assert encoder.tensors is not None - # Here, we will collect all graph encoded hyperparameters and assign each - # to its own individual WL kernel. 
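The "CenteredPrior prefers [0, 1] sample domain" fix in the previous patch maps each configured center through `Domain.to_unit` before placing the truncated normal over [0, 1]. A small sketch of that mapping for a hypothetical log-scaled float range, using the `Domain` helpers as they stand after this refactor:

    import torch
    from neps.search_spaces.domain import Domain

    # Hypothetical learning-rate style range with a prior default of 1e-2.
    domain = Domain.float(1e-4, 1e-1, log=True)
    center = torch.tensor(1e-2, dtype=torch.float64)

    # Normalised in log space:
    # (log(1e-2) - log(1e-4)) / (log(1e-1) - log(1e-4)) = 2/3
    unit_center = domain.to_unit(center)

    # The truncated normal is then built with loc=unit_center, scale=(1 - confidence)
    # and bounds a=0.0, b=1.0; samples are cast back to the value domain afterwards.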
- if encoder.graphs is not None: - raise NotImplementedError("Graphs are not yet supported.") - - numerics: list[str] = [] - categoricals: list[str] = [] - for hp_name, transformer in encoder.tensors.transformers.items(): + numerics: list[int] = [] + categoricals: list[int] = [] + for hp_name, transformer in encoder.transformers.items(): if isinstance(transformer, CategoricalToIntegerTransformer): - categoricals.append(hp_name) + categoricals.append(encoder.index_of[hp_name]) else: - numerics.append(hp_name) - - categorical_indices = encoder.indices(categoricals) - numeric_indices = encoder.indices(numerics) + numerics.append(encoder.index_of[hp_name]) # Purely vectorial - if len(categorical_indices) == 0: + if len(categoricals) == 0: return SingleTaskGP( train_X=x.tensor, train_Y=y, @@ -180,7 +172,7 @@ def default_single_obj_gp( ) # Purely categorical - if len(numeric_indices) == 0: + if len(numerics) == 0: return SingleTaskGP( train_X=x.tensor, train_Y=y, @@ -214,7 +206,7 @@ def cont_kernel_factory( return MixedSingleTaskGP( train_X=x.tensor, train_Y=y, - cat_dims=list(categorical_indices), + cat_dims=categoricals, likelihood=default_likelihood_with_prior(), cont_kernel_factory=cont_kernel_factory, outcome_transform=Standardize(m=1), @@ -223,25 +215,26 @@ def cont_kernel_factory( def optimize_acq( acq_fn: AcquisitionFunction, - encoder: DataEncoder, + encoder: TensorEncoder, *, n_candidates_required: int = 1, num_restarts: int = 20, n_intial_start_points: int = 512, acq_options: Mapping[str, Any] | None = None, + maximum_allowed_categorical_combinations: int = 30, ) -> tuple[torch.Tensor, torch.Tensor]: acq_options = acq_options or {} - if encoder.has_graphs(): - raise NotImplementedError("Graphs are not yet supported.") - assert encoder.tensors is not None - lower = [t.domain.lower for t in encoder.tensors.transformers.values()] - upper = [t.domain.upper for t in encoder.tensors.transformers.values()] + lower = [domain.lower for domain in encoder.domains.values()] + upper = [domain.upper for domain in encoder.domains.values()] bounds = torch.tensor([lower, upper], dtype=torch.float) - fixed_categoricals = encoder.categorical_product_indices() - - if not any(fixed_categoricals): + cat_transformers = { + name: t + for name, t in encoder.transformers.items() + if isinstance(t, CategoricalToIntegerTransformer) + } + if not any(cat_transformers): return optimize_acqf( acq_function=acq_fn, bounds=bounds, @@ -251,14 +244,36 @@ def optimize_acq( **acq_options, ) - if len(fixed_categoricals) > 30: + # We need to generate the product of all possible combinations of categoricals, + # first we do a sanity check + n_combos = reduce( + lambda x, y: x * y, [len(t.choices) for t in cat_transformers.values()] + ) + if n_combos > maximum_allowed_categorical_combinations: raise ValueError( "The number of fixed categorical dimensions is too high. " "This will lead to an explosion in the number of possible " - "combinations. Please reduce the number of fixed categorical " + f"combinations. Got: {n_combos} while the setting for the function" + f" is: {maximum_allowed_categorical_combinations=}. Consider reducing the " "dimensions or consider encoding your categoricals in some other format." 
) + # Right, now we generate all possible combinations + # First, just collect the possible values per cat column + # NOTE: Botorchs optim requires them to be as floats + cats: dict[int, list[float]] = { + encoder.index_of[name]: [float(i) for i in range(len(transformer.choices))] + for name, transformer in cat_transformers.items() + } + + # Second, generate all possible combinations + fixed_cats: list[dict[int, float]] + if len(cats) == 1: + col, choice_indices = next(iter(cats.items())) + fixed_cats = [{col: i} for i in choice_indices] + else: + fixed_cats = [dict(zip(cats.keys(), combo)) for combo in product(*cats.values())] + # TODO: we should deterministicall shuffle the fixed_categoricals as the # underlying function does not. return optimize_acqf_mixed( @@ -267,6 +282,6 @@ def optimize_acq( num_restarts=num_restarts, raw_samples=n_intial_start_points, q=n_candidates_required, - fixed_features_list=fixed_categoricals, # type: ignore + fixed_features_list=fixed_cats, **acq_options, ) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 5a1314db..83a65ce2 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -17,9 +17,10 @@ default_single_obj_gp, optimize_acq, ) -from neps.optimizers.initial_design import Sobol +from neps.optimizers.initial_design import PriorInitialDesign, Sobol from neps.priors import Prior -from neps.search_spaces.encoding import DataEncoder +from neps.search_spaces.domain import Domain +from neps.search_spaces.encoding import TensorEncoder, TensorPack from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: @@ -28,11 +29,34 @@ from neps.search_spaces import ( SearchSpace, ) - from neps.search_spaces.domain import Domain - from neps.search_spaces.encoding import DataPack + from neps.search_spaces.hyperparameters.float import FloatParameter + from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.state import BudgetInfo, Trial +def pibo_acq_beta_and_n( + n_sampled_already: int, ndims: int, budget_info: BudgetInfo +) -> tuple[float, float]: + if budget_info.max_evaluations is not None: + # From the PIBO paper (Section 4.1) + # https://arxiv.org/pdf/2204.11051 + beta = budget_info.max_evaluations / 10 + return n_sampled_already, beta + + if budget_info.max_cost_budget is not None: + # This might not work well if cost number is high + # early on, but it will start to normalize. + n = budget_info.used_cost_budget + beta = budget_info.max_cost_budget / 10 + return n, beta + + # Otherwise, just some random heuristic based on the number + # of trials and dimensionality of the search space + # TODO: Think about and evaluate this more. + beta = ndims**2 / 10 + return n_sampled_already, beta + + class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" @@ -41,10 +65,14 @@ def __init__( pipeline_space: SearchSpace, *, initial_design_size: int | None = None, - surrogate_model: Literal["gp"] | Callable[[DataPack, torch.Tensor], Model] = "gp", + surrogate_model: ( + Literal["gp"] | Callable[[TensorPack, torch.Tensor], Model] + ) = "gp", use_priors: bool = False, sample_default_first: bool = False, device: torch.device | None = None, + encoder: TensorEncoder | None = None, + treat_fidelity_as_hyperparameters: bool = False, **kwargs: Any, # TODO: Remove ): """Initialise the BO loop. 
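A minimal sketch of the fixed-feature enumeration added to `optimize_acq` above, with two hypothetical categorical columns (the column indices and choice counts are made up); plain `itertools.product` stands in here for the `product` helper imported in the patch:

    from itertools import product

    # Hypothetical encoded categoricals: tensor column index -> choice indices as
    # floats, since botorch's optimize_acqf_mixed expects float-valued fixed features.
    cats: dict[int, list[float]] = {
        2: [0.0, 1.0],        # a 2-choice categorical sitting in column 2
        3: [0.0, 1.0, 2.0],   # a 3-choice categorical sitting in column 3
    }

    # One fixed-feature dict per combination, exactly what fixed_features_list expects.
    fixed_cats = [dict(zip(cats.keys(), combo)) for combo in product(*cats.values())]

    assert len(fixed_cats) == 2 * 3
    # fixed_cats[0] == {2: 0.0, 3: 0.0}, ..., fixed_cats[-1] == {2: 1.0, 3: 2.0}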
@@ -54,8 +82,16 @@ def __init__( initial_design_size: Number of samples used before using the surrogate model. If None, it will take `int(log(N) ** 2)` samples where `N` is the number of parameters in the search space. - surrogate_model: Surrogate model + surrogate_model: Surrogate model, either a known model str or a callable + that takes in the training data and returns a model fitted to (X, y). use_priors: Whether to use priors set on the hyperparameters during search. + sample_default_first: Whether to sample the default configuration first. + device: Device to use for the optimization. + encoder: Encoder to use for encoding the configurations. If None, it will + will use the default encoder. + treat_fidelity_as_hyperparameters: Whether to treat fidelities as + hyperparameters. If left as False, fidelities will be ignored + and configurations will always be sampled at the maximum fidelity. Raises: ValueError: if initial_design_size < 1 @@ -76,12 +112,20 @@ def __init__( super().__init__(pipeline_space=pipeline_space) - self.encoder = DataEncoder.default_encoder( - pipeline_space, - include_fidelities=False, - ) - # We should only be acting on tensor'able hyperparameters for now - assert self.encoder.tensors is not None + if encoder is None: + parameters: dict[ + str, + CategoricalParameter | FloatParameter | IntegerParameter, + ] = { + **pipeline_space.numerical, + **pipeline_space.categoricals, + } + if treat_fidelity_as_hyperparameters: + parameters.update(pipeline_space.fidelities) + + self.encoder = TensorEncoder.default(parameters) + else: + self.encoder = encoder # TODO: This needs to be moved to the search space class, however # to not break the current prior based APIs used elsewhere, we can @@ -96,8 +140,7 @@ def __init__( domains: dict[str, Domain] = {} centers: dict[str, tuple[Any, float]] = {} categoricals: set[str] = set() - for name in self.encoder.tensors.names(): - hp = self.pipeline_space.hyperparameters[name] + for name, hp in parameters.items(): domains[name] = hp.domain # type: ignore if isinstance(hp, CategoricalParameter): @@ -106,11 +149,13 @@ def __init__( if hp.default is None: continue - confidence_score: float = hp.default_confidence_choice # type: ignore - if isinstance(hp, CategoricalParameter): - center = hp._default_index - else: - center = hp.default + confidence_str = hp.default_confidence_choice + confidence_score = _mapping[confidence_str] + center = ( + hp._default_index + if isinstance(hp, CategoricalParameter) + else hp.default + ) centers[name] = (center, confidence_score) @@ -160,11 +205,9 @@ def ask( space = self.pipeline_space config_id = str(len(trials) + 1) - assert self.encoder.tensors is not None # Fill intitial design data if we don't have any... if self.initial_design_ is None: - size = self.n_initial_design self.initial_design_ = [] # Add the default configuration first (maybe) @@ -172,10 +215,24 @@ def ask( config = space.sample_default_configuration() self.initial_design_.append(config.hp_values()) - # Fill remaining with Sobol sequence samples - sobol = Sobol(seed=0, encoder=self.encoder, allow_undersampling=True) - sobol_configs = sobol.sample(size - len(self.initial_design_)) - self.initial_design_.extend(sobol_configs) + if self.prior: + sampler = PriorInitialDesign(prior=self.prior, seed=0) + else: + sampler = Sobol(ndim=self.encoder.ncols, seed=0, scramble=True) + + n_samples = self.n_initial_design - len(self.initial_design_) + + # We add a buffer of 2x the samples to help ensure + # we get get enough after removing duplicates. 
+ x = sampler.sample(n_samples * 2) + x = Domain.translate( + x, + to=self.encoder.domains.values(), + frm=sampler.sample_domain, + ) + uniq_x = torch.unique(x, dim=0) + configs = self.encoder.decode_dicts(uniq_x[:n_samples]) + self.initial_design_.extend(configs) # If we havn't passed the intial design phase, just return # the next one. @@ -187,13 +244,10 @@ def ask( ) # Now we actually do the BO loop, start by encoding the data - x = self.encoder.encode(x_configs, device=self.device) + x = self.encoder.pack(x_configs, device=self.device) + x_pending = None if any(pending): - x_pending = self.encoder.encode(pending, device=self.device) - x_pending = x_pending.tensor - assert x_pending is not None - else: - x_pending = None + x_pending = self.encoder.pack(pending, device=self.device) # Get our fitted model model = self._get_fitted_model(x, y) @@ -204,7 +258,7 @@ def ask( acq = qLogExpectedImprovement( model, best_f=y.min(), - X_pending=x_pending, + X_pending=None if x_pending is None else x_pending.tensor, # Unfortunatly, there's no option to indicate that we minimize # the AcqFunction so we need to do some kind of transformation. # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 @@ -213,34 +267,15 @@ def ask( # If we have a prior, then we use it with PiBO if self.prior: - if budget_info.max_evaluations is not None: - # From the PIBO paper (Section 4.1) - # https://arxiv.org/pdf/2204.11051 - n = budget_info.used_evaluations - beta = budget_info.max_evaluations / 10 - - elif budget_info.max_cost_budget is not None: - # This might not work well if cost number is high - # early on, but it will start to normalize. - n = budget_info.used_cost_budget - beta = budget_info.max_cost_budget / 10 - - else: - # Otherwise, just some random heuristic based on the number - # of trials and dimensionality of the search space - # TODO: Think about and evaluate this more. - ndim = x.tensor.shape[1] # type: ignore - n = len(x_configs) - beta = ndim**2 / 10 - + n, beta = pibo_acq_beta_and_n( + n_sampled_already=len(trials), + ndims=self.encoder.ncols, + budget_info=budget_info, + ) acq = PiboAcquisition(acq, prior=self.prior, n=n, beta=beta) # Optimize it - candidates, _eis = optimize_acq( - acq_fn=acq, - encoder=self.encoder, - acq_options={}, # options to underlying optim function of botorch - ) + candidates, _eis = optimize_acq(acq_fn=acq, encoder=self.encoder, acq_options={}) # Take the first (and only?) candidate assert len(candidates) == 1 diff --git a/neps/optimizers/initial_design.py b/neps/optimizers/initial_design.py index e80dfe33..c8039c6a 100644 --- a/neps/optimizers/initial_design.py +++ b/neps/optimizers/initial_design.py @@ -1,80 +1,34 @@ +"""Initial design of points for optimization.""" + from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Protocol - -import torch - -from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain +from typing import TYPE_CHECKING +from typing_extensions import override if TYPE_CHECKING: - from neps.search_spaces.encoding import DataEncoder - + import torch -class InitialDesign(Protocol): - def sample(self, n: int) -> list[dict[str, Any]]: ... 
+ from neps.priors import Prior + from neps.search_spaces.domain import Domain @dataclass -class Sobol(InitialDesign): - seed: int - """The seed for the Sobol sequence.""" - - encoder: DataEncoder - """The encoding used to encode the samples.""" - - scramble: bool = True - """Whether to scramble the Sobol sequence.""" - - buffer_sample_multiplier: int = 2 - """How many samples to generate in the buffer before checking for uniqueness.""" - - allow_undersampling: bool = False - """If True, will allow undersampling if we can't generate `n` unique samples.""" - - def sample(self, n: int) -> list[dict[str, Any]]: - """Sample `n` points from the Sobol sequence. - - !!! warning - - If `self.allow_undersampling` is False, this method will raise a ValueError if - it cannot generate `n` unique samples. - - Args: - n: The number of points to sample. - - Returns: - A list of `n` points sampled from the Sobol sequence. - """ - assert self.encoder.tensors is not None - - if self.encoder.has_graphs(): - # TODO: Won't work on graphs - raise NotImplementedError("Graphs are not yet supported.") - - if self.encoder.n_numerical == 0 and self.encoder.n_categorical > 0: - # TODO: We need to do something else if we have only categoricals - # as we are going to get a lot of duplicates - raise NotImplementedError("Only categorical variables are not yet supported.") - - ndim = self.encoder.n_numerical + self.encoder.n_categorical - sobol = torch.quasirandom.SobolEngine(dimension=ndim, scramble=True, seed=5) +class PriorInitialDesign(InitialDesign): + """Sample from a prior distribution.""" - SAMPLE_SIZE = self.buffer_sample_multiplier * n - unit_x = sobol.draw(SAMPLE_SIZE, dtype=torch.float64) + prior: Prior + """The prior to sample from.""" - x = Domain.cast_many( - unit_x, - to=list(self.encoder.tensors.domains().values()), - frm=UNIT_FLOAT_DOMAIN, - ) + # TODO: Right now we don't have a way to set the seed temporarily + seed: int | None = None + """The seed for sampling.""" - # NOTE: We have to check uniqueness after conversion from unit cube space - # as we could have multiple unit floats mapping to the same categories or integers - unique_x = torch.unique(x, dim=0) - if len(unique_x) < n and not self.allow_undersampling: - raise ValueError( - f"Could not generate {n} unique samples, got {len(unique_x)}\n{self=}" - ) + @override + def sample(self, n: int) -> torch.Tensor: + return self.prior.sample(n) - return self.encoder.decode_dicts(unique_x[:n]) + @property + @override + def sample_domain(self) -> list[Domain]: + return self.prior.domains diff --git a/neps/sampling/__init__.py b/neps/sampling/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neps/priors.py b/neps/sampling/priors.py similarity index 58% rename from neps/priors.py rename to neps/sampling/priors.py index 116b31f9..8ccc76e9 100644 --- a/neps/priors.py +++ b/neps/sampling/priors.py @@ -4,31 +4,31 @@ variables, i.e. each column of a tensor is assumed to be independent and can be acted on independently. -They are not a `torch.distributions.Distribution` subclass as methods like -`entropy` and `kl_divergence` are just more difficult to implement -(not impossible, just more difficult and not needed right now). - See the class doc description of [`Prior`][neps.priors.Prior] for more details. 
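A short usage sketch of the shape contract described above, assuming the module layout introduced in this series (`neps.sampling.priors`, `neps.search_spaces.domain`) and the `UniformPrior` defined further down in this patch:

    import torch
    from neps.sampling.priors import UniformPrior
    from neps.search_spaces.domain import Domain

    # Two independent columns: a log-scaled float and a plain float range.
    domains = [Domain.float(1e-4, 1e-1, log=True), Domain.float(0.0, 10.0)]

    prior = UniformPrior(ncols=2)
    x = prior.sample(5, to=domains)      # shape (5, 2), values already in the target domains
    lp = prior.log_prob(x, frm=domains)  # shape (5,), one log-probability per sample row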
""" from __future__ import annotations from dataclasses import dataclass, field +from functools import reduce from typing import TYPE_CHECKING, Any, Container, Mapping, Protocol from typing_extensions import override import torch from neps.distributions import DistributionOverDomain, TruncatedNormal +from neps.sampling.samplers import Sampler from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain if TYPE_CHECKING: from torch.distributions import Distribution -class Prior(Protocol): +class Prior(Sampler, Protocol): """A protocol for priors over search spaces. + Extends from the [`Sampler`][neps.samplers.Sampler] protocol. + At it's core, the two methods that need to be implemented are `log_prob` and `sample`. The `log_prob` method should return the log probability of a given tensor of samples under its distribution. @@ -55,67 +55,51 @@ class Prior(Protocol): actually be `1` (1 / 1) for any value inside the domain. """ - domains: list[Domain] - """Domain of values which this prior acts upon. - - Each domain corresponds to the corresponding `ndim` in a tensor - (n_samples, ndim). - """ - - device: torch.device | None - """Device to place the tensors on.""" - - def log_prob(self, x: torch.Tensor) -> torch.Tensor: + def log_prob( + self, + x: torch.Tensor, + *, + frm: list[Domain] | Domain, + ) -> torch.Tensor: """Compute the log probability of values in `x` under a prior. - All columns of `x` are assumed to be independent, such that the + The last dimenion of `x` is assumed to be independent, such that the log probability of the entire tensor is the sum of the log - probabilities of each column. + probabilities of each element in that dimension. + + For example, if `x` is of shape `(n_samples, n_dims)`, then the + you will be given back a tensor of shape `(n_samples,)` with the + each entry being the log probability of the corresponding sample. Args: - x: Tensor of shape (n_samples, n_dims) + x: Tensor of shape (..., n_dims) In the case of a 1D tensor, the shape is assumed to be (n_dims,) + frm: The domain of the values in `x`. If a single domain, then all the + values are assumed to be from that domain, otherwise each column + `n_dims` in (n_samples, n_dims) is from the corresponding domain. Returns: - Tensor of shape (n_samples,) with the log probabilities of each. In the + Tensor of shape (...,), with the last dimension reduced out. In the case that only single dimensional tensor is passed, the returns value is a scalar. """ ... - def sample(self, n: int) -> torch.Tensor: - """Sample from the prior. - - Args: - n: Number of samples to draw. - - Returns: - Tensor of shape (n, n_dims) with the samples. - """ - ... - - def prob(self, x: torch.Tensor) -> torch.Tensor: + def prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: """Compute the probability of values in `x` under a prior. See [`log_prob()`][neps.priors.Prior.log_prob] for details on shapes. """ - return torch.exp(self.log_prob(x)) + return torch.exp(self.log_prob(x, frm=frm)) @classmethod - def uniform( - cls, - domains: Mapping[str, Domain] | list[Domain], - *, - device: torch.device | None = None, - ) -> UniformPrior: + def uniform(cls, ncols: int) -> UniformPrior: """Create a uniform prior for a given list of domains. Args: - domains: domains over which to have a uniform prior. - device: Device to place the tensors on. + ncols: The number of columns in the tensor to sample. 
""" - domains = domains if isinstance(domains, list) else list(domains.values()) - return UniformPrior(domains=domains, device=device) + return UniformPrior(ncols=ncols) @classmethod def make_centered( @@ -242,9 +226,7 @@ def make_centered( ) distributions.append(dist) - return CenteredPrior( - domains=list(domains.values()), distributions=distributions, device=device - ) + return CenteredPrior(distributions=distributions) @dataclass @@ -261,12 +243,6 @@ class CenteredPrior(Prior): [`Prior.make_centered()`][neps.priors.Prior.make_centered]. """ - domains: list[Domain] - """Domain of values.""" - - device: torch.device | None - """Device to place the tensors on.""" - distributions: list[DistributionOverDomain] """Distributions along with the corresponding domains they sample from.""" @@ -275,11 +251,24 @@ class CenteredPrior(Prior): def __post_init__(self): self._distribution_domains = [dist.domain for dist in self.distributions] + @property + @override + def ncols(self) -> int: + return len(self.distributions) + @override - def log_prob(self, x: torch.Tensor) -> torch.Tensor: + def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> torch.Tensor: + if x.ndim == 0: + raise ValueError("Expected a tensor of shape (..., ncols).") + + if x.ndim == 1: + x = x.unsqueeze(0) + # Cast all values from the value domains to the domain of the sampler. - sample_domain_tensor = Domain.cast_many( - x, frm=self.domains, to=self._distribution_domains + sample_domain_tensor = Domain.translate( + x, + frm=frm, + to=self._distribution_domains, ) # Calculate the log probabilities of the sample domain tensors under their @@ -289,25 +278,34 @@ def log_prob(self, x: torch.Tensor) -> torch.Tensor: dist.distribution.log_prob(sample_domain_tensor[:, i]) for i, dist in enumerate(self.distributions) ], - dim=1, + dim=-1, ) - return torch.sum(log_probs, dim=1) + return torch.sum(log_probs, dim=-1) @override - def sample(self, n: int) -> torch.Tensor: - buffer = torch.empty( - n, - len(self.distributions), - device=self.device, - dtype=torch.float64, + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Seeding is not yet implemented.") + + _out_shape = ( + torch.Size((n, self.ncols)) + if isinstance(n, int) + else torch.Size((*n, self.ncols)) ) + _n = torch.Size((n,)) if isinstance(n, int) else n - size = torch.Size((n,)) - for i, (value_domain, frm) in enumerate(zip(self.domains, self.distributions)): - samples = frm.distribution.sample(size) - buffer[:, i] = value_domain.cast(samples, frm=frm.domain) + out = torch.empty(_out_shape, device=device, dtype=torch.float64) + for i, dist in enumerate(self.distributions): + out[..., i] = dist.distribution.sample(_n) - return buffer + return Domain.translate(out, frm=self._distribution_domains, to=to) @dataclass @@ -317,49 +315,119 @@ class UniformPrior(Prior): Uses a UnitUniform under the hood before converting to the value domain. """ - domains: list[Domain] - """Domain of values.""" - - device: torch.device | None - """Device to place the tensors on.""" + ncols: int + """The number of columns in the tensor to sample from.""" _unit_uniform: Distribution = field(init=False, repr=False) def __post_init__(self): self._unit_uniform = torch.distributions.Uniform(0.0, 1.0) - def log_prob(self, x: torch.Tensor) -> torch.Tensor: - """Compute the log probability of values in `x` under a prior. 
+ @override + def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: + sample_domain_tensor = Domain.translate(x, frm=frm, to=UNIT_FLOAT_DOMAIN) + return torch.sum(self._unit_uniform.log_prob(sample_domain_tensor), dim=-1) - All columns of `x` are assumed to be independent, such that the - log probability of the entire tensor is the sum of the log - probabilities of each column. + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Seeding is not yet implemented.") + + _n = ( + torch.Size((n, self.ncols)) + if isinstance(n, int) + else torch.Size((*n, self.ncols)) + ) + samples = torch.rand(_n, device=device, dtype=torch.float64) + return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to) - Args: - x: Tensor of shape (n_samples, n_dims) - In the case of a 1D tensor, the shape is assumed to be (n_dims,) - Returns: - Tensor of shape (n_samples,) with the log probabilities of each. In the - case that only single dimensional tensor is passed, the returns value - is a scalar. - """ - sample_domain_tensor = Domain.cast_many(x, frm=self.domains, to=UNIT_FLOAT_DOMAIN) - return torch.sum(self._unit_uniform.log_prob(sample_domain_tensor), dim=1) +@dataclass +class WeightedPrior(Prior): + """A prior consisting of multiple priors with weights.""" - def sample(self, n: int) -> torch.Tensor: - """Sample from the prior. + priors: list[Prior] + weights: torch.Tensor + probabilities: torch.Tensor = field(init=False, repr=False) - Args: - n: Number of samples to draw. + def __post_init__(self): + if len(self.priors) < 2: + raise ValueError(f"At least two priors must be given. Got {len(self.priors)}") - Returns: - Tensor of shape (n, n_dims) with the samples. 
- """ - samples = torch.rand( - n, - len(self.domains), - device=self.device, - dtype=torch.float64, + if self.weights.ndim != 1: + raise ValueError("Weights must be a 1D tensor.") + + if len(self.priors) != len(self.weights): + raise ValueError("The number of priors and weights must be the same.") + + self.probabilities = self.weights / self.weights.sum() + + @override + def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: + # OPTIM: Avoid an initial allocation by using the output of the first + # distribution to store the weighted probabilities + itr = zip(self.probabilities, self.priors) + first_prob, first_prior = next(itr) + + weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) + for prob, prior in itr: + weighted_probs += prob * prior.log_prob(x, frm=frm) + + return weighted_probs + + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Seeding is not yet implemented.") + + # Calculate the total number of samples required + if isinstance(n, int): + total_samples = n + output_shape = (n, self.ncols) + else: + total_samples = reduce(lambda x, y: x * y, n) + output_shape = (*n, self.ncols) + + # Randomly select which prior to sample from for each of the total_samples + chosen_priors = torch.empty((total_samples,), device=device, dtype=torch.int64) + chosen_priors = torch.multinomial( + self.probabilities, + total_samples, + replacement=True, + out=chosen_priors, + ) + + # Create an empty tensor to hold all samples + output_samples = torch.empty( + (total_samples, self.ncols), device=device, dtype=torch.float64 ) - return Domain.cast_many(samples, frm=UNIT_FLOAT_DOMAIN, to=self.domains) + + # Loop through each prior and its associated indices + for i, prior in enumerate(self.priors): + # Find indices where the chosen prior is i + _i = torch.tensor(i, dtype=torch.int64, device=device) + indices = torch.where(chosen_priors == _i)[0] + + if len(indices) > 0: + # Sample from the prior for the required number of indices + samples_from_prior = prior.sample(len(indices), to=to, device=device) + output_samples[indices] = samples_from_prior + + # Reshape to the output shape including ncols dimension + output_samples = output_samples.view(output_shape) + + return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py new file mode 100644 index 00000000..f0298d84 --- /dev/null +++ b/neps/sampling/samplers.py @@ -0,0 +1,120 @@ +"""Samplers for generating points in a search space. + +These are similar to [`Prior`][neps.priors.Prior] objects, but they +do not necessarily have an easily definable pdf. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from functools import reduce +from typing import Protocol +from typing_extensions import override + +import torch + +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain + + +class Sampler(Protocol): + """A protocol for sampling tensors and vonerting them to a given domain.""" + + @property + def ncols(self) -> int: + """The number of columns in the samples produced by this sampler.""" + ... + + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + """Sample `n` points and convert them to the given domain. 
+ + Args: + n: The number of points to sample. If a torch.Size, an additional dimension + will be added with [`.ncols`][neps.samplers.Sampler.ncols]. + For example, if `n = 5`, the output will be `(5, ncols)`. If + `n = (5, 3)`, the output will be `(5, 3, ncols)`. + to: The domain or list of domains to cast the points to. + If a single domain, all points are cast to that domain, otherwise + each column `ndim_i` in (n, ndim) is cast to the corresponding domain + in `to`. As a result, the length of `to` must match the number of columns + from [`.ncols`][neps.samplers.Sampler.ncols]. + seed: The seed for the random number generator. + device: The device to cast the samples to. + + Returns: + A tensor of (n, ndim) points sampled cast to the given domain. + """ + ... + + @classmethod + def sobol(cls, ndim: int, *, scramble: bool = True, seed: int | None = None) -> Sobol: + """Create a Sobol sampler. + + Args: + ndim: The number of dimensions to sample for. + scramble: Whether to scramble the Sobol sequence. + seed: The seed for the Sobol sequence. + + Returns: + A Sobol sampler. + """ + return Sobol(ndim=ndim, scramble=scramble, seed=seed) + + +# Technically this could be a prior with a uniform distribution +@dataclass +class Sobol(Sampler): + """Sample from a Sobol sequence.""" + + ndim: int + """The number of dimensions to sample for.""" + + seed: int | None = None + """The seed for the Sobol sequence.""" + + scramble: bool = True + """Whether to scramble the Sobol sequence.""" + + @property + @override + def ncols(self) -> int: + return self.ndim + + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Setting the seed is not supported yet") + + # Sobol can only produce 2d tensors. To handle batches or arbitrary + # dimensions, we get a count of the total number of samples needed + # and reshape the output tensor to the desired shape, if needed. + _n = n if isinstance(n, int) else reduce(lambda x, y: x * y, n) + + sobol = torch.quasirandom.SobolEngine( + dimension=self.ndim, + scramble=self.scramble, + seed=self.seed, + ) + + out = torch.empty(_n, self.ncols, dtype=torch.float64, device=device) + x = sobol.draw(_n, dtype=torch.float64, out=out) + + # If we got extra dimensions, such as batch dimensions, we need to + # reshape the tensor to the desired shape. + if isinstance(n, torch.Size): + x = x.view(*n, self.ncols) + + return Domain.translate(x, frm=UNIT_FLOAT_DOMAIN, to=to) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 2081bf33..5d92fac3 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -8,40 +8,44 @@ * The midpoint of the domain. * Whether the domain is split into bins. -With that, the primary method of a domain is to be able to cast +With that, the primary method of a domain is to be able to +[`cast()`][neps.search_spaces.domain.Domain.cast] a tensor of values from one to domain to another, e.g. `values_a = domain_a.cast(values_b, frm=domain_b)`. This can be used to convert float samples to integers, integers to log space, etc. -The core method to do so is to be able to cast `to_unit` which takes +The core method to do so is to be able to cast +[`to_unit()`][neps.search_spaces.domain.Domain.to_unit] which takes values to a unit interval [0, 1], and then to be able to cast values in [0, 1] -to the new domain with `from_unit`. 
+to the new domain with [`from_unit()`][neps.search_spaces.domain.Domain.from_unit]. There are some shortcuts implemented in `cast`, such as skipping going through the unit interval if the domains are the same, as no transformation is needed. The primary methods for creating a domain are -* `Domain.float(l, u, ...)` - Used for modelling float ranges -* `Domain.int(l, u, ...)` - Used for modelling integer ranges -* `Domain.indices(n)` - Primarly used to model categorical choices +* [`Domain.float(l, u, ...)`][neps.search_spaces.domain.Domain.float] - + Used for modelling float ranges +* [`Domain.int(l, u, ...)`][neps.search_spaces.domain.Domain.int] - + Used for modelling integer ranges +* [`Domain.indices(n)`][neps.search_spaces.domain.Domain.indices] - + Primarly used to model categorical choices If you have a tensor of values, where each column corresponds to a different domain, -you can take a look at `Domain.cast_many` to cast all the values in one go. +you can take a look at [`Domain.translate()`][neps.search_spaces.domain.Domain.translate] -If you need a unit-interval domain, please use the `Domain.unit_float()` or -`UNIT_FLOAT_DOMAIN` constant. +If you need a unit-interval domain, please use the +[`Domain.unit_float()`][neps.search_spaces.domain.Domain.unit_float] +or `UNIT_FLOAT_DOMAIN` constant. """ -# TODO: Could theoretically implement dtype,device,out for all methods here but -# would need to be careful not to accidentally send to and from GPU. from __future__ import annotations import math from dataclasses import dataclass, field -from typing import Generic, Sequence, TypeVar +from typing import Generic, Iterable, TypeVar import torch from torch import Tensor @@ -53,11 +57,36 @@ @dataclass(frozen=True) class Domain(Generic[V]): + """A domain for a value. + + The primary methods for creating a domain are + + * [`Domain.float(l, u, ...)`][neps.search_spaces.domain.Domain.float] - + Used for modelling float ranges + * [`Domain.int(l, u, ...)`][neps.search_spaces.domain.Domain.int] - + Used for modelling integer ranges + * [`Domain.indices(n)`][neps.search_spaces.domain.Domain.indices] - + Primarly used to model categorical choices + """ + lower: V + """The lower bound of the domain.""" + upper: V + """The upper bound of the domain.""" + round: bool + """Whether to round the values to the nearest integer.""" + log_bounds: tuple[float, float] | None = None + """The log bounds of the domain, if the domain is in log space.""" + bins: int | None = None + """The number of discrete bins to split the domain into. + + Includes both endpoints of the domain and values are rounded to the nearest bin + value. + """ dtype: torch.dtype = field(init=False, repr=False) is_unit_float: bool = field(init=False, repr=False) @@ -102,6 +131,17 @@ def float( log: bool = False, bins: int | None = None, ) -> Domain[float]: + """Create a domain for a range of float values. + + Args: + lower: The lower bound of the domain. + upper: The upper bound of the domain. + log: Whether the domain is in log space. + bins: The number of discrete bins to split the domain into. + + Returns: + A domain for a range of float values. + """ return Domain( lower=float(lower), upper=float(upper), @@ -119,6 +159,17 @@ def int( log: bool = False, bins: int | None = None, ) -> Domain[int]: + """Create a domain for a range of integer values. + + Args: + lower: The lower bound of the domain. + upper: The upper bound of the domain. + log: Whether the domain is in log space. 
+ bins: The number of discrete bins to split the domain into. + + Returns: + A domain for a range of integer values. + """ return Domain( lower=int(round(lower)), upper=int(round(upper)), @@ -134,13 +185,25 @@ def indices(cls, n: int) -> Domain[int]: Like range based functions this domain is inclusive of the lower bound and exclusive of the upper bound. - Use this method to create a domain for indices + Args: + n: The number of indices. + + Returns: + A domain for a range of indices. """ return Domain.int(0, n - 1) def to_unit(self, x: Tensor) -> Tensor: + """Transform a tensor of values from this domain to the unit interval [0, 1]. + + Args: + x: Tensor of values in this domain to convert. + + Returns: + Same shape tensor with the values normalized to the unit interval [0, 1]. + """ if self.is_unit_float: - return x # type: ignore + return x if self.log_bounds is not None: x = torch.log(x) @@ -151,6 +214,14 @@ def to_unit(self, x: Tensor) -> Tensor: return (x - lower) / (upper - lower) def from_unit(self, x: Tensor) -> Tensor: + """Transform a tensor of values from the unit interval [0, 1] to this domain. + + Args: + x: A tensor of values in the unit interval [0, 1] to convert. + + Returns: + Same shape tensor with the lifted into this domain. + """ if self.is_unit_float: return x @@ -173,11 +244,19 @@ def from_unit(self, x: Tensor) -> Tensor: return x.type(self.dtype) - def cast( - self, - x: Tensor, - frm: Domain, - ) -> Tensor: + def cast(self, x: Tensor, frm: Domain) -> Tensor: + """Cast a tensor of values frm the domain `frm` to this domain. + + If you need to cast a tensor of mixed domains, use + [`Domain.translate()`][neps.search_spaces.domain.Domain.translate]. + + Args: + x: Tensor of values in the `frm` domain to cast to this domain. + frm: The domain to cast from. + + Returns: + Same shape tensor with the values cast to this domain. + """ # NOTE: In general, we should always be able to go through the unit interval # [0, 1] to be able to transform between domains. However sometimes we can # bypass some steps, dependant on the domains, hence the ugliness... @@ -216,16 +295,20 @@ def cast( @classmethod def unit_float(cls) -> Domain[float]: + """Get a domain for the unit interval [0, 1].""" return UNIT_FLOAT_DOMAIN @classmethod - def cast_many( - cls, x: Tensor, frm: Domain | Sequence[Domain], to: Domain | Sequence[Domain] + def translate( + cls, + x: Tensor, + frm: Domain | Iterable[Domain], + to: Domain | Iterable[Domain], ) -> Tensor: """Cast a tensor of mixed domains to a new set of mixed domains. Args: - x: Tensor of shape (n_samples, n_dims) with each dim `i` corresponding + x: Tensor of shape (..., n_dims) with each dim `i` corresponding to the domain `frm[i]`. frm: List of domains to cast from. If list, must be length of `n_dims`, otherwise we assume the single domain provided is the one to be used @@ -235,43 +318,43 @@ def cast_many( across all dimensions. Returns: - Tensor of shape (n_samples, n_dims) with each dim `i` transformed - from the domain `frm[i]` to the domain `to[i]`. + Tensor of the same shape as `x` with the last dimension casted + from the domain `frm[i]` to the domain `to[i]`. """ + if x.ndim == 0: + raise ValueError("Expected a tensor with at least one dimension.") + if x.ndim == 1: - raise ValueError( - "Expected a 2D tensor of shape (n_samples, n_dims), got a 1D tensor." 
- ) + x = x.unsqueeze(0) - if isinstance(frm, Sequence) and len(frm) != x.shape[1]: + ndims = x.shape[-1] + + # If both are not a list, we can just cast the whole tensor + if isinstance(frm, Domain) and isinstance(to, Domain): + return to.cast(x, frm=frm) + + frm = [frm] * ndims if isinstance(frm, Domain) else list(frm) + to = [to] * ndims if isinstance(to, Domain) else list(to) + + if len(frm) != ndims: raise ValueError( "The number of domains in `frm` must match the number of tensors" " if provided as a list." - f" Expected {x.shape[1]}, got {len(frm)}." + f" Expected {ndims} from last dimension of {x.shape}, got {len(frm)}." ) - if isinstance(to, Sequence) and len(to) != x.shape[1]: + if len(to) != ndims: raise ValueError( "The number of domains in `to` must match the number of tensors" " if provided as a list." - f" Expected {x.shape[1]}, got {len(to)}." + f" Expected {ndims} from last dimension of {x.shape}, got {len(to)}." ) - # If both are not a list, we can just cast the whole tensor - if not isinstance(frm, Sequence) and not isinstance(to, Sequence): - return to.cast(x, frm=frm) - - # Otherwise, we need to go column by column - if isinstance(frm, Domain): - frm = [frm] * x.shape[1] - if isinstance(to, Domain): - to = [to] * x.shape[1] - - buffer = torch.empty_like(x) + out = torch.empty_like(x) for i, (f, t) in enumerate(zip(frm, to)): - buffer[:, i] = t.cast(x[:, i], frm=f) + out[..., i] = t.cast(x[..., i], frm=f) - return buffer + return out UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 21d7acd4..ad3b00ae 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,17 +1,14 @@ from __future__ import annotations from dataclasses import dataclass, field -from itertools import chain from typing import ( TYPE_CHECKING, Any, Generic, Mapping, Sequence, - Sized, TypeAlias, TypeVar, - overload, ) from typing_extensions import Protocol, override @@ -19,19 +16,19 @@ import numpy.typing as npt import torch from grakel.utils import graph_from_networkx -from torch._dynamo.utils import product -from neps.search_spaces.architecture.graph_grammar import GraphParameter from neps.search_spaces.domain import ( UNIT_FLOAT_DOMAIN, Domain, ) +from neps.search_spaces.hyperparameters.categorical import CategoricalParameter from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter if TYPE_CHECKING: import networkx as nx + from neps.search_spaces.parameter import Parameter from neps.search_spaces.search_space import SearchSpace WLInput: TypeAlias = tuple[dict, dict | None, dict | None] @@ -192,18 +189,18 @@ def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: @dataclass class TensorEncoder: transformers: dict[str, TensorTransformer] - column_lookup: dict[str, int] = field(init=False) + index_of: dict[str, int] = field(init=False) n_numerical: int = field(init=False) n_categorical: int = field(init=False) def __post_init__(self): transformers = sorted(self.transformers.items(), key=lambda t: t[0]) self.transformers = dict(transformers) - self.column_lookup: dict[str, int] = {} + self.index_of: dict[str, int] = {} n_numerical = 0 n_categorical = 0 for i, (name, transformer) in enumerate(self.transformers.items()): - self.column_lookup[name] = i + self.index_of[name] = i if isinstance(transformer, CategoricalToIntegerTransformer): n_categorical += 1 else: @@ -212,6 +209,11 @@ def 
__post_init__(self): self.n_numerical = n_numerical self.n_categorical = n_categorical + @property + def ncols(self) -> int: + return len(self.transformers) + + @property def domains(self) -> dict[str, Domain]: return { name: transformer.domain for name, transformer in self.transformers.items() @@ -222,10 +224,9 @@ def names(self) -> list[str]: def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: if isinstance(hp, str): - return x[:, self.column_lookup[hp]] + return x[:, self.index_of[hp]] - cols = torch.concatenate([torch.arange(*self.column_lookup[h]) for h in hp]) - return x[:, cols] + return x[:, [self.index_of[h] for h in hp]] def encode( self, @@ -238,7 +239,7 @@ def encode( for hp_name, transformer in self.transformers.items(): values = [conf[hp_name] for conf in x] - lookup = self.column_lookup[hp_name] + lookup = self.index_of[hp_name] # Encode directly into buffer transformer.encode( @@ -250,314 +251,85 @@ def encode( return buffer + def pack( + self, x: Sequence[Mapping[str, Any]], *, device: torch.device | None = None + ) -> TensorPack: + return TensorPack(self.encode(x, device=device), self) + def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): - lookup = self.column_lookup[hp_name] + lookup = self.index_of[hp_name] tensor = x[:, lookup] values[hp_name] = transformer.decode(tensor) keys = list(values.keys()) return [dict(zip(keys, vals)) for vals in zip(*values.values())] - -@dataclass -class DataEncoder: - tensors: TensorEncoder | None = None - graphs: GraphEncoder | None = None - device: torch.device = field(default_factory=lambda: torch.device("cpu")) - - n_numerical: int = field(init=False) - n_categorical: int = field(init=False) - n_graphs: int = field(init=False) - - def __post_init__(self): - self.n_numerical = 0 if self.tensors is None else self.tensors.n_numerical - self.n_categorical = 0 if self.tensors is None else self.tensors.n_categorical - self.n_graphs = 0 if self.graphs is None else len(self.graphs.transformers) - - def encode( - self, - x: Sequence[Mapping[str, Any]], - *, - device: torch.device | None = None, - ) -> DataPack: - tensor = self.tensors.encode(x, device=device) if self.tensors else None - graphs = self.graphs.encode(x) if self.graphs else None - return DataPack(encoder=self, tensor=tensor, graphs=graphs) - - @overload - def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: ... - - @overload - def select( - self, x: npt.NDArray[np.object_], hp: str | Sequence[str] - ) -> npt.NDArray[np.object_]: ... 
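# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): a minimal sketch of the Domain
# API documented in neps/search_spaces/domain.py above, which also backs the
# domains exposed by TensorEncoder. The hyperparameter ranges below are made
# up for the example; dtype and rounding details are elided.
import torch
from neps.search_spaces.domain import Domain

lr_dom = Domain.float(1e-4, 1e-1, log=True)   # log-spaced float range
layers_dom = Domain.int(1, 8)                 # integer range
choice_dom = Domain.indices(3)                # categorical encoded as 0, 1, 2

x = torch.tensor([[1e-2, 4.0, 2.0], [1e-3, 1.0, 0.0]], dtype=torch.float64)

# Columns of mixed domains can be normalized in one go via the unit interval:
unit = Domain.translate(
    x, frm=[lr_dom, layers_dom, choice_dom], to=Domain.unit_float()
)
# ...and a single column can be lifted back into its own domain afterwards:
lr_restored = lr_dom.cast(unit[:, 0], frm=Domain.unit_float())
# ---------------------------------------------------------------------------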
- - def select( - self, - x: torch.Tensor | npt.NDArray[np.object_], - hp: str | Sequence[str], - ) -> torch.Tensor | npt.NDArray[np.object_]: - if isinstance(x, torch.Tensor): - assert self.tensors is not None - return self.tensors.select(x, hp) - - assert self.graphs is not None - return self.graphs.select(x, hp) - - def decode_dicts( - self, - x: torch.Tensor - | npt.NDArray[np.object_] - | tuple[torch.Tensor | None, npt.NDArray[np.object_] | None], - ) -> list[dict[str, Any]]: - if isinstance(x, tuple): - tensors, graphs = x - elif isinstance(x, torch.Tensor): - tensors, graphs = x, None - else: - tensors, graphs = None, x - - tensor_values: list[dict[str, Any]] | None = None - if tensors is not None: - assert self.tensors is not None - tensor_values = self.tensors.decode_dicts(tensors) - - graph_values: list[dict[str, Any]] | None = None - if graphs is not None: - assert self.graphs is not None - graph_values = self.graphs.decode_dicts(graphs) - - if tensor_values is not None and graph_values is not None: - assert len(tensor_values) == len(graph_values) - return [{**t, **g} for t, g in zip(tensor_values, graph_values)] - - if tensor_values is not None: - return tensor_values - - assert graph_values is not None - return graph_values - - def indices(self, hp: str | Sequence[str]) -> tuple[int, ...]: - if isinstance(hp, str): - if self.tensors and hp in self.tensors.transformers: - lower, upper = self.tensors.column_lookup[hp] - return tuple(torch.arange(lower, upper).tolist()) - - if self.graphs and hp in self.graphs.transformers: - raise ValueError("Cannot select indices from graphs.") - - tkeys = None if self.tensors is None else self.tensors.transformers.keys() - gkeys = None if self.graphs is None else self.graphs.transformers.keys() - raise KeyError( - f"Unknown hyperparameter {hp}. 
Not in either tensors or graphs" - f"\nTensors: {tkeys}" - f"\nGraphs: {gkeys}" - ) - - return tuple(sorted(chain.from_iterable(self.indices(h) for h in hp))) - @classmethod - def default_encoder( - cls, - space: SearchSpace, - *, - include_fidelities: bool | list[str] = False, - ) -> DataEncoder: - tensor_transformers: dict[str, TensorTransformer] = {} - graph_transformers: dict[str, WLInputTransformer] = {} - - for hp_name, hp in space.categoricals.items(): - tensor_transformers[hp_name] = CategoricalToIntegerTransformer(hp.choices) - - for hp_name, hp in space.numerical.items(): - assert isinstance(hp, (FloatParameter, IntegerParameter)) - tensor_transformers[hp_name] = MinMaxNormalizer(hp.domain) - - for hp_name, hp in space.graphs.items(): - assert isinstance(hp, GraphParameter) - graph_transformers[hp_name] = WLInputTransformer(hp_name) - - if include_fidelities is True: - include_fidelities = list(space.fidelities.keys()) - - if include_fidelities: - for fid_name in include_fidelities: - hp = space.fidelities[fid_name] - assert isinstance(hp, (FloatParameter, IntegerParameter)) - tensor_transformers[fid_name] = MinMaxNormalizer(hp.domain) - - tensor_encoder = ( - TensorEncoder(tensor_transformers) if any(tensor_transformers) else None - ) - graph_encoder = ( - GraphEncoder(graph_transformers) if any(graph_transformers) else None - ) - return DataEncoder(tensors=tensor_encoder, graphs=graph_encoder) - - def has_categoricals(self) -> bool: - return self.tensors is not None and any( - isinstance(t, CategoricalToIntegerTransformer) - for t in self.tensors.transformers.values() - ) - - def has_graphs(self) -> bool: - return self.graphs is not None - - def has_numericals(self) -> bool: - return self.tensors is not None and any( - not isinstance(t, CategoricalToIntegerTransformer) - for t in self.tensors.transformers.values() - ) + def default(cls, parameters: Mapping[str, Parameter]) -> TensorEncoder: + sorted_params = sorted(parameters.items()) + transformers: dict[str, TensorTransformer] = {} + for name, hp in sorted_params: + if isinstance(hp, (FloatParameter, IntegerParameter)): + transformers[name] = MinMaxNormalizer(hp.domain) + else: + assert isinstance(hp, CategoricalParameter) + transformers[name] = CategoricalToIntegerTransformer(hp.choices) - def categorical_product_indices(self) -> list[dict[int, int]]: - cats: dict[int, list[int]] = {} - if self.tensors is None: - return [] + return TensorEncoder(transformers) - for i, (_hp_name, transformer) in enumerate(self.tensors.transformers.items()): - if isinstance(transformer, CategoricalToIntegerTransformer): - cats[i] = list(range(len(transformer.choices))) - if len(cats) == 0: - return [] +@dataclass +class TensorPack: + tensor: torch.Tensor + encoder: TensorEncoder - if len(cats) == 1: - key, values = cats.popitem() - return [{key: v} for v in values] + def __len__(self) -> int: + return len(self.tensor) - return [dict(zip(cats.keys(), vs)) for vs in product(*cats.values())] + @property + def n_numerical(self) -> int: + return self.encoder.n_numerical + @property + def n_categorical(self) -> int: + return self.encoder.n_categorical -@dataclass -class DataPack(Sized): - encoder: DataEncoder - tensor: torch.Tensor | None = None - graphs: npt.NDArray[np.object_] | None = None - _len: int = field(init=False) + @property + def ncols(self) -> int: + return self.encoder.ncols - def __post_init__(self): - if self.tensor is not None and self.graphs is not None: - assert len(self.tensor) == len(self.graphs) - self._len = len(self.tensor) - 
elif self.tensor is not None: - self._len = len(self.tensor) - elif self.graphs is not None: - self._len = len(self.graphs) - else: - raise ValueError("At least one of numerical or graphs must be provided") - - def __len__(self) -> int: - return self._len + @property + def domains(self) -> dict[str, Domain]: + return self.encoder.domains def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.object_]: - if isinstance(hp, str): - if self.encoder.tensors and hp in self.encoder.tensors.transformers: - assert self.tensor is not None - return self.encoder.tensors.select(self.tensor, hp) - - if self.encoder.graphs and hp in self.encoder.graphs.transformers: - assert self.graphs is not None - return self.encoder.graphs.select(self.graphs, hp) - - tkeys = ( - None - if self.encoder.tensors is None - else self.encoder.tensors.transformers.keys() - ) - gkeys = ( - None - if self.encoder.graphs is None - else self.encoder.graphs.transformers.keys() - ) - raise KeyError( - f"Unknown hyperparameter {hp}. Not in either tensors or graphs" - f"\nTensors: {tkeys}" - f"\nGraphs: {gkeys}" - ) + return self.encoder.select(self.tensor, hp) - all_in_tensors = False - all_in_graphs = False - tkeys = None - gkeys = None - if self.encoder.tensors: - all_in_tensors = all(h in self.encoder.tensors.transformers for h in hp) - - if self.encoder.graphs: - all_in_graphs = all(h in self.encoder.graphs.transformers for h in hp) - gkeys = self.encoder.graphs.transformers.keys() - - if not all_in_tensors and not all_in_graphs: - raise ValueError( - "Cannot select from both tensors and graphs!" - f"Got keys: {hp}" - f"\nTensors: {tkeys}" - f"\nGraphs: {gkeys}" - ) + def names(self) -> list[str]: + return self.encoder.names() - if all_in_tensors: - assert self.tensor is not None - assert self.encoder.tensors is not None - return self.encoder.tensors.select(self.tensor, hp) - - assert self.graphs is not None - assert self.encoder.graphs is not None - return self.encoder.graphs.select(self.graphs, hp) - - def decode(self, space: SearchSpace) -> list[SearchSpace]: - return [ - space.from_dict(d) - for d in self.encoder.decode_dicts((self.tensor, self.graphs)) - ] - - def split(self, index: int) -> tuple[DataPack, DataPack]: - if self.tensor is not None: - numerical_left = self.tensor[:index] - numerical_right = self.tensor[index:] - else: - numerical_left = None - numerical_right = None + def to_dicts(self) -> list[dict[str, Any]]: + return self.encoder.decode_dicts(self.tensor) - if self.graphs is not None: - graphs_left = self.graphs[:index] - graphs_right = self.graphs[:index] - else: - graphs_left = None - graphs_right = None - - return ( - DataPack( - self.encoder, - tensor=numerical_left, - graphs=graphs_left, - ), - DataPack( - self.encoder, - tensor=numerical_right, - graphs=graphs_right, - ), - ) + def split(self, index: int) -> tuple[TensorPack, TensorPack]: + left = TensorPack(self.encoder, tensor=self.tensor[:index]) + right = TensorPack(self.encoder, tensor=self.tensor[index:]) + return left, right - def join(self, *other: DataPack) -> DataPack: + def join(self, *other: TensorPack) -> TensorPack: assert all(o.encoder == self.encoder for o in other) - if self.tensor is not None: - other_numericals = [] - for o in other: - assert o.tensor is not None - other_numericals.append(o.tensor) - numerical = torch.cat([self.tensor, *other_numericals], dim=0) - else: - numerical = None - - if self.graphs is not None: - other_graphs = [] - for o in other: - assert o.graphs is not None - 
other_graphs.append(o.graphs) - graphs = np.concatenate([self.graphs, *other_graphs], axis=0) - else: - graphs = None + numerical = torch.cat([self.tensor, *[o.tensor for o in other]], dim=0) + return TensorPack(self.encoder, tensor=numerical) - return DataPack(self.encoder, tensor=numerical, graphs=graphs) + @classmethod + def default_encoding( + cls, + x: Sequence[Mapping[str, Any]], + space: SearchSpace, + ) -> TensorPack: + default_encoder = TensorEncoder.default(space) + tensor = default_encoder.encode(x) + return TensorPack(default_encoder, tensor) diff --git a/neps/search_spaces/search_space.py b/neps/search_spaces/search_space.py index 40ecd0cf..671728f0 100644 --- a/neps/search_spaces/search_space.py +++ b/neps/search_spaces/search_space.py @@ -235,10 +235,10 @@ def __init__(self, **hyperparameters: Parameter): self.categoricals: Mapping[str, CategoricalParameter] = { k: hp for k, hp in _hyperparameters if isinstance(hp, CategoricalParameter) } - self.numerical: Mapping[str, NumericalParameter] = { + self.numerical: Mapping[str, IntegerParameter | FloatParameter] = { k: hp for k, hp in _hyperparameters - if isinstance(hp, NumericalParameter) and not hp.is_fidelity + if isinstance(hp, IntegerParameter | FloatParameter) and not hp.is_fidelity } self.graphs: Mapping[str, GraphParameter] = { k: hp for k, hp in _hyperparameters if isinstance(hp, GraphParameter) @@ -247,8 +247,9 @@ def __init__(self, **hyperparameters: Parameter): k: hp.value for k, hp in _hyperparameters if isinstance(hp, ConstantParameter) } # NOTE: For future of multiple fidelities - self.fidelities: Mapping[str, NumericalParameter] = {} - if _fidelity_param is not None and _fidelity_name is None: + self.fidelities: Mapping[str, IntegerParameter | FloatParameter] = {} + if _fidelity_param is not None and _fidelity_name is not None: + assert isinstance(_fidelity_param, IntegerParameter | FloatParameter) self.fidelities = {_fidelity_name: _fidelity_param} def set_custom_grid_space( From 1cdc3074636024c5ad693b66a99486cfb6b6a013 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 13:55:17 +0200 Subject: [PATCH 18/63] refactor: Weighted Sampler --- .../bayesian_optimization/optimizer.py | 7 +- neps/sampling/__init__.py | 4 + neps/sampling/priors.py | 85 +++++++---------- neps/sampling/samplers.py | 95 ++++++++++++++++++- 4 files changed, 133 insertions(+), 58 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 83a65ce2..3b7d7f4b 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -17,8 +17,7 @@ default_single_obj_gp, optimize_acq, ) -from neps.optimizers.initial_design import PriorInitialDesign, Sobol -from neps.priors import Prior +from neps.sampling import Prior, Sampler from neps.search_spaces.domain import Domain from neps.search_spaces.encoding import TensorEncoder, TensorPack from neps.search_spaces.hyperparameters.categorical import CategoricalParameter @@ -216,9 +215,9 @@ def ask( self.initial_design_.append(config.hp_values()) if self.prior: - sampler = PriorInitialDesign(prior=self.prior, seed=0) + sampler = self.prior else: - sampler = Sobol(ndim=self.encoder.ncols, seed=0, scramble=True) + sampler = Sampler.sobol(ndim=self.encoder.ncols, seed=0, scramble=True) n_samples = self.n_initial_design - len(self.initial_design_) diff --git a/neps/sampling/__init__.py b/neps/sampling/__init__.py index e69de29b..a7f4f36f 100644 --- 
a/neps/sampling/__init__.py +++ b/neps/sampling/__init__.py @@ -0,0 +1,4 @@ +from neps.sampling.priors import CenteredPrior, Prior, UniformPrior, WeightedPrior +from neps.sampling.samplers import Sampler, Sobol + +__all__ = ["Sobol", "Sampler", "Prior", "UniformPrior", "CenteredPrior", "WeightedPrior"] diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 8ccc76e9..f7976bf4 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -10,14 +10,13 @@ from __future__ import annotations from dataclasses import dataclass, field -from functools import reduce -from typing import TYPE_CHECKING, Any, Container, Mapping, Protocol +from typing import TYPE_CHECKING, Any, Container, Iterable, Mapping, Protocol, Sequence from typing_extensions import override import torch from neps.distributions import DistributionOverDomain, TruncatedNormal -from neps.sampling.samplers import Sampler +from neps.sampling.samplers import Sampler, WeightedSampler from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain if TYPE_CHECKING: @@ -228,6 +227,18 @@ def make_centered( return CenteredPrior(distributions=distributions) + @classmethod + def weighted(cls, priors: Iterable[Prior], weights: torch.Tensor) -> WeightedPrior: + """Create a weighted prior for a given list of priors. + + Args: + priors: The list of priors to sample from. + weights: The weights for each prior. Will be normalized to sum to 1. + Please specify the device of your weights if required. + """ + priors = list(priors) + return WeightedPrior(priors=list(priors), weights=weights) + @dataclass class CenteredPrior(Prior): @@ -353,21 +364,30 @@ def sample( class WeightedPrior(Prior): """A prior consisting of multiple priors with weights.""" - priors: list[Prior] + priors: Sequence[Prior] + """The list of priors to sample from.""" + weights: torch.Tensor - probabilities: torch.Tensor = field(init=False, repr=False) + """The weights for each prior.""" + + _weighted_sampler: WeightedSampler = field(init=False, repr=False) def __post_init__(self): - if len(self.priors) < 2: - raise ValueError(f"At least two priors must be given. Got {len(self.priors)}") + from neps.sampling.samplers import WeightedSampler - if self.weights.ndim != 1: - raise ValueError("Weights must be a 1D tensor.") + self._weighted_sampler = WeightedSampler( + samplers=self.priors, weights=self.weights + ) - if len(self.priors) != len(self.weights): - raise ValueError("The number of priors and weights must be the same.") + @property + def probabilities(self) -> torch.Tensor: + """The probabilities for each sampler. 
Normalized weights.""" + return self._weighted_sampler.probabilities - self.probabilities = self.weights / self.weights.sum() + @property + @override + def ncols(self) -> int: + return self._weighted_sampler.ncols @override def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: @@ -391,43 +411,4 @@ def sample( seed: int | None = None, device: torch.device | None = None, ) -> torch.Tensor: - if seed is not None: - raise NotImplementedError("Seeding is not yet implemented.") - - # Calculate the total number of samples required - if isinstance(n, int): - total_samples = n - output_shape = (n, self.ncols) - else: - total_samples = reduce(lambda x, y: x * y, n) - output_shape = (*n, self.ncols) - - # Randomly select which prior to sample from for each of the total_samples - chosen_priors = torch.empty((total_samples,), device=device, dtype=torch.int64) - chosen_priors = torch.multinomial( - self.probabilities, - total_samples, - replacement=True, - out=chosen_priors, - ) - - # Create an empty tensor to hold all samples - output_samples = torch.empty( - (total_samples, self.ncols), device=device, dtype=torch.float64 - ) - - # Loop through each prior and its associated indices - for i, prior in enumerate(self.priors): - # Find indices where the chosen prior is i - _i = torch.tensor(i, dtype=torch.int64, device=device) - indices = torch.where(chosen_priors == _i)[0] - - if len(indices) > 0: - # Sample from the prior for the required number of indices - samples_from_prior = prior.sample(len(indices), to=to, device=device) - output_samples[indices] = samples_from_prior - - # Reshape to the output shape including ncols dimension - output_samples = output_samples.view(output_shape) - - return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) + return self._weighted_sampler.sample(n, to=to, seed=seed, device=device) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index f0298d84..6802f6d7 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -6,12 +6,13 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import reduce -from typing import Protocol +from typing import Protocol, Sequence from typing_extensions import override import torch +from more_itertools import all_equal from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain @@ -118,3 +119,93 @@ def sample( x = x.view(*n, self.ncols) return Domain.translate(x, frm=UNIT_FLOAT_DOMAIN, to=to) + + +@dataclass +class WeightedSampler(Sampler): + """A sampler that samples from a weighted combination of samplers.""" + + samplers: Sequence[Sampler] + """The samplers to sample from.""" + + weights: torch.Tensor + """The weights for each sampler.""" + + probabilities: torch.Tensor = field(init=False, repr=False) + """The probabilities for each sampler. Normalized weights.""" + + def __post_init__(self): + if len(self.samplers) < 2: + raise ValueError( + f"At least two samplers must be given. Got {len(self.samplers)}" + ) + + if self.weights.ndim != 1: + raise ValueError("Weights must be a 1D tensor.") + + if len(self.samplers) != len(self.weights): + raise ValueError("The number of samplers and weights must be the same.") + + ncols = [sampler.ncols for sampler in self.samplers] + if not all_equal(ncols): + raise ValueError( + "All samplers must have the same number of columns." f" Got {ncols}." 
+ ) + + self._ncols = ncols[0] + self.probabilities = self.weights / self.weights.sum() + + @property + @override + def ncols(self) -> int: + return self._ncols + + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Seeding is not yet implemented.") + + # Calculate the total number of samples required + if isinstance(n, int): + total_samples = n + output_shape = (n, self.ncols) + else: + total_samples = reduce(lambda x, y: x * y, n) + output_shape = (*n, self.ncols) + + # Randomly select which prior to sample from for each of the total_samples + chosen_priors = torch.empty((total_samples,), device=device, dtype=torch.int64) + chosen_priors = torch.multinomial( + self.probabilities, + total_samples, + replacement=True, + out=chosen_priors, + ) + + # Create an empty tensor to hold all samples + output_samples = torch.empty( + (total_samples, self.ncols), device=device, dtype=torch.float64 + ) + + # Loop through each prior and its associated indices + for i, prior in enumerate(self.samplers): + # Find indices where the chosen prior is i + _i = torch.tensor(i, dtype=torch.int64, device=device) + indices = torch.where(chosen_priors == _i)[0] + + if len(indices) > 0: + # Sample from the prior for the required number of indices + samples_from_prior = prior.sample(len(indices), to=to, device=device) + output_samples[indices] = samples_from_prior + + # Reshape to the output shape including ncols dimension + output_samples = output_samples.view(output_shape) + + return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) From dd7200887038be419febc2b8cdda9951404f1fad Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 14:17:20 +0200 Subject: [PATCH 19/63] refactor: Simplify BO some more --- .../bayesian_optimization/optimizer.py | 231 ++++++++---------- neps/search_spaces/encoding.py | 18 +- 2 files changed, 108 insertions(+), 141 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 3b7d7f4b..700ce673 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -18,7 +18,6 @@ optimize_acq, ) from neps.sampling import Prior, Sampler -from neps.search_spaces.domain import Domain from neps.search_spaces.encoding import TensorEncoder, TensorPack from neps.search_spaces.hyperparameters.categorical import CategoricalParameter @@ -28,13 +27,16 @@ from neps.search_spaces import ( SearchSpace, ) + from neps.search_spaces.domain import Domain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.state import BudgetInfo, Trial -def pibo_acq_beta_and_n( - n_sampled_already: int, ndims: int, budget_info: BudgetInfo +def _pibo_acq_beta_and_n( + n_sampled_already: int, + ndims: int, + budget_info: BudgetInfo, ) -> tuple[float, float]: if budget_info.max_evaluations is not None: # From the PIBO paper (Section 4.1) @@ -56,10 +58,47 @@ def pibo_acq_beta_and_n( return n_sampled_already, beta +# TODO: This needs to be moved to the search space class, however +# to not break the current prior based APIs used elsewhere, we can +# just manually create this here. +# We use confidence here where `0` means no confidence and `1` means +# absolute confidence. 
This gets translated in to std's and weights +# accordingly in a `CenteredPrior` +def _make_prior( + parameters: dict[str, CategoricalParameter | FloatParameter | IntegerParameter], +) -> Prior: + _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} + + domains: dict[str, Domain] = {} + centers: dict[str, tuple[Any, float]] = {} + categoricals: set[str] = set() + for name, hp in parameters.items(): + domains[name] = hp.domain # type: ignore + + if isinstance(hp, CategoricalParameter): + categoricals.add(name) + + if hp.default is None: + continue + + confidence_str = hp.default_confidence_choice + confidence_score = _mapping[confidence_str] + center = hp._default_index if isinstance(hp, CategoricalParameter) else hp.default + + centers[name] = (center, confidence_score) + + # Uses truncnorms for numerical and weighted choices categoricals + return Prior.make_centered( + domains=domains, + centers=centers, + categoricals=categoricals, + ) + + class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" - def __init__( + def __init__( # noqa: D417 self, pipeline_space: SearchSpace, *, @@ -97,83 +136,30 @@ def __init__( ValueError: if no kernel is provided """ if any(pipeline_space.graphs): - raise ValueError( - "BayesianOptimization currently only supports flat search spaces" - ) + raise NotImplementedError("Only supports flat search spaces for now!") + super().__init__(pipeline_space=pipeline_space) if initial_design_size is None: N = len(pipeline_space.hyperparameters) initial_design_size = int(max(1, math.log(N) ** 2)) elif initial_design_size < 1: - raise ValueError( - "BayesianOptimization needs initial_design_size to be at least 1" - ) - - super().__init__(pipeline_space=pipeline_space) + raise ValueError("Initial_design_size to be at least 1") - if encoder is None: - parameters: dict[ - str, - CategoricalParameter | FloatParameter | IntegerParameter, - ] = { - **pipeline_space.numerical, - **pipeline_space.categoricals, - } - if treat_fidelity_as_hyperparameters: - parameters.update(pipeline_space.fidelities) - - self.encoder = TensorEncoder.default(parameters) - else: - self.encoder = encoder - - # TODO: This needs to be moved to the search space class, however - # to not break the current prior based APIs used elsewhere, we can - # just manually create this here. - # We use confidence here where `0` means no confidence and `1` means - # absolute confidence. 
This gets translated in to std's and weights - # accordingly in a `CenteredPrior` - self.prior: Prior | None = None - if use_priors: - _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} - - domains: dict[str, Domain] = {} - centers: dict[str, tuple[Any, float]] = {} - categoricals: set[str] = set() - for name, hp in parameters.items(): - domains[name] = hp.domain # type: ignore - - if isinstance(hp, CategoricalParameter): - categoricals.add(name) - - if hp.default is None: - continue - - confidence_str = hp.default_confidence_choice - confidence_score = _mapping[confidence_str] - center = ( - hp._default_index - if isinstance(hp, CategoricalParameter) - else hp.default - ) - - centers[name] = (center, confidence_score) - - # Uses truncnorms for numerical and weighted choices categoricals - self.prior = Prior.make_centered( - domains=domains, - centers=centers, - categoricals=categoricals, - ) - else: - self.prior = None + params: dict[str, CategoricalParameter | FloatParameter | IntegerParameter] = { + **pipeline_space.numerical, + **pipeline_space.categoricals, + } + if treat_fidelity_as_hyperparameters: + params.update(pipeline_space.fidelities) + self.encoder = TensorEncoder.default(params) if encoder is None else encoder + self.prior = _make_prior(params) if use_priors is True else None self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - if surrogate_model == "gp": - self._get_fitted_model = default_single_obj_gp - else: - self._get_fitted_model = surrogate_model + self._get_fitted_model = ( + default_single_obj_gp if surrogate_model == "gp" else surrogate_model + ) self.initial_design_: list[dict[str, Any]] | None = None @@ -182,25 +168,12 @@ def ask( trials: Mapping[str, Trial], budget_info: BudgetInfo, optimizer_state: dict[str, Any], + seed: int | None = None, ) -> tuple[SampledConfig, dict[str, Any]]: - # TODO: Lift this into runtime, let the - # optimizer advertise the encoding wants... - completed = [ - t - for t in trials.values() - if t.report is not None and t.report.loss is not None - ] - x_configs = [t.config for t in completed] - y = torch.as_tensor( - [t.report.loss for t in completed], - dtype=torch.float64, - device=self.device, - ) # type: ignore - - if y.ndim == 1: - y = y.unsqueeze(1) - - pending = [t.config for t in trials.values() if t.state.pending()] + if seed is not None: + raise NotImplementedError( + "Seed is not yet implemented for BayesianOptimization" + ) space = self.pipeline_space config_id = str(len(trials) + 1) @@ -209,51 +182,55 @@ def ask( if self.initial_design_ is None: self.initial_design_ = [] - # Add the default configuration first (maybe) if self.sample_default_first: config = space.sample_default_configuration() self.initial_design_.append(config.hp_values()) - if self.prior: - sampler = self.prior - else: - sampler = Sampler.sobol(ndim=self.encoder.ncols, seed=0, scramble=True) - + sampler = ( + self.prior if self.prior else Sampler.sobol(self.encoder.ncols, seed=seed) + ) n_samples = self.n_initial_design - len(self.initial_design_) - # We add a buffer of 2x the samples to help ensure - # we get get enough after removing duplicates. 
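# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): after this refactor a Prior and
# a Sobol sampler share the same Sampler interface, which is why the initial
# design can draw from either one interchangeably. A rough sketch with made-up
# domains, assuming Sampler.sobol(...) behaves as used elsewhere in this patch:
from neps.sampling import Sampler
from neps.search_spaces.domain import Domain

domains = [Domain.float(1e-4, 1e-1, log=True), Domain.int(1, 8)]
sobol = Sampler.sobol(ndim=len(domains), seed=0)
x = sobol.sample(16, to=domains, seed=None)  # (16, 2), already in the target domains
# A Prior built via Prior.make_centered(...) could be passed in the same way,
# which is what ask() does when user priors are configured.
# ---------------------------------------------------------------------------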
- x = sampler.sample(n_samples * 2) - x = Domain.translate( - x, - to=self.encoder.domains.values(), - frm=sampler.sample_domain, + x = sampler.sample( + n_samples * 2, + to=self.encoder.domains, + seed=seed, + device=self.device, ) uniq_x = torch.unique(x, dim=0) - configs = self.encoder.decode_dicts(uniq_x[:n_samples]) + configs = self.encoder.unpack(uniq_x[:n_samples]) self.initial_design_.extend(configs) - # If we havn't passed the intial design phase, just return - # the next one. + # If we havn't passed the intial design phase if len(trials) < len(self.initial_design_): - config = self.initial_design_[len(trials)] - return ( - SampledConfig(id=config_id, config=config, previous_config_id=None), - optimizer_state, - ) + config = self.initial_design_[len(trials) - 1] + sample = SampledConfig(id=config_id, config=config, previous_config_id=None) + return sample, optimizer_state # Now we actually do the BO loop, start by encoding the data + # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... + x_configs: list[dict[str, Any]] = [] + ys: list[float] = [] + pending: list[dict[str, Any]] = [] + for trial in trials.values(): + if trial.state.pending(): + pending.append(trial.config) + else: + assert trial.report is not None + assert trial.report.loss is not None + x_configs.append(trial.config) + ys.append(trial.report.loss) + x = self.encoder.pack(x_configs, device=self.device) - x_pending = None - if any(pending): - x_pending = self.encoder.pack(pending, device=self.device) + x_pending = ( + None if len(pending) == 0 else self.encoder.pack(pending, device=self.device) + ) + y = torch.tensor(ys, dtype=torch.float64, device=self.device) + if y.ndim == 1: + y = y.unsqueeze(1) - # Get our fitted model model = self._get_fitted_model(x, y) - # Build our acquisition function. This takes care of pending - # configs through x_pending. - # TODO: We should evaluate whether LogNoisyEI is better than LogEI acq = qLogExpectedImprovement( model, best_f=y.min(), @@ -263,24 +240,14 @@ def ask( # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 objective=LinearMCObjective(weights=torch.tensor([-1.0])), ) - - # If we have a prior, then we use it with PiBO if self.prior: - n, beta = pibo_acq_beta_and_n( - n_sampled_already=len(trials), - ndims=self.encoder.ncols, - budget_info=budget_info, - ) + n, beta = _pibo_acq_beta_and_n(len(trials), self.encoder.ncols, budget_info) acq = PiboAcquisition(acq, prior=self.prior, n=n, beta=beta) - # Optimize it candidates, _eis = optimize_acq(acq_fn=acq, encoder=self.encoder, acq_options={}) - # Take the first (and only?) candidate - assert len(candidates) == 1 - config = self.encoder.decode_dicts(candidates)[0] + assert len(candidates) == 1, "Expected only one candidate!" 
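# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): _pibo_acq_beta_and_n above sets
# beta = max_evaluations / 10 (following the PiBO paper, Section 4.1) and n to
# the number of configurations sampled so far. Assuming PiboAcquisition applies
# the PiBO rule acq(x) * prior(x) ** (beta / n) (an assumption; its
# implementation is not shown in this diff), the prior's influence decays as
# data accumulates:
beta = 100 / 10                    # e.g. max_evaluations = 100
for n_seen in (1, 5, 10, 50, 100):
    print(n_seen, beta / n_seen)   # 10.0, 2.0, 1.0, 0.2, 0.1
# ---------------------------------------------------------------------------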
+ config = self.encoder.unpack(candidates)[0] - return ( - SampledConfig(id=config_id, config=config, previous_config_id=None), - optimizer_state, - ) + sample = SampledConfig(id=config_id, config=config, previous_config_id=None) + return sample, optimizer_state diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index ad3b00ae..71555fef 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -190,22 +190,24 @@ def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: class TensorEncoder: transformers: dict[str, TensorTransformer] index_of: dict[str, int] = field(init=False) + domain_of: dict[str, Domain] = field(init=False) n_numerical: int = field(init=False) n_categorical: int = field(init=False) def __post_init__(self): transformers = sorted(self.transformers.items(), key=lambda t: t[0]) self.transformers = dict(transformers) - self.index_of: dict[str, int] = {} + n_numerical = 0 n_categorical = 0 - for i, (name, transformer) in enumerate(self.transformers.items()): - self.index_of[name] = i + for _, transformer in transformers: if isinstance(transformer, CategoricalToIntegerTransformer): n_categorical += 1 else: n_numerical += 1 + self.index_of = {name: i for i, name in enumerate(self.transformers.keys())} + self.domain_of = {name: t.domain for name, t in self.transformers.items()} self.n_numerical = n_numerical self.n_categorical = n_categorical @@ -214,10 +216,8 @@ def ncols(self) -> int: return len(self.transformers) @property - def domains(self) -> dict[str, Domain]: - return { - name: transformer.domain for name, transformer in self.transformers.items() - } + def domains(self) -> list[Domain]: + return list(self.domain_of.values()) def names(self) -> list[str]: return list(self.transformers.keys()) @@ -256,7 +256,7 @@ def pack( ) -> TensorPack: return TensorPack(self.encode(x, device=device), self) - def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: + def unpack(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): lookup = self.index_of[hp_name] @@ -311,7 +311,7 @@ def names(self) -> list[str]: return self.encoder.names() def to_dicts(self) -> list[dict[str, Any]]: - return self.encoder.decode_dicts(self.tensor) + return self.encoder.unpack(self.tensor) def split(self, index: int) -> tuple[TensorPack, TensorPack]: left = TensorPack(self.encoder, tensor=self.tensor[:index]) From 8658d70f088b1b86be965e05b8b0bc3b2204889b Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 14:24:06 +0200 Subject: [PATCH 20/63] fix: Need at least 2 points for intial design --- neps/optimizers/bayesian_optimization/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 700ce673..9499926c 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -141,7 +141,7 @@ def __init__( # noqa: D417 if initial_design_size is None: N = len(pipeline_space.hyperparameters) - initial_design_size = int(max(1, math.log(N) ** 2)) + initial_design_size = int(max(2, math.log(N) ** 2)) elif initial_design_size < 1: raise ValueError("Initial_design_size to be at least 1") From b135c748062df903d073bc2cdde320b9a4236da6 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 14:25:33 +0200 Subject: [PATCH 21/63] fix: Ensure we do the last 
intial design --- neps/optimizers/bayesian_optimization/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 9499926c..139de839 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -202,7 +202,7 @@ def ask( self.initial_design_.extend(configs) # If we havn't passed the intial design phase - if len(trials) < len(self.initial_design_): + if len(trials) <= len(self.initial_design_): config = self.initial_design_[len(trials) - 1] sample = SampledConfig(id=config_id, config=config, previous_config_id=None) return sample, optimizer_state From 6fd4a2914b84cbf77ee5a08cb90d1ecf224c061a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 14:27:46 +0200 Subject: [PATCH 22/63] doc: Add todo note on no reported loss --- neps/optimizers/bayesian_optimization/optimizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 139de839..355fc39a 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -217,6 +217,7 @@ def ask( pending.append(trial.config) else: assert trial.report is not None + # TODO: Figure out what to do if there's no reported loss value. assert trial.report.loss is not None x_configs.append(trial.config) ys.append(trial.report.loss) From bb0eb38dac025145ff1b569f41771733c306a6fe Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 11:13:08 +0200 Subject: [PATCH 23/63] fix: Add in the GP optimization --- neps/optimizers/bayesian_optimization/optimizer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 355fc39a..d9f322aa 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -8,6 +8,7 @@ LinearMCObjective, qLogExpectedImprovement, ) +from gpytorch import ExactMarginalLogLikelihood from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( @@ -157,7 +158,7 @@ def __init__( # noqa: D417 self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - self._get_fitted_model = ( + self._get_model = ( default_single_obj_gp if surrogate_model == "gp" else surrogate_model ) @@ -230,7 +231,12 @@ def ask( if y.ndim == 1: y = y.unsqueeze(1) - model = self._get_fitted_model(x, y) + model = self._get_model(x, y) + + from botorch.fit import fit_gpytorch_mll + + mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) + _fit_mll = fit_gpytorch_mll(mll) acq = qLogExpectedImprovement( model, From 41deea6f6cac8602f95fda3b732ea80038a39726 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 11:14:01 +0200 Subject: [PATCH 24/63] fix: Memory efficient log_prior for CenteredPrior --- neps/sampling/priors.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index f7976bf4..2deda010 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -284,14 +284,15 @@ def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> 
torch.Tens # Calculate the log probabilities of the sample domain tensors under their # respective distributions. - log_probs = torch.cat( - [ - dist.distribution.log_prob(sample_domain_tensor[:, i]) - for i, dist in enumerate(self.distributions) - ], - dim=-1, - ) - return torch.sum(log_probs, dim=-1) + itr = enumerate(self.distributions) + first_i, first_dist = next(itr) + + log_probs = first_dist.distribution.log_prob(sample_domain_tensor[..., first_i]) + for i, dist in itr: + log_probs = log_probs + dist.distribution.log_prob( + sample_domain_tensor[..., i] + ) + return log_probs @override def sample( @@ -398,7 +399,7 @@ def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tens weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) for prob, prior in itr: - weighted_probs += prob * prior.log_prob(x, frm=frm) + weighted_probs = weighted_probs + prob * prior.log_prob(x, frm=frm) return weighted_probs From 205f503b4b9fb4603b7b3a7975a684716f733da9 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 18:35:55 +0200 Subject: [PATCH 25/63] feat: pibo and cost cooling --- .../acquisition_functions/_ehvi.py | 213 ------ .../acquisition_functions/base_acquisition.py | 17 - .../acquisition_functions/cost_cooling.py | 99 +-- .../acquisition_functions/ei.py | 120 --- .../acquisition_functions/mf_ei.py | 35 +- .../acquisition_functions/pibo.py | 63 ++ .../acquisition_functions/prior_weighted.py | 111 --- .../acquisition_functions/ucb.py | 60 -- .../weighted_acquisition.py | 147 ++++ .../bayesian_optimization/mf_tpe.py | 719 ------------------ .../bayesian_optimization/models/gp.py | 23 +- .../bayesian_optimization/optimizer.py | 187 ++++- .../optimizers/bayesian_optimization/sobol.py | 0 neps/optimizers/initial_design.py | 34 - neps/{ => sampling}/distributions.py | 2 +- neps/sampling/priors.py | 12 +- neps/sampling/samplers.py | 39 +- neps/search_spaces/distributions/__init__.py | 16 - .../distributions/distribution.py | 21 - neps/search_spaces/distributions/truncnorm.py | 112 --- .../distributions/uniform_float.py | 47 -- .../distributions/uniform_int.py | 46 -- .../distributions/weighted_ints.py | 91 --- neps/search_spaces/samplers/__init__.py | 9 - neps/search_spaces/samplers/model.py | 186 ----- neps/search_spaces/samplers/prior.py | 110 --- neps/search_spaces/samplers/sampler.py | 22 - neps/search_spaces/samplers/uniform.py | 79 -- .../samplers/weighted_sampler.py | 51 -- 29 files changed, 476 insertions(+), 2195 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/ei.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py delete mode 100644 neps/optimizers/bayesian_optimization/mf_tpe.py delete mode 100644 neps/optimizers/bayesian_optimization/sobol.py delete mode 100644 neps/optimizers/initial_design.py rename neps/{ => sampling}/distributions.py (99%) delete mode 100644 neps/search_spaces/distributions/__init__.py delete mode 100644 neps/search_spaces/distributions/distribution.py delete mode 100644 
neps/search_spaces/distributions/truncnorm.py delete mode 100644 neps/search_spaces/distributions/uniform_float.py delete mode 100644 neps/search_spaces/distributions/uniform_int.py delete mode 100644 neps/search_spaces/distributions/weighted_ints.py delete mode 100644 neps/search_spaces/samplers/__init__.py delete mode 100644 neps/search_spaces/samplers/model.py delete mode 100644 neps/search_spaces/samplers/prior.py delete mode 100644 neps/search_spaces/samplers/sampler.py delete mode 100644 neps/search_spaces/samplers/uniform.py delete mode 100644 neps/search_spaces/samplers/weighted_sampler.py diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py deleted file mode 100644 index 8722c545..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py +++ /dev/null @@ -1,213 +0,0 @@ -# from abc import ABC, abstractmethod -from itertools import product - -import torch -from torch import Tensor -from torch.distributions import Normal -from torch.nn import Module - -# class MultiObjectiveBaseAcqusition(ABC): -# def __init__(self, surrogate_models: dict): -# self.surrogate_models = surrogate_models -# -# def propose_location(self, *args): -# """Propose new locations for subsequent sampling -# This method should be overriden by respective acquisition function implementations.""" -# raise NotImplementedError -# -# def optimize(self): -# """This is the method that user should call for the Bayesian optimisation main loop.""" -# raise NotImplementedError -# -# @abstractmethod -# def eval(self, x, asscalar: bool = False): -# """Evaluate the acquisition function at point x2. This should be overridden by respective acquisition -# function implementations""" -# raise NotImplementedError -# -# def __call__(self, *args, **kwargs): -# return self.eval(*args, **kwargs) -# -# def reset_surrogate_model(self, surrogate_models: dict): -# for objective, surrogate_model in surrogate_models.items(): -# self.surrogate_models[objective] = surrogate_model -# - - -class ExpectedHypervolumeImprovement(Module): # , MultiObjectiveBaseAcqusition): - def __init__( - self, - model, - ref_point, - partitioning, - ) -> None: - r"""Expected Hypervolume Improvement supporting m>=2 outcomes. - - Implementation from BOtorch, adapted from - https://github.com/pytorch/botorch/blob/353f37649fa8d90d881e8ea20c11986b15723ef1/botorch/acquisition/multi_objective/analytic.py#L78 - - This implements the computes EHVI using the algorithm from [Yang2019]_, but - additionally computes gradients via auto-differentiation as proposed by - [Daulton2020qehvi]_. - - Note: this is currently inefficient in two ways due to the binary partitioning - algorithm that we use for the box decomposition: - - - We have more boxes in our decomposition - - If we used a box decomposition that used `inf` as the upper bound for - the last dimension *in all hypercells*, then we could reduce the number - of terms we need to compute from 2^m to 2^(m-1). [Yang2019]_ do this - by using DKLV17 and LKF17 for the box decomposition. - - TODO: Use DKLV17 and LKF17 for the box decomposition as in [Yang2019]_ for - greater efficiency. - - TODO: Add support for outcome constraints. - - Example: - >>> model = SingleTaskGP(train_X, train_Y) - >>> ref_point = [0.0, 0.0] - >>> EHVI = ExpectedHypervolumeImprovement(model, ref_point, partitioning) - >>> ehvi = EHVI(test_X) - - Args: - model: A fitted model. 
- ref_point: A list with `m` elements representing the reference point (in the - outcome space) w.r.t. to which compute the hypervolume. This is a - reference point for the objective values (i.e. after applying - `objective` to the samples). - partitioning: A `NondominatedPartitioning` module that provides the non- - dominated front and a partitioning of the non-dominated space in hyper- - rectangles. - objective: An `AnalyticMultiOutputObjective`. - """ - # TODO: we could refactor this __init__ logic into a - # HypervolumeAcquisitionFunction Mixin - if len(ref_point) != partitioning.num_outcomes: - raise ValueError( - "The length of the reference point must match the number of outcomes. " - f"Got ref_point with {len(ref_point)} elements, but expected " - f"{partitioning.num_outcomes}." - ) - ref_point = torch.tensor( - ref_point, - dtype=partitioning.pareto_Y.dtype, - device=partitioning.pareto_Y.device, - ) - better_than_ref = (partitioning.pareto_Y > ref_point).all(dim=1) - if not better_than_ref.any() and partitioning.pareto_Y.shape[0] > 0: - raise ValueError( - "At least one pareto point must be better than the reference point." - ) - super().__init__() - self.model = model - self.register_buffer("ref_point", ref_point) - self.partitioning = partitioning - cell_bounds = self.partitioning.get_hypercell_bounds() - self.register_buffer("cell_lower_bounds", cell_bounds[0]) - self.register_buffer("cell_upper_bounds", cell_bounds[1]) - # create indexing tensor of shape `2^m x m` - self._cross_product_indices = torch.tensor( - list(product(*[[0, 1] for _ in range(ref_point.shape[0])])), - dtype=torch.long, - device=ref_point.device, - ) - self.normal = Normal(0, 1) - - def psi(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> None: - r"""Compute Psi function. - - For each cell i and outcome k: - - Psi(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = ( - sigma_k * PDF((upper_{i,k} - mu_k) / sigma_k) + ( - mu_k - lower_{i,k} - ) * (1 - CDF(upper_{i,k} - mu_k) / sigma_k) - ) - - See Equation 19 in [Yang2019]_ for more details. - - Args: - lower: A `num_cells x m`-dim tensor of lower cell bounds - upper: A `num_cells x m`-dim tensor of upper cell bounds - mu: A `batch_shape x 1 x m`-dim tensor of means - sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped). - - Returns: - A `batch_shape x num_cells x m`-dim tensor of values. - """ - u = (upper - mu) / sigma - return sigma * self.normal.log_prob(u).exp() + (mu - lower) * ( - 1 - self.normal.cdf(u) - ) - - def nu(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> None: - r"""Compute Nu function. - - For each cell i and outcome k: - - nu(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = ( - upper_{i,k} - lower_{i,k} - ) * (1 - CDF((upper_{i,k} - mu_k) / sigma_k)) - - See Equation 25 in [Yang2019]_ for more details. - - Args: - lower: A `num_cells x m`-dim tensor of lower cell bounds - upper: A `num_cells x m`-dim tensor of upper cell bounds - mu: A `batch_shape x 1 x m`-dim tensor of means - sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped). - - Returns: - A `batch_shape x num_cells x m`-dim tensor of values. 
- """ - return (upper - lower) * (1 - self.normal.cdf((upper - mu) / sigma)) - - def forward(self, X: Tensor) -> Tensor: - posterior = [[_m.predict(_x) for _m in self.model] for _x in X] - mu = torch.tensor([[_m[0].item() for _m in _p] for _p in posterior])[:, None, :] - sigma = torch.tensor([[_s[1].item() for _s in _p] for _p in posterior])[ - :, None, : - ] - - # clamp here, since upper_bounds will contain `inf`s, which - # are not differentiable - cell_upper_bounds = self.cell_upper_bounds.clamp_max(1e8) - # Compute psi(lower_i, upper_i, mu_i, sigma_i) for i=0, ... m-2 - psi_lu = self.psi( - lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma - ) - # Compute psi(lower_m, lower_m, mu_m, sigma_m) - psi_ll = self.psi( - lower=self.cell_lower_bounds, - upper=self.cell_lower_bounds, - mu=mu, - sigma=sigma, - ) - # Compute nu(lower_m, upper_m, mu_m, sigma_m) - nu = self.nu( - lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma - ) - # compute the difference psi_ll - psi_lu - psi_diff = psi_ll - psi_lu - - # this is batch_shape x num_cells x 2 x (m-1) - stacked_factors = torch.stack([psi_diff, nu], dim=-2) - - # Take the cross product of psi_diff and nu across all outcomes - # e.g. for m = 2 - # for each batch and cell, compute - # [psi_diff_0, psi_diff_1] - # [nu_0, psi_diff_1] - # [psi_diff_0, nu_1] - # [nu_0, nu_1] - # this tensor has shape: `batch_shape x num_cells x 2^m x m` - all_factors_up_to_last = stacked_factors.gather( - dim=-2, - index=self._cross_product_indices.expand( - stacked_factors.shape[:-2] + self._cross_product_indices.shape - ), - ) - # compute product for all 2^m terms, - # sum across all terms and hypercells - return all_factors_up_to_last.prod(dim=-1).sum(dim=-1).sum(dim=-1) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py deleted file mode 100644 index 7249c0fd..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py +++ /dev/null @@ -1,17 +0,0 @@ -from abc import ABC, abstractmethod - - -class BaseAcquisition(ABC): - def __init__(self): - self.surrogate_model = None - - @abstractmethod - def eval(self, x, asscalar: bool = False): - """Evaluate the acquisition function at point x2.""" - raise NotImplementedError - - def __call__(self, *args, **kwargs): - return self.eval(*args, **kwargs) - - def set_state(self, surrogate_model, **kwargs): - self.surrogate_model = surrogate_model diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py index a45cd051..4741705f 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py @@ -1,46 +1,53 @@ -from typing import Iterable, Union - -import numpy as np -import torch - -from .base_acquisition import BaseAcquisition -from .ei import ComprehensiveExpectedImprovement - - -class CostCooler(BaseAcquisition): - def __init__( - self, - base_acquisition: BaseAcquisition = ComprehensiveExpectedImprovement, - ): - self.base_acquisition = base_acquisition - self.cost_model = None - self.alpha = None - - def eval( - self, - x: Iterable, - **base_acquisition_kwargs, - ) -> Union[np.ndarray, torch.Tensor, float]: - base_acquisition_value = self.base_acquisition.eval( - x=x, **base_acquisition_kwargs - ) - costs, _ = 
self.cost_model.predict(x) - # if costs < 0.001: - # costs = 1 - if torch.is_tensor(costs): - cost_cooled = torch.zeros_like(costs) - index = 0 - for _, y in enumerate(costs.detach().numpy()): - if y < 0.0001: - cost_cooled[index] = base_acquisition_value[index] - else: - cost_cooled[index] = base_acquisition_value[index] / (y**self.alpha) - index += 1 - # return base_acquisition_value # / (costs**self.alpha).detach().numpy() - return cost_cooled - - def set_state(self, surrogate_model, alpha, cost_model, **kwargs): - super().set_state(surrogate_model=surrogate_model) - self.base_acquisition.set_state(surrogate_model=surrogate_model, **kwargs) - self.alpha = alpha - self.cost_model = cost_model +from __future__ import annotations + +from typing import TYPE_CHECKING + +from botorch.acquisition.logei import partial + +from neps.optimizers.bayesian_optimization.acquisition_functions.weighted_acquisition import ( + WeightedAcquisition, +) + +if TYPE_CHECKING: + import torch + from botorch.acquisition import AcquisitionFunction + from botorch.models.gp_regression import Likelihood + from botorch.models.model import Model + from torch import Tensor + + +def apply_cost_cooling( + acq_values: Tensor, + X: Tensor, + acq: AcquisitionFunction, + cost_model: Model, + likelihood: Likelihood, + alpha: float, +) -> Tensor: + posterior = likelihood(cost_model(X)) + cost = posterior.mean + + if acq._log: + # can derive from eq log(x) = log(acq / cost^alpha) + return acq_values - alpha * cost.log() + return acq_values / cost.pow(alpha) + + +def cost_cooled_acq( + acq_fn: AcquisitionFunction, + model: Model, + likelihood: Likelihood, + used_budget_percentage: float, + X_pending: torch.Tensor | None = None, +) -> WeightedAcquisition: + assert 0 <= used_budget_percentage <= 1 + return WeightedAcquisition( + acq=acq_fn, + apply_weight=partial( + apply_cost_cooling, + cost_model=model, + likelihood=likelihood, + alpha=1 - used_budget_percentage, + ), + X_pending=X_pending, + ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py deleted file mode 100644 index 1a4e24d0..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Sequence - -import torch -from torch.distributions import Normal - -from .base_acquisition import BaseAcquisition - -if TYPE_CHECKING: - import numpy as np - - from neps.search_spaces import SearchSpace - - -class ComprehensiveExpectedImprovement(BaseAcquisition): - def __init__( - self, - augmented_ei: bool = False, - xi: float = 0.0, - in_fill: str = "best", - log_ei: bool = False, - optimize_on_max_fidelity: bool = True, - ): - """This is the graph BO version of the expected improvement - key differences are: - - 1. The input x2 is a networkx graph instead of a vectorial input - - 2. The search space (a collection of x1_graphs) is discrete, so there is no - gradient-based optimisation. Instead, we compute the EI at all candidate points - and empirically select the best position during optimisation - - Args: - augmented_ei: Using the Augmented EI heuristic modification to the standard - expected improvement algorithm according to Huang (2006). - xi: manual exploration-exploitation trade-off parameter. 
- in_fill: the criterion to be used for in-fill for the determination of mu_star - 'best' means the empirical best observation so far (but could be - susceptible to noise), 'posterior' means the best *posterior GP mean* - encountered so far, and is recommended for optimization of more noisy - functions. Defaults to "best". - log_ei: log-EI if true otherwise usual EI. - """ - super().__init__() - - if in_fill not in ["best", "posterior"]: - raise ValueError(f"Invalid value for in_fill ({in_fill})") - self.augmented_ei = augmented_ei - self.xi = xi - self.in_fill = in_fill - self.log_ei = log_ei - self.incumbent = None - self.optimize_on_max_fidelity = optimize_on_max_fidelity - - def eval( - self, - x: Sequence[SearchSpace], - asscalar: bool = False, - ) -> np.ndarray | torch.Tensor | float: - """Return the negative expected improvement at the query point x2.""" - assert self.incumbent is not None, "EI function not fitted on model" - - if x[0].has_fidelity and self.optimize_on_max_fidelity: - _x = [e.clone() for e in x] - for e in _x: - e.set_to_max_fidelity() - else: - _x = x - - mu, cov = self.surrogate_model.predict(_x) - - std = torch.sqrt(torch.diag(cov)) - mu_star = self.incumbent - - gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device)) - # u = (mu - mu_star - self.xi) / std - # ei = std * updf + (mu - mu_star - self.xi) * ucdf - if self.log_ei: - # we expect that f_min is in log-space - f_min = mu_star - self.xi - v = (f_min - mu) / std - ei = torch.exp(f_min) * gauss.cdf(v) - torch.exp( - 0.5 * torch.diag(cov) + mu - ) * gauss.cdf(v - std) - else: - u = (mu_star - mu - self.xi) / std - try: - ucdf = gauss.cdf(u) - except ValueError as e: - print(f"u: {u}") # noqa: T201 - print(f"mu_star: {mu_star}") # noqa: T201 - print(f"mu: {mu}") # noqa: T201 - print(f"std: {std}") # noqa: T201 - print(f"diag: {cov.diag()}") # noqa: T201 - raise e - updf = torch.exp(gauss.log_prob(u)) - ei = std * updf + (mu_star - mu - self.xi) * ucdf - if self.augmented_ei: - sigma_n = self.surrogate_model.likelihood - ei *= 1.0 - torch.sqrt(torch.tensor(sigma_n, device=mu.device)) / torch.sqrt( - sigma_n + torch.diag(cov) - ) - if isinstance(_x, list) and asscalar: - return ei.detach().numpy() - if asscalar: - ei = ei.detach().numpy().item() - return ei - - def set_state(self, surrogate_model, **kwargs): - super().set_state(surrogate_model, **kwargs) - - # Compute incumbent - if self.in_fill == "best": - self.incumbent = torch.min(self.surrogate_model.y_) - else: - x = self.surrogate_model.x - mu_train, _ = self.surrogate_model.predict(x) - incumbent_idx = torch.argmin(mu_train) - self.incumbent = self.surrogate_model.y_[incumbent_idx] diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py index 3d19040d..c8502ca1 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py @@ -1,22 +1,28 @@ +# Left in as reference for now. 
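The quantity that `ComprehensiveExpectedImprovement` above evaluates (and that `MFEI` in this file still subclasses) is the standard closed-form EI for minimization, EI(x) = sigma(x) * pdf(u) + (mu* - mu(x) - xi) * cdf(u) with u = (mu* - mu(x) - xi) / sigma(x). A minimal standalone sketch with hypothetical posterior values, mirroring the removed implementation:

import torch
from torch.distributions import Normal

# Hypothetical posterior means/variances for three candidates and an incumbent mu_star.
mu = torch.tensor([0.40, 0.55, 0.30])
var = torch.tensor([0.04, 0.09, 0.01])
mu_star, xi = 0.35, 0.0

std = var.sqrt()
gauss = Normal(torch.zeros(1), torch.ones(1))
u = (mu_star - mu - xi) / std
# EI for minimization: std * pdf(u) + (mu_star - mu - xi) * cdf(u)
ei = std * gauss.log_prob(u).exp() + (mu_star - mu - xi) * gauss.cdf(u)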
# type: ignore -from typing import Any, Iterable, Tuple, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Iterable import numpy as np import pandas as pd import torch from torch.distributions import Normal -from ....optimizers.utils import map_real_hyperparameters_from_tabular_ids -from ....search_spaces.search_space import SearchSpace -from ...multi_fidelity.utils import MFObservedData +from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids + from .ei import ComprehensiveExpectedImprovement +if TYPE_CHECKING: + from neps.optimizers.multi_fidelity.utils import MFObservedData + from neps.search_spaces.search_space import SearchSpace + class MFEI(ComprehensiveExpectedImprovement): def __init__( self, pipeline_space: SearchSpace, - surrogate_model_name: str = None, + surrogate_model_name: str | None = None, augmented_ei: bool = False, xi: float = 0.0, in_fill: str = "best", @@ -32,7 +38,7 @@ def __init__( def get_budget_level(self, config) -> int: return int((config.fidelity.value - config.fidelity.lower) / self.b_step) - def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]: + def preprocess(self, x: pd.Series) -> tuple[Iterable, Iterable]: """Prepares the configurations for appropriate EI calculation. Takes a set of points and computes the budget and incumbent for each point, as @@ -65,7 +71,7 @@ def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]: budget_list.append(self.get_budget_level(config)) # Drop unused configs - x.drop(labels=indices_to_drop, inplace=True) + x = x.drop(labels=indices_to_drop) performances = self.observations.get_best_performance_for_each_budget() inc_list = [] @@ -78,11 +84,11 @@ def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]: return x, torch.Tensor(inc_list) - def preprocess_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: + def preprocess_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: x, inc_list = self.preprocess(x) return x.values.tolist(), inc_list - def preprocess_deep_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: + def preprocess_deep_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: x, inc_list = self.preprocess(x) x_lcs = [] for idx in x.index: @@ -97,7 +103,7 @@ def preprocess_deep_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: self.surrogate_model.set_prediction_learning_curves(x_lcs) return x.values.tolist(), inc_list - def preprocess_pfn(self, x: Iterable) -> Tuple[Iterable, Iterable, Iterable]: + def preprocess_pfn(self, x: Iterable) -> tuple[Iterable, Iterable, Iterable]: """Prepares the configurations for appropriate EI calculation. 
Takes a set of points and computes the budget and incumbent for each point, as @@ -114,7 +120,7 @@ def preprocess_pfn(self, x: Iterable) -> Tuple[Iterable, Iterable, Iterable]: ) / self.b_step return _x_tok, _x, inc_list - def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Series]: + def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Series]: # _x = x.copy() # preprocessing needs to change the reference x Series so we don't copy here if self.surrogate_model_name == "pfn": _x_tok, _x, inc_list = self.preprocess_pfn( @@ -143,7 +149,7 @@ def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Ser def eval_pfn_ei( self, x: Iterable, inc_list: Iterable - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: """PFN-EI modified to preprocess samples and accept list of incumbents.""" # x, inc_list = self.preprocess(x) # IMPORTANT change from vanilla-EI # _x = x.copy() @@ -154,7 +160,7 @@ def eval_pfn_ei( def eval_gp_ei( self, x: Iterable, inc_list: Iterable - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: """Vanilla-EI modified to preprocess samples and accept list of incumbents.""" # x, inc_list = self.preprocess(x) # IMPORTANT change from vanilla-EI _x = x.copy() @@ -194,7 +200,7 @@ def set_state( pipeline_space: SearchSpace, surrogate_model: Any, observations: MFObservedData, - b_step: Union[int, float], + b_step: int | float, **kwargs, ): # overload to select incumbent differently through observations @@ -202,4 +208,3 @@ def set_state( self.surrogate_model = surrogate_model self.observations = observations self.b_step = b_step - return diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py new file mode 100644 index 00000000..0f1668f1 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py @@ -0,0 +1,63 @@ +"""# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +Prior-Guided Acquisition Functions + +References: + +.. [Hvarfner2022] + C. Hvarfner, D. Stoll, A. Souza, M. Lindauer, F. Hutter, L. Nardi. PiBO: + Augmenting Acquisition Functions with User Beliefs for Bayesian Optimization. + ICLR 2022. 
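+
+The helpers below weight a base acquisition alpha(x) by pi(x)**gamma (or, for log
+acquisitions, add gamma * log pi(x)), where pi is the user prior and gamma is the
+prior exponent supplied by the optimizer.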
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from botorch.acquisition.logei import partial + +from neps.optimizers.bayesian_optimization.acquisition_functions.weighted_acquisition import ( + WeightedAcquisition, +) + +if TYPE_CHECKING: + from botorch.acquisition.acquisition import AcquisitionFunction + from torch import Tensor + + from neps.sampling.priors import Prior + from neps.search_spaces.domain import Domain + + +def apply_pibo_acquisition_weight( + acq_values: Tensor, + X: Tensor, + acq: AcquisitionFunction, + *, + prior: Prior, + x_domain: Domain | list[Domain], + prior_exponent: float, +): + if acq._log: + return acq_values + prior.log_prob(X, frm=x_domain) * prior_exponent + return acq_values * prior.prob(X, frm=x_domain).pow(prior_exponent) + + +def pibo_acquisition( + acq_fn: AcquisitionFunction, + prior: Prior, + prior_exponent: float, + x_domain: Domain | list[Domain], + X_pending: Tensor | None = None, +) -> WeightedAcquisition: + return WeightedAcquisition( + acq=acq_fn, + apply_weight=partial( + apply_pibo_acquisition_weight, + prior=prior, + x_domain=x_domain, + prior_exponent=prior_exponent, + ), + X_pending=X_pending, + ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py deleted file mode 100644 index 8a735d58..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Iterable -from typing_extensions import override - -import numpy as np -import torch -from botorch.acquisition import MCAcquisitionFunction - -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) - -if TYPE_CHECKING: - from neps.priors import Prior - - -class PiboAcquisition(MCAcquisitionFunction): - """Compute a prior weighted acquisition function according to PiBO. - - * https://arxiv.org/pdf/2204.11051 - """ - - def __init__( - self, - acq_fn: MCAcquisitionFunction, - prior: Prior, - beta: float, - n: float, - ): - """Initialize the acquisition function. - - Args: - acq_fn: The acquisition function to be weighted. - prior: The prior distribution to be used for weighting. - beta: The beta parameter for weighting. - n: The denominator for the beta parameter. - """ - self._log = self.acq_fn._log - self.acq_fn = acq_fn - - self.beta = beta - self.n = n - self.prior = prior - - @override - def forward(self, X: torch.Tensor) -> torch.Tensor: - weight = self.beta / self.n - acq = self.acq_fn(X) - - # The weight is shown as being applied to the pdf and not the log_pdf - values = acq * self.prior.prob(X) * weight - - # However, if the base acq function advertises as being log, - # i.e. 
self._log, then we should return the log of the values - return torch.log(values) if self._log else values - - -class DecayingPriorWeightedAcquisition(BaseAcquisition): - def __init__( - self, - base_acquisition, - pibo_beta=10, - log: bool = False, - ): - super().__init__() - self.pibo_beta = pibo_beta - self.base_acquisition = base_acquisition - self.log = log - self.decay_t = 0.0 - - def eval( - self, - x: Iterable, - **base_acquisition_kwargs, - ) -> np.ndarray | torch.Tensor | float: - acquisition = self.base_acquisition(x, **base_acquisition_kwargs) - - if self.log: - min_acq_val = abs(min(acquisition)) if min(acquisition) < 0 else 0 - - for i, candidate in enumerate(x): - prior_weight = candidate.compute_prior(log=self.log) - if prior_weight != 1.0: - if self.log: - # for log -> the smaller the prior_weight, - # the more unlikely it is from the prior - # also shift acquisition values to avoid negativ values - acquisition[i] = ( - np.log(acquisition[i] + min_acq_val + 1e-12) - + (self.pibo_beta / self.decay_t) * prior_weight - ) - else: - acquisition[i] *= np.power( - prior_weight + 1e-12, self.pibo_beta / self.decay_t - ) - return acquisition - - def set_state(self, surrogate_model, **kwargs): - if "decay_t" in kwargs: - decay_t = kwargs.pop("decay_t") - else: - train_x = surrogate_model.x - if train_x[0].has_fidelity: - decay_t = np.sum( - [float(_x.fidelity.value >= _x.fidelity.upper) for _x in train_x] - ) - else: - decay_t = len(train_x) - self.decay_t = decay_t - self.base_acquisition.set_state(surrogate_model, **kwargs) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py deleted file mode 100644 index adf57266..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py +++ /dev/null @@ -1,60 +0,0 @@ -from typing import Iterable, Union - -import numpy as np -import torch - -from .base_acquisition import BaseAcquisition - - -class UpperConfidenceBound(BaseAcquisition): - def __init__(self, beta: float=1.0, maximize: bool=False): - """Upper Confidence Bound (UCB) acquisition function. - - Args: - beta: Controls the balance between exploration and exploitation. - maximize: If True, maximize the given model, else minimize. - DEFAULT=False, assumes minimzation. 
- """ - super().__init__() - self.beta = beta # can be updated as part of the state for dynamism or a schedule - self.maximize = maximize - - # to be initialized as part of the state - self.surrogate_model = None - - def set_state(self, surrogate_model, **kwargs): - super().set_state(surrogate_model) - self.surrogate_model = surrogate_model - if "beta" in kwargs: - if not isinstance(kwargs["beta"], (list, np.array)): - self.beta = kwargs["beta"] - else: - self.logger.warning("Beta is a list, not updating beta value!") - - def eval( - self, x: Iterable, asscalar: bool = False - ) -> Union[np.ndarray, torch.Tensor, float]: - try: - mu, cov = self.surrogate_model.predict(x) - std = torch.sqrt(torch.diag(cov)) - except ValueError as e: - raise e - sign = 1 if self.maximize else -1 # LCB is performed if minimize=True - ucb_scores = mu + sign * np.sqrt(self.beta) * std - # if LCB, minimize acquisition, or maximize -acquisition - ucb_scores = ucb_scores.detach().numpy() * sign - - return ucb_scores - - -class MF_UCB(UpperConfidenceBound): - - def preprocess(self, x: Iterable) -> Iterable: - performances = self.observations.get_best_performance_for_each_budget() - pass - - def eval( - self, x: Iterable, asscalar: bool = False - ) -> Union[np.ndarray, torch.Tensor, float]: - x = self.preprocess(x) - return self.eval(x, asscalar=asscalar) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py new file mode 100644 index 00000000..488c57f4 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -0,0 +1,147 @@ +"""This module provides most of the functionality we require in NePS for now, +i.e., we need the ability to apply an arbitrary weight to an acquisition function. + +I spent some time understanding the meaning of the various dimensions of botorch/gpytorch. + +The two primary dimensions to consider are: + +* `d` - The dimensionality of the design space, i.e. how many hyperparameters. +* `batch` - The number of independent evaluations to make, i.e. how many times to + evaluate the acquisition function. + +There are two extra dimensions which are special cases and need to be accounted for. + +* `q` - Comes from the `qXXX` variants of acquisition, these will add an extra dimension + `q` to each `batch`, where instead of a `batch` representing a single config to get + the acquisition of, we might instead be getting the acquisition of 5 configs together, + representing the joint utility of evaluating these 5 configs, relative to other sets + of 5 configs. This dimension is _reduced_ away in the final step of the acquisition + when suggesting which set of group of 5 configs to suggest. + +* `mc_samples` - Comes from the `SampleReducdingXXX` variants of acquisition, will add an + extra dimension `mc_samples` which represent the amount of Monte Carlo samples used + to estimate the acquisition. These will eventually be _reduced_ away but are present + in the intermediate steps. These variants also seem to have `q` variants implicitly + and so you are likely to see the `q` dimension whever you see the `mc_samples` + dimension, even if it is just `q=1`. + +* `m` - The number of objectives in the multi-objective case. We will + specifically ignore this for now, however it exists as the last dimension (after `d`) + and is the first to be reduced away. They are also used in _constrainted_ settings + which we will also ignore for now. 
+ +The most expanded tensor shape is the following, with the usual order of reduction being +the following below. If you are not using a SamplingReducing variant, you will not see +`mc_samples` and if you are not using a `q` variant, you will not see `q`. The simplest +case then being `acq(tensor: batch x d)`. + +* `batch x q x d`. + reduce(..., d) = Config -> Single number (!!!Acq applies here!!!) +* `batch x q`. + expand(mc_samples , ...) = MC Sampling from posterior (I think) +* `mc_samples x batch x q`. + reduce(..., q) = Joint-Config-Group -> Single number. +* `mc_samples x batch` + reduce(mc_samples, ...) = MC-samples -> statistical estimate +* `batch` + +Finally we get out a batch of values we can argmax over, used to index into either a +single configuration or a single index into a joint-group of `q` configurations. + +!!! tip + + The `mc_samples` is not of concern to the `WeightedAcquisition` below, and + broadcasting can be used, as a result, the `apply_weight` function only needs + to be able to handle: + + * (X: batch x q x d, acq_values: batch x q, acq: A) -> batch x q + + If utilizing the configurations `X` for weighting, you effectively will want + to reduce the `d` dimension. + +As a result of this, acquisition functions need to be able to handle arbitrary dimensions +and act accordingly. + +This module mostly follows the structure of the +`PriorGuidedAcquisitionFunction` which weights the acquisition function by a prior. + +* https://botorch.org/api/_modules/botorch/acquisition/prior_guided.html#PriorGuidedAcquisitionFunction + +We use this to create a more generic `WeightedAcquisition` which follows the required +structure to make new weightings easier to implement, but also to serve as an educational +reference. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable, TypeVar + +from botorch.acquisition import SampleReducingMCAcquisitionFunction +from botorch.acquisition.analytic import AcquisitionFunction, t_batch_mode_transform +from botorch.acquisition.monte_carlo import concatenate_pending_points + +if TYPE_CHECKING: + from torch import Tensor + +A = TypeVar("A", bound=AcquisitionFunction) + + +class WeightedAcquisition(AcquisitionFunction): + """Class for weighting acquisition functions. + + Please see module docstring for more information. + """ + + def __init__( + self, + acq: A, + apply_weight: Callable[[Tensor, Tensor, A], Tensor], + X_pending: Tensor | None = None, + ) -> None: + """Initialize the weighted acquisition function. + + Args: + acq: The base acquisition function. + apply_weight: A function that takes the acquisition function values, the + design points and the acquisition function itself and returns the + weighted acquisition function values. + + Please see the module docstring for more information on the dimensions + and how to handle them. + X_pending: `n x d` Tensor with `n` `d`-dim design points that have + been submitted for evaluation but have not yet been evaluated. + """ + super().__init__(model=acq.model) + # NOTE: We remove the X_pending from the base acquisition function as we will get + # it in our own forward with `@concatenate_pending_points` and pass that forward. + # This avoids possible duplicates + self.acq.set_X_pending(None) + self.set_X_pending(X_pending) + self.apply_weight = apply_weight + self.acq = acq + + # Taken from PiBO implementation in botorch (PriorGuidedAcquisitionFunction). 
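As a usage sketch of `WeightedAcquisition` (hypothetical names such as `apply_locality_weight`, `base_acq` and `ref` are placeholders, not part of this patch): any callable matching the `(acq_values, X, acq)` contract described in the module docstring can be bound with `functools.partial` and passed as `apply_weight`, for example a weight that favours candidates near a reference point:

import torch
from functools import partial

def apply_locality_weight(acq_values, X, acq, *, ref: torch.Tensor, scale: float = 1.0):
    # X: `batch x q x d`; reducing `d` away gives a `batch x q` weight, which
    # broadcasts against acq_values (possibly `mc_samples x batch x q`).
    weight = torch.exp(-scale * (X - ref).norm(dim=-1))
    if acq._log:
        return acq_values + weight.log()
    return acq_values * weight

# weighted = WeightedAcquisition(
#     acq=base_acq,  # any botorch AcquisitionFunction; `base_acq` and `ref` are placeholders
#     apply_weight=partial(apply_locality_weight, ref=ref),
# )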
+ @concatenate_pending_points + @t_batch_mode_transform() # type: ignore + def forward(self, X: Tensor) -> Tensor: + """Evaluate a weighted acquisition function on the candidate set X. + + Args: + X: A tensor of size `batch_shape x q x d`-dim tensor of `q` `d`-dim + design points. + + Returns: + A tensor with the `d` dimension reduced away, representing the + weighted acquisition function values at the given design points `X`. + """ + if isinstance(self.acq, SampleReducingMCAcquisitionFunction): + # shape: mc_samples x batch x q-candidates + acq_values = self.acq._non_reduce_forward(X) + weighted_acq_values = self.apply_weight(acq_values, X, self.acq) + vals = self.acq._sample_reduction(self.acq._q_reduction(weighted_acq_values)) + return vals.squeeze(-1) + + # shape: batch x q-candidates + acq_values = self.acq(X).unsqueeze(-1) + weighted_acq_values = self.apply_weight(acq_values, X, self.acq) + return weighted_acq_values.squeeze(-1) diff --git a/neps/optimizers/bayesian_optimization/mf_tpe.py b/neps/optimizers/bayesian_optimization/mf_tpe.py deleted file mode 100644 index 45e4adc4..00000000 --- a/neps/optimizers/bayesian_optimization/mf_tpe.py +++ /dev/null @@ -1,719 +0,0 @@ -from __future__ import annotations - -import random -from copy import deepcopy -from typing import Any, Iterable - -import numpy as np -import torch -from scipy.stats import spearmanr -from typing_extensions import Literal, override - -from neps.state.optimizer import BudgetInfo, OptimizationState -from neps.utils.types import ConfigResult, RawConfig -from neps.utils.common import instance_from_map -from neps.search_spaces import ( - CategoricalParameter, - ConstantParameter, - FloatParameter, - IntegerParameter, - SearchSpace, -) -from neps.optimizers.base_optimizer import BaseOptimizer -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping - -CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) -CUSTOM_FLOAT_CONFIDENCE_SCORES.update({"ultra": 0.05}) - -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES = dict( - CategoricalParameter.DEFAULT_CONFIDENCE_SCORES -) -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES.update({"ultra": 8}) - - -class MultiFidelityPriorWeightedTreeParzenEstimator(BaseOptimizer): - def __init__( - self, - pipeline_space: SearchSpace, - use_priors: bool = True, - prior_num_evals: float = 2.5, - good_fraction: float = 0.3334, - random_interleave_prob: float = 0.0, - initial_design_size: int = 0, - prior_as_samples: bool = True, - pending_as_bad: bool = True, - fidelity_weighting: Literal["linear", "spearman"] = "spearman", - surrogate_model: str = "kde", - good_model_bw_factor: int = 1.5, - joint_kde_modelling: bool = False, - threshold_improvement: bool = True, - promote_from_acq: bool = True, - acquisition_sampler: str | AcquisitionSampler = "mutation", - prior_draws: int = 1000, - prior_confidence: Literal["low", "medium", "high"] = "medium", - surrogate_model_args: dict = None, - soft_promotion: bool = True, - patience: int = 50, - logger=None, - budget: None | int | float = None, - loss_value_on_error: None | float = None, - cost_value_on_error: None | float = None, - ): - """[summary] - - Args: - pipeline_space: Space in which to search - prior_num_evals (float, optional): [description]. Defaults to 2.5. 
- good_fraction (float, optional): [description]. Defaults to 0.333. - random_interleave_prob: Frequency at which random configurations are sampled - instead of configurations from the acquisition strategy. - initial_design_size: Number of 'x' samples that are to be evaluated before - selecting a sample using a strategy instead of randomly. If there is a - user prior, we can rely on the model from the very first iteration. - prior_as_samples: Whether to sample from the KDE and incorporate that way, or - just have the distribution be an linear combination of the KDE and the prior. - Should be True if the prior happens to be unnormalized. - pending_as_bad: Whether to treat pending observations as bad, assigning them to - the bad KDE to encourage diversity among samples queried in parallel - prior_draws: The number of samples drawn from the prior if there is one. This - # does not affect the strength of the prior, just how accurately it - # is reconstructed by the KDE. - patience: How many times we try something that fails before giving up. - budget: Maximum budget - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error during bayesian optimization and will use given loss - value instead. default: None - cost_value_on_error: Setting this and loss_value_on_error to any float will - supress any error during bayesian optimization and will use given cost - value instead. default: None - logger: logger object, or None to use the neps logger - """ - super().__init__( - pipeline_space=pipeline_space, - patience=patience, - logger=logger, - budget=budget, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ) - self.pipeline_space = pipeline_space - self.good_fraction = good_fraction - if self.pipeline_space.has_fidelity: - self.min_fidelity = pipeline_space.fidelity.lower - self.max_fidelity = pipeline_space.fidelity.upper - self.rung_map, self.inverse_rung_map = self._get_rung_maps() - self.min_rung = 0 - self.max_rung = len(self.rung_map) - 1 - - else: - self.min_rung = 0 - self.max_rung = 0 - self.min_fidelity = 1 - self.max_fidelity = 1 - self.rung_map, self.inverse_rung_map = self._get_rung_maps() - - if initial_design_size == 0: - self._initial_design_size = len(self.pipeline_space) * np.round( - 1 / self.good_fraction - ).astype(int) - else: - self._initial_design_size = initial_design_size - self.promote_from_acq = promote_from_acq - - self.num_rungs = len(self.rung_map) - self.use_priors = use_priors - self.prior_num_evals = prior_num_evals - self._random_interleave_prob = random_interleave_prob - self._pending_as_bad = pending_as_bad - self.prior_draws = prior_draws - self._has_promotable_configs = False - self.soft_promotion = soft_promotion - self.joint_kde_modelling = joint_kde_modelling - # if we use priors, we don't add conigurations as good until is is within the top fraction - # This heuristic has not been tried further, but makes sense in the context when we have priors - self.round_up = not use_priors - self.fidelity_weighting = fidelity_weighting - self.threshold_improvement = threshold_improvement - # TODO have this read in as part of load_results - it cannot be saved as an attribute when - # running parallel instances of the algorithm (since the old configs are shared, not instance-specific) - self.old_configs_per_fid = [[] for i in range(self.num_rungs)] - # We assume that the information conveyed per fidelity (and the cost) is linear in the - # fidelity levels if nothing else is specified - if 
surrogate_model != "kde": - raise NotImplementedError( - "Only supports KDEs for now. Could (maybe?) support binary classification in the future." - ) - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, - ) - self.prior_confidence = prior_confidence - self._enhance_priors() - surrogate_model_args = surrogate_model_args or {} - - param_types, num_options, logged_params, is_fidelity = self._get_types() - surrogate_model_args["param_types"] = param_types - surrogate_model_args["num_options"] = num_options - surrogate_model_args["is_fidelity"] = is_fidelity - surrogate_model_args["logged_params"] = logged_params - good_model_args = deepcopy(surrogate_model_args) - good_model_args["bandwidth_factor"] = good_model_bw_factor - if self.pipeline_space.has_prior and use_priors: - if prior_as_samples: - self.prior_samples = [ - self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - for idx in range(self.prior_draws) - ] - else: - pass - # TODO work out affine combination - else: - self.prior_samples = [] - - self.surrogate_models = { - "good": instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=good_model_args, - ), - "bad": instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ), - "all": instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ), - } - self.acquisition = self - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, - ) - - def _enhance_priors(self): - """Only applicable when priors are given along with a confidence.""" - if not self.use_priors and self.prior_confidence is None: - return - for k in self.pipeline_space.keys(): - if self.pipeline_space[k].is_fidelity: - continue - elif isinstance(self.pipeline_space[k], (FloatParameter, IntegerParameter)): - confidence = CUSTOM_FLOAT_CONFIDENCE_SCORES[self.prior_confidence] - self.pipeline_space[k].default_confidence_score = confidence - elif isinstance(self.pipeline_space[k], CategoricalParameter): - confidence = CUSTOM_CATEGORICAL_CONFIDENCE_SCORES[self.prior_confidence] - self.pipeline_space[k].default_confidence_score = confidence - - def _get_rung_maps(self, s: int = 0) -> dict: - """Maps rungs (0,1,...,k) to a fidelity value based on fidelity bounds, eta, s.""" - eta = round(1 / self.good_fraction) - new_min_budget = self.min_fidelity * (1 / eta**s) - nrungs = ( - np.floor(np.log(self.max_fidelity / new_min_budget) / np.log(eta)).astype(int) - + 1 - ) - _max_budget = self.max_fidelity - rung_map = dict() - inverse_rung_map = dict() - for i in reversed(range(nrungs)): - # TODO: add +s to keys and TEST - rung_value = ( - int(_max_budget) - if isinstance(self.pipeline_space.fidelity, IntegerParameter) - else _max_budget - ) - - rung_map[i + s] = rung_value - inverse_rung_map[rung_value] = i + s - _max_budget /= eta - return rung_map, inverse_rung_map - - def _get_types(self): - """extracts the needed types from the configspace for faster retrival later - - type = 0 - numerical (continuous or integer) parameter - type >=1 - categorical parameter - - TODO: figure out a way to properly handle 
ordinal parameters - - """ - types = [] - num_values = [] - logs = [] - is_fidelity = [] - for _, hp in self.pipeline_space.items(): - is_fidelity.append(hp.is_fidelity) - if isinstance(hp, CategoricalParameter): - # u as in unordered - used to play nice with the statsmodels KDE implementation - types.append("u") - logs.append(False) - num_values.append(len(hp.choices)) - elif isinstance(hp, IntegerParameter): - # o as in ordered - types.append("o") - logs.append(False) - num_values.append(hp.upper - hp.lower + 1) - elif isinstance(hp, FloatParameter): - # c as in continous - types.append("f") - logs.append(hp.log) - num_values.append(np.inf) - elif isinstance(hp, ConstantParameter): - # c as in continous - types.append("c") - logs.append(False) - num_values.append(1) - - else: - raise ValueError("Unsupported Parametertype %s" % type(hp)) - - return types, num_values, logs, is_fidelity - - def __call__( - self, - x: Iterable, - asscalar: bool = False, - only_lowest_fidelity=True, - only_good=False, - ) -> np.ndarray | torch.Tensor | float: - """ - Return the negative expected improvement at the query point - """ - # this is to only make the lowest fidelity viable - # TODO have this as a setting in the acq_sampler instead - if only_lowest_fidelity: - is_lowest_fidelity = ( - np.array([x_.fidelity.value for x_ in x]) == self.rung_map[self.min_rung] - ) - return np.log(self.surrogate_models["good"].pdf(x)) - np.log( - self.surrogate_models["bad"].pdf(x) - ) - else: - return np.log(self.surrogate_models["good"].pdf(x)) - np.log( - self.surrogate_models["bad"].pdf(x) - ) - - def _split_by_fidelity(self, configs, losses): - if self.pipeline_space.has_fidelity: - configs_per_fidelity = [[] for i in range(self.num_rungs)] - losses_per_fidelity = [[] for i in range(self.num_rungs)] - # per fidelity, add a list to make it a nested list of lists - # [[config_A at fid1, config_B at fid1], [config_C at fid2], ...] - for config, loss in zip(configs, losses): - rung = self.inverse_rung_map[int(config.fidelity.value)] - configs_per_fidelity[rung].append(config) - losses_per_fidelity[rung].append(loss) - return configs_per_fidelity, losses_per_fidelity - else: - return [configs], [losses] - - def _split_configs( - self, configs_per_fid, losses_per_fid, weight_per_fidelity, good_fraction=None - ): - """Splits configs into good and bad for the KDEs. - - Args: - configs ([type]): [description] - losses ([type]): [description] - round_up (bool, optional): [description]. Defaults to True. 
- - Returns: - [type]: [description] - """ - if good_fraction is None: - good_fraction = self.good_fraction - - good_configs, bad_configs = [], [] - good_configs_weights, bad_configs_weights = [], [] - - for fid, (configs_fid, losses_fid) in enumerate( - zip(configs_per_fid, losses_per_fid) - ): - if self.round_up: - num_good_configs = np.ceil(len(configs_fid) * good_fraction).astype(int) - else: - num_good_configs = np.floor(len(configs_fid) * good_fraction).astype(int) - - ordered_loss_indices = np.argsort(losses_fid) - good_indices = ordered_loss_indices[0:num_good_configs] - bad_indices = ordered_loss_indices[num_good_configs:] - good_configs_fid = [configs_fid[idx] for idx in good_indices] - bad_configs_fid = [configs_fid[idx] for idx in bad_indices] - good_configs.extend(good_configs_fid) - bad_configs.extend(bad_configs_fid) - - if self.threshold_improvement: - good_configs_weights.extend( - self._compute_improvement_weights( - losses_fid, num_good_configs, weight_per_fidelity[fid] - ) - ) - else: - good_configs_weights.extend( - [weight_per_fidelity[fid]] * len(good_configs_fid) - ) - bad_configs_weights.extend([weight_per_fidelity[fid]] * len(bad_configs_fid)) - return good_configs, bad_configs, good_configs_weights, bad_configs_weights - - def _compute_improvement_weights(self, losses, num_good_configs, max_weight): - if num_good_configs == 0: - return [] - - ordered_losses = np.sort(losses) - best_bad_loss = ordered_losses[num_good_configs] - good_losses = ordered_losses[0:num_good_configs] - relative_improvements = (best_bad_loss - good_losses) / ( - best_bad_loss - good_losses.min() - ) - improvement_weights = max_weight * relative_improvements - return improvement_weights - - def compute_fidelity_weights(self, configs_per_fid, losses_per_fid) -> list: - # TODO consider pending configurations - will default to a linear weighting - # which is not necessarily correct - if self.fidelity_weighting == "linear": - weight_per_fidelity = self._compute_linear_weights() - elif self.fidelity_weighting == "spearman": - weight_per_fidelity = self._compute_spearman_weights( - configs_per_fid, losses_per_fid - ) - else: - raise ValueError( - f"No weighting scheme {self.fidelity_weighting} is available." 
- ) - return weight_per_fidelity - - def _compute_linear_weights(self): - return (1 + np.arange(self.min_rung, self.max_rung + 1)) / self.num_rungs - - def _compute_spearman_weights(self, configs_per_fid, losses_per_fid) -> list: - min_number_samples = np.round(1 / self.good_fraction).astype(int) - samples_per_fid = np.array([len(cfgs_fid) for cfgs_fid in configs_per_fid]) - max_comparable_fid = ( - self.max_rung - np.argmax(np.flip(samples_per_fid) >= min_number_samples) - ).astype(int) - if max_comparable_fid == 0: - # if we cannot compare to any otḧer fidelity, return default - return self._compute_linear_weights() - else: - # compare the rankings of the existing configurations to the ranking - # of the same configurations at lower rungs - spearman = np.ones(self.num_rungs) - for fid_idx, (cfgs, losses) in enumerate( - zip(configs_per_fid, losses_per_fid) - ): - if fid_idx >= max_comparable_fid: - spearman[fid_idx] = 1 - - else: - comp_losses = losses_per_fid[fid_idx + 1] - comp_configs = configs_per_fid[fid_idx + 1] - - lower_fid_configs = [None] * len(comp_configs) - lower_fid_losses = [None] * len(comp_configs) - for cfg, loss in zip(cfgs, losses): - # check if the config at the lower fidelity level is in the comparison set - # TODO make this more efficient - probably embarrasingly slof for now - # with the triple-nested loop (although number of configs per level is pretty low) - is_equal_config = [ - cfg.is_equal_value(comp_cfg, include_fidelity=False) - for comp_cfg in comp_configs - ] - if any(is_equal_config): - equal_index = np.argmax(is_equal_config) - lower_fid_configs[equal_index] = cfg - lower_fid_losses[equal_index] = loss - - spearman[fid_idx] = spearmanr( - lower_fid_losses, comp_losses - ).correlation - - spearman = np.clip(spearman, a_min=0, a_max=1) - # The correlation with Z_max at fidelity Z-k cannot be larger than at Z-k+1 - spearman = np.flip(np.multiply.accumulate(np.flip(spearman))) - fidelity_weights = spearman * (max_comparable_fid + 1) / (self.max_rung + 1) - return fidelity_weights - - def is_init_phase(self) -> bool: - """Decides if optimization is still under the warmstart phase/model-based search.""" - if self._num_train_x >= self._initial_design_size: - return False - return True - - @override - def load_optimization_state( - self, - previous_results: dict[str, ConfigResult], - pending_evaluations: dict[str, SearchSpace], - budget_info: BudgetInfo | None, - optimizer_state: dict[str, Any], - ) -> None: - # TODO remove doubles from previous results - train_y = [self.get_loss(el.result) for el in previous_results.values()] - - train_x_configs = [el.config for el in previous_results.values()] - pending_configs = list(pending_evaluations.values()) - - filtered_configs, filtered_indices = self._filter_old_configs(train_x_configs) - filtered_y = np.array(train_y)[filtered_indices].tolist() - - self.train_x_configs = train_x_configs - self.train_y = train_y - - self._pending_evaluations = pending_evaluations - self._num_train_x = len(self.train_x_configs) - if not self.is_init_phase(): - # This is to extract the configurations as numpy arrays on the format num_data x num_dim - # TODO when a config is removed in the filtering process, that means that some other - # configuration at the lower fidelity will become good, that was previously bad. This - # may be good or bad, but I'm not sure. 
/ Carl - configs_per_fid, losses_per_fid = self._split_by_fidelity( - train_x_configs, train_y - ) - filtered_configs_per_fid, filtered_losses_per_fid = self._split_by_fidelity( - filtered_configs, filtered_y - ) - weight_per_fidelity = self.compute_fidelity_weights( - configs_per_fid, losses_per_fid - ) - - good_configs, bad_configs, good_weights, bad_weights = self._split_configs( - filtered_configs_per_fid, filtered_losses_per_fid, weight_per_fidelity - ) - if self.use_priors: - num_prior_configs = len(self.prior_samples) - good_configs.extend(self.prior_samples) - prior_sample_constant = self.prior_num_evals / num_prior_configs - good_weights.extend([prior_sample_constant] * num_prior_configs) - - fixed_bw = None - self.surrogate_models["all"].fit(filtered_configs) - if self.joint_kde_modelling: - fixed_bw = self.surrogate_models["all"].bw - - self.surrogate_models["good"].fit( - good_configs, fixed_bw=fixed_bw, config_weights=good_weights - ) - if self._pending_as_bad: - # This is only to compute the weights of the pending configs - _, pending_configs, _, pending_weights = self._split_configs( - pending_configs, - [np.inf] * len(pending_configs), - weight_per_fidelity, - good_fraction=0.0, - ) - bad_configs.extend(pending_configs) - bad_weights.extend(pending_weights) - - self.surrogate_models["bad"].fit( - bad_configs, fixed_bw=fixed_bw, config_weights=bad_weights - ) - # self.visualize_acq(previous_results, weight_per_fidelity) - - def _filter_old_configs(self, configs): - new_configs = [] - new_indices = [] - old_configs_flat = [] - for cfgs in self.old_configs_per_fid: - old_configs_flat.extend(cfgs) - - for idx, cfg in enumerate(configs): - if any([cfg.is_equal_value(old_cfg) for old_cfg in old_configs_flat]): - # If true, configs are equal and shouldn't be added - continue - else: - new_configs.append(cfg) - new_indices.append(idx) - return new_configs, new_indices - - def _get_promotable_configs(self, configs): - if self.soft_promotion: - configs_for_promotion = self._get_soft_promotable(configs) - else: - configs_for_promotion = self._get_hard_promotable(configs) - return configs_for_promotion - - def _get_hard_promotable(self, configs): - # count the number of configs that are at or above any given rung - configs_per_rung = np.zeros(self.num_rungs) - # check the number of configs per fidelity level - for config in configs: - rung = self.inverse_rung_map[int(config.fidelity.value)] - configs_per_rung[rung] += 1 - - cumulative_per_rung = np.flip(np.cumsum(np.flip(configs_per_rung))) - cumulative_above = np.append(np.flip(np.cumsum(np.flip(configs_per_rung[1:]))), 0) - # then check which one can make the most informed decision on promotions - rungs_to_promote = cumulative_per_rung * self.good_fraction - cumulative_above - - # this defaults to max_fidelity if there is no promotable config (cannot promote from) - # the top fidelity anyway - fid_to_promote = self.num_rungs - np.argmax(np.flip(rungs_to_promote) > 1) - - # TODO check if this returns empty when it needs to - if fid_to_promote == self.max_rung: - return [] - return [cfg for cfg in configs if cfg.fidelity.value == fid_to_promote] - - def _get_soft_promotable(self, configs): - # TODO implement - # count the number of configs that are at or above any given rung - new_configs, _ = self._filter_old_configs(configs) - configs_per_rung = np.zeros(self.num_rungs) - - # check the number of configs per fidelity level - for config in new_configs: - rung = self.inverse_rung_map[int(config.fidelity.value)] - configs_per_rung[rung] 
+= 1 - - # The square root means that we keep the approximate distribution between - # rungs as HyperBand - rungs_to_promote = configs_per_rung * np.power( - self.good_fraction, np.flip(np.sqrt(np.arange(self.num_rungs))) - ) - rungs_to_promote[-1] = 0 - next_rung_to_promote = np.arange(self.num_rungs)[rungs_to_promote > 1] - if len(next_rung_to_promote) == 0: - return [] - - next_fid_to_promote = self.rung_map[next_rung_to_promote[0]] - return [cfg for cfg in new_configs if cfg.fidelity.value == next_fid_to_promote] - - def _promote_existing(self, configs_for_promotion): - # TODO we still need to REMOVE the observation at the lower fidelity - # i.e. give it zero weight in the KDE, and ensure the count is correct - assert len(configs_for_promotion) > 0, "No promotable configurations" - if self.promote_from_acq: - acq_values = self.__call__(configs_for_promotion, only_lowest_fidelity=False) - else: - acq_values = self.__call__( - configs_for_promotion, only_lowest_fidelity=False, only_good=True - ) - - next_config = configs_for_promotion[np.argmax(acq_values)] - current_rung = self.inverse_rung_map[next_config.fidelity.value] - self.old_configs_per_fid[current_rung].append(next_config.copy()) - new_fidelity = self.rung_map[current_rung + 1] - next_config.fidelity.set_value(new_fidelity) - return next_config - - def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: - if self._num_train_x == 0 and self._initial_design_size >= 1: - # TODO only at lowest fidelity - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - config.fidelity.set_value(self.rung_map[self.min_rung]) - - elif self.is_init_phase(): - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=True - ) - config.fidelity.set_value(self.rung_map[self.min_rung]) - - elif random.random() < self._random_interleave_prob: - # TODO only at lowest fidelity - config = self.pipeline_space.sample( - patience=self.patience, ignore_fidelity=False, user_priors=False - ) - config.fidelity.set_vlaue(self.rung_map[self.min_rung]) - elif len(self._get_promotable_configs(self.train_x_configs)) > 0: - configs_for_promotion = self._get_promotable_configs(self.train_x_configs) - config = self._promote_existing(configs_for_promotion) - - else: - config = self.acquisition_sampler.sample(self.acquisition) - config.fidelity.set_value(self.rung_map[self.min_rung]) - - config_id = str(self._num_train_x + len(self._pending_evaluations) + 1) - return config.hp_values(), config_id, None - - def visualize_2d( - self, ax, previous_results, grid_points: int = 101, color: str = "k" - ): - X1 = np.linspace(0, 1, grid_points) - X2 = np.linspace(0, 1, grid_points) - X1, X2 = np.meshgrid(X1, X2) - X = np.append(X1.reshape(-1, 1), X2.reshape(-1, 1), axis=1) - Z = self.surrogate_models["good"]._pdf(X) / self.surrogate_models["bad"]._pdf(X) - Z_min, Z_max = -np.abs(Z).max(), np.abs(Z).max() - - Z = Z.reshape(grid_points, grid_points) - - c = ax.pcolormesh(X1, X2, Z, cmap=color, vmin=Z_min, vmax=Z_max) - ax.set_title("pcolormesh") - # set the limits of the plot to the limits of the data - ax.axis([0, 1, 0, 1]) - train_x_configs = [el.config for el in previous_results.values()] - np_X = self.surrogate_models["good"]._convert_configs_to_numpy(train_x_configs) - ax.scatter(np_X[:, 0], np_X[:, 1], s=100) - # ax.scatter(np_X[-1, 0], np_X[-1, 1], s=100, c='yellow') - - return ax - - def visualize_acq(self, previous_results, weights_per_fidelity): - import 
matplotlib.pyplot as plt - - train_x_configs = [el.config for el in previous_results.values()] - train_y = [self.get_loss(el.result) for el in previous_results.values()] - - filtered_configs, filtered_indices = self._filter_old_configs(train_x_configs) - configs_per_fid, losses_per_fid = self._split_by_fidelity( - train_x_configs, train_y - ) - filtered_y = np.array(train_y)[filtered_indices].tolist() - filtered_configs_per_fid, filtered_losses_per_fid = self._split_by_fidelity( - filtered_configs, filtered_y - ) - weight_per_fidelity = self.compute_fidelity_weights( - configs_per_fid, losses_per_fid - ) - good_configs, bad_configs, good_weights, bad_weights = self._split_configs( - filtered_configs_per_fid, filtered_losses_per_fid, weight_per_fidelity - ) - good_configs_np = self.surrogate_models["all"]._convert_configs_to_numpy( - good_configs - ) - bad_configs_np = self.surrogate_models["all"]._convert_configs_to_numpy( - bad_configs - ) - - fig, axes = plt.subplots(1, 3, figsize=(16, 9)) - axes[0] = self.surrogate_models["good"].visualize_2d(axes[0], color="RdBu") - axes[0].scatter( - good_configs_np[:, 0], - good_configs_np[:, 1], - c=good_weights, - cmap="spring", - s=50, - marker="x", - ) - axes[1] = self.surrogate_models["bad"].visualize_2d(axes[1], color="RdBu_r") - axes[1].scatter( - bad_configs_np[:, 0], - bad_configs_np[:, 1], - c=bad_weights, - s=50, - cmap="spring", - marker="x", - ) - axes[2] = self.visualize_2d(axes[2], previous_results, color="BrBG") - plt.show() diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 206078cb..307f806b 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -10,7 +10,7 @@ import torch from botorch.acquisition.analytic import SingleTaskGP from botorch.models import MixedSingleTaskGP -from botorch.models.gp_regression_mixed import CategoricalKernel +from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import MaternKernel, ScaleKernel @@ -149,7 +149,9 @@ def default_categorical_kernel( def default_single_obj_gp( x: TensorPack, y: torch.Tensor, -) -> SingleTaskGP: +) -> tuple[SingleTaskGP, Likelihood]: + if y.ndim == 1: + y = y.unsqueeze(-1) encoder = x.encoder numerics: list[int] = [] categoricals: list[int] = [] @@ -159,29 +161,33 @@ def default_single_obj_gp( else: numerics.append(encoder.index_of[hp_name]) + likelihood = default_likelihood_with_prior() + # Purely vectorial if len(categoricals) == 0: - return SingleTaskGP( + gp = SingleTaskGP( train_X=x.tensor, train_Y=y, mean_module=default_mean(), - likelihood=default_likelihood_with_prior(), + likelihood=likelihood, # Only matern kernel covar_module=default_matern_kernel(len(numerics)), outcome_transform=Standardize(m=1), ) + return gp, likelihood # Purely categorical if len(numerics) == 0: - return SingleTaskGP( + gp = SingleTaskGP( train_X=x.tensor, train_Y=y, mean_module=default_mean(), - likelihood=default_likelihood_with_prior(), + likelihood=likelihood, # Only categorical kernel covar_module=default_categorical_kernel(len(categoricals)), outcome_transform=Standardize(m=1), ) + return gp, likelihood # Mixed def cont_kernel_factory( @@ -203,14 +209,15 @@ def cont_kernel_factory( ), ) - return MixedSingleTaskGP( + gp = MixedSingleTaskGP( train_X=x.tensor, train_Y=y, cat_dims=categoricals, 
- likelihood=default_likelihood_with_prior(), + likelihood=likelihood, cont_kernel_factory=cont_kernel_factory, outcome_transform=Standardize(m=1), ) + return gp, likelihood def optimize_acq( diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index d9f322aa..2c4e5eeb 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -8,11 +8,15 @@ LinearMCObjective, qLogExpectedImprovement, ) +from botorch.fit import fit_gpytorch_mll from gpytorch import ExactMarginalLogLikelihood from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( - PiboAcquisition, +from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( + cost_cooled_acq, +) +from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( + pibo_acquisition, ) from neps.optimizers.bayesian_optimization.models.gp import ( default_single_obj_gp, @@ -23,6 +27,7 @@ from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: + from botorch.models.gp_regression_mixed import Likelihood from botorch.models.model import Model from neps.search_spaces import ( @@ -34,29 +39,88 @@ from neps.state import BudgetInfo, Trial -def _pibo_acq_beta_and_n( - n_sampled_already: int, - ndims: int, - budget_info: BudgetInfo, -) -> tuple[float, float]: +def _missing_fill_strategy( + y: torch.Tensor, + strategy: Literal["mean", "worst", "3std", "nan"], + *, + lower_is_better: bool, +) -> torch.Tensor: + # Assumes minimization + if y.ndim != 1: + raise ValueError("Only supports single objective optimization for now!") + + match strategy: + case "nan": + return y + case "mean": + return torch.nan_to_num(y, nan=y.mean().item()) + case "worst": + worst = y.min() if lower_is_better else y.max() + return torch.nan_to_num(y, nan=worst.item()) + case "3std": + sign = 1 if lower_is_better else -1 + std = y.std() + return torch.nan_to_num(y, nan=y.mean().item() + sign * 3 * std.item()) + case _: + raise ValueError(f"Unknown strategy: {strategy}") + + +def _missing_y_strategy(y: torch.Tensor) -> torch.Tensor: + # TODO: Figure out what to do if there's no reported loss value. + # Some strategies: + # 1. Replace with NaN, in which case GPYtorch ignores it + # * Good if crash is random crash, in which case we do not wish to model + # a performance because of it. + # 2. Replace with worst value seen so far + # * Good if crash is systematic, in which case we wish to model it as + # basically, "don't go here" while remaining in the range of possible + # values for the GP. + # 3. Replace with mean + # * Same as above but keeps the optimization of the GP landscape + # smoother. Good if we have a mix of non-systematic and systematic + # crashed. Likely the safest option as GP will likely be unconfident in + # unsystematic crash cases, especially if it seems like a rare-event. + # Will also unlikely be a candidate region if systematic and we observe + # a few crashes there. However would take longer to learn of systematic + # crash regions. 
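+    # The default below corresponds to option 3 above: fill missing losses with the mean.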
+ return _missing_fill_strategy(y, strategy="mean", lower_is_better=True) + + +def _missing_cost_strategy(cost: torch.Tensor) -> torch.Tensor: + # TODO: Figure out what to do if there's no reported cost value + # Likely best to just fill in worst cost seen so far as this crash + # cost us a lot of time and we do not want to waste time on this + # region again. However if the crash was random, we might enter some + # issues. + return _missing_fill_strategy(cost, strategy="3std", lower_is_better=True) + + +def _pibo_exp_term(n_sampled_already: int, ndims: int, budget_info: BudgetInfo) -> float: if budget_info.max_evaluations is not None: # From the PIBO paper (Section 4.1) # https://arxiv.org/pdf/2204.11051 + n = n_sampled_already beta = budget_info.max_evaluations / 10 - return n_sampled_already, beta - - if budget_info.max_cost_budget is not None: + elif budget_info.max_cost_budget is not None: # This might not work well if cost number is high # early on, but it will start to normalize. n = budget_info.used_cost_budget beta = budget_info.max_cost_budget / 10 - return n, beta + else: + # Otherwise, just some random heuristic based on the number + # of trials and dimensionality of the search space + # TODO: Think about and evaluate this more. + n = n_sampled_already + beta = ndims**2 / 10 - # Otherwise, just some random heuristic based on the number - # of trials and dimensionality of the search space - # TODO: Think about and evaluate this more. - beta = ndims**2 / 10 - return n_sampled_already, beta + return beta / n + + +def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float: + if budget_info.max_cost_budget is not None: + return budget_info.used_cost_budget / budget_info.max_cost_budget + + raise ValueError("No cost budget provided!") # TODO: This needs to be moved to the search space class, however @@ -105,9 +169,10 @@ def __init__( # noqa: D417 *, initial_design_size: int | None = None, surrogate_model: ( - Literal["gp"] | Callable[[TensorPack, torch.Tensor], Model] + Literal["gp"] | Callable[[TensorPack, torch.Tensor], tuple[Model, Likelihood]] ) = "gp", use_priors: bool = False, + use_cost: bool = False, sample_default_first: bool = False, device: torch.device | None = None, encoder: TensorEncoder | None = None, @@ -124,6 +189,15 @@ def __init__( # noqa: D417 surrogate_model: Surrogate model, either a known model str or a callable that takes in the training data and returns a model fitted to (X, y). use_priors: Whether to use priors set on the hyperparameters during search. + use_cost: Whether to consider reported "cost" from configurations in decision + making. If True, the optimizer will weigh potential candidates by how much + they cost, incentivising the optimizer to explore cheap, good performing + configurations. This amount is modified over time + + !!! warning + + If using `cost`, cost must be provided in the reports of the trials. + sample_default_first: Whether to sample the default configuration first. device: Device to use for the optimization. encoder: Encoder to use for encoding the configurations. 
If None, it will @@ -154,6 +228,7 @@ def __init__( # noqa: D417 params.update(pipeline_space.fidelities) self.encoder = TensorEncoder.default(params) if encoder is None else encoder + self.use_cost = use_cost self.prior = _make_prior(params) if use_priors is True else None self.device = device self.sample_default_first = sample_default_first @@ -176,8 +251,9 @@ def ask( "Seed is not yet implemented for BayesianOptimization" ) + n_trials = len(trials) space = self.pipeline_space - config_id = str(len(trials) + 1) + config_id = str(n_trials + 1) # Fill intitial design data if we don't have any... if self.initial_design_ is None: @@ -203,8 +279,8 @@ def ask( self.initial_design_.extend(configs) # If we havn't passed the intial design phase - if len(trials) <= len(self.initial_design_): - config = self.initial_design_[len(trials) - 1] + if n_trials <= len(self.initial_design_): + config = self.initial_design_[n_trials - 1] sample = SampledConfig(id=config_id, config=config, previous_config_id=None) return sample, optimizer_state @@ -212,45 +288,80 @@ def ask( # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... x_configs: list[dict[str, Any]] = [] ys: list[float] = [] + costs: list[float] = [] pending: list[dict[str, Any]] = [] for trial in trials.values(): if trial.state.pending(): pending.append(trial.config) else: assert trial.report is not None - # TODO: Figure out what to do if there's no reported loss value. - assert trial.report.loss is not None x_configs.append(trial.config) - ys.append(trial.report.loss) + ys.append( + trial.report.loss if trial.report.loss is not None else torch.nan + ) + if self.use_cost: + cost = trial.report.cost + costs.append(cost if cost is not None else torch.nan) x = self.encoder.pack(x_configs, device=self.device) - x_pending = ( - None if len(pending) == 0 else self.encoder.pack(pending, device=self.device) - ) - y = torch.tensor(ys, dtype=torch.float64, device=self.device) - if y.ndim == 1: - y = y.unsqueeze(1) - - model = self._get_model(x, y) + maybe_x_pending_tensor = None + if len(pending) > 0: + x_pending = self.encoder.pack(pending, device=self.device) + maybe_x_pending_tensor = x_pending.tensor - from botorch.fit import fit_gpytorch_mll + y = torch.tensor(ys, dtype=torch.float64, device=self.device) + y = _missing_y_strategy(y) - mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) - _fit_mll = fit_gpytorch_mll(mll) + # Now fit our model + y_model, y_likelihood = self._get_model(x, y) + fit_gpytorch_mll( + ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) + ) acq = qLogExpectedImprovement( - model, + y_model, best_f=y.min(), - X_pending=None if x_pending is None else x_pending.tensor, + X_pending=maybe_x_pending_tensor, # Unfortunatly, there's no option to indicate that we minimize # the AcqFunction so we need to do some kind of transformation. # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 objective=LinearMCObjective(weights=torch.tensor([-1.0])), ) + + # If we should use the prior, weight the acquisition function by + # the probability of it being sampled from the prior. 
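The hunk above follows the usual BoTorch recipe: fit the surrogate by maximizing an ExactMarginalLogLikelihood with fit_gpytorch_mll, then hand the fitted model to qLogExpectedImprovement together with a negated LinearMCObjective so that a maximizing acquisition function effectively minimizes the loss. A minimal, self-contained sketch of that recipe, with SingleTaskGP and toy data standing in for the package's own surrogate factory:

import torch
from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_x = torch.rand(20, 3, dtype=torch.float64)            # encoded configs in [0, 1]^d
train_y = (train_x - 0.5).pow(2).sum(dim=-1, keepdim=True)  # observed losses, shape (n, 1)

gp = SingleTaskGP(train_x, train_y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(gp.likelihood, gp))

acq = qLogExpectedImprovement(
    model=gp,
    # In this sketch the incumbent is expressed in the negated objective's space.
    best_f=(-train_y).max(),
    # BoTorch acquisition functions maximize, so minimization is encoded by negation.
    objective=LinearMCObjective(weights=torch.tensor([-1.0], dtype=torch.float64)),
)

candidates = torch.rand(128, 1, 3, dtype=torch.float64)     # shape (batch, q=1, d)
print(acq(candidates).topk(3).indices)                      # most promising candidate indices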
if self.prior: - n, beta = _pibo_acq_beta_and_n(len(trials), self.encoder.ncols, budget_info) - acq = PiboAcquisition(acq, prior=self.prior, n=n, beta=beta) + acq = pibo_acquisition( + acq, + prior=self.prior, + prior_exponent=_pibo_exp_term(n_trials, self.encoder.ncols, budget_info), + x_domain=self.encoder.domains, + X_pending=maybe_x_pending_tensor, + ) + + # If we should use cost, weight the acquisition function by the cost + # of the configurations. + if self.use_cost: + cost = torch.tensor(costs, dtype=torch.float64, device=self.device) + cost = _missing_cost_strategy(cost) + + # TODO: We might want a different model for cost estimation... one reason + # is that cost estimates are likely to be a lot noisier than the likelihood + # we have by default. + cost_model, cost_likelihood = self._get_model(x, cost) + + # Optimize the cost model + fit_gpytorch_mll( + ExactMarginalLogLikelihood(likelihood=cost_likelihood, model=cost_model) + ) + acq = cost_cooled_acq( + acq_fn=acq, + model=cost_model, + likelihood=cost_likelihood, + used_budget_percentage=_cost_used_budget_percentage(budget_info), + ) + # Finally, optimize the acquisition function to get a configuration candidates, _eis = optimize_acq(acq_fn=acq, encoder=self.encoder, acq_options={}) assert len(candidates) == 1, "Expected only one candidate!" diff --git a/neps/optimizers/bayesian_optimization/sobol.py b/neps/optimizers/bayesian_optimization/sobol.py deleted file mode 100644 index e69de29b..00000000 diff --git a/neps/optimizers/initial_design.py b/neps/optimizers/initial_design.py deleted file mode 100644 index c8039c6a..00000000 --- a/neps/optimizers/initial_design.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Initial design of points for optimization.""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING -from typing_extensions import override - -if TYPE_CHECKING: - import torch - - from neps.priors import Prior - from neps.search_spaces.domain import Domain - - -@dataclass -class PriorInitialDesign(InitialDesign): - """Sample from a prior distribution.""" - - prior: Prior - """The prior to sample from.""" - - # TODO: Right now we don't have a way to set the seed temporarily - seed: int | None = None - """The seed for sampling.""" - - @override - def sample(self, n: int) -> torch.Tensor: - return self.prior.sample(n) - - @property - @override - def sample_domain(self) -> list[Domain]: - return self.prior.domains diff --git a/neps/distributions.py b/neps/sampling/distributions.py similarity index 99% rename from neps/distributions.py rename to neps/sampling/distributions.py index 2361e191..fb552949 100644 --- a/neps/distributions.py +++ b/neps/sampling/distributions.py @@ -225,6 +225,6 @@ def log_prob(self, value): @dataclass -class DistributionOverDomain: +class TorchDistributionWithDomain: distribution: Distribution domain: Domain diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 2deda010..03b64122 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -15,7 +15,7 @@ import torch -from neps.distributions import DistributionOverDomain, TruncatedNormal +from neps.sampling.distributions import TorchDistributionWithDomain, TruncatedNormal from neps.sampling.samplers import Sampler, WeightedSampler from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain @@ -167,11 +167,11 @@ def make_centered( f"Please provide a center for all domains." 
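The cost_cooled_acq helper used above is defined elsewhere in this series; as a rough sketch of the idea only (not of that helper's implementation), cost cooling divides the raw acquisition value by the predicted cost raised to an exponent that shrinks as the budget is consumed, so cheap configurations are favoured early on and the penalty fades towards plain acquisition later. The toy function below assumes plain (non-log) acquisition values and strictly positive cost predictions.

import torch

def cost_cooled(acq_values: torch.Tensor, predicted_cost: torch.Tensor,
                used_budget_fraction: float) -> torch.Tensor:
    """Toy cost cooling: acq / cost**alpha, with alpha decaying from 1 to 0."""
    alpha = 1.0 - used_budget_fraction        # early: strong cost penalty, late: almost none
    cost = predicted_cost.clamp_min(1e-6)     # guard against zero or negative predictions
    return acq_values / cost.pow(alpha)

acq = torch.tensor([0.9, 0.8, 0.5])
cost = torch.tensor([10.0, 1.0, 0.1])
print(cost_cooled(acq, cost, used_budget_fraction=0.1))   # cheap configurations dominate early
print(cost_cooled(acq, cost, used_budget_fraction=0.9))   # cost matters far less late in the run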
) - distributions: list[DistributionOverDomain] = [] + distributions: list[TorchDistributionWithDomain] = [] for name, domain in domains.items(): center_confidence = centers.get(name) if center_confidence is None: - dist = DistributionOverDomain( + dist = TorchDistributionWithDomain( distribution=torch.distributions.Uniform(0.0, 1.0), domain=UNIT_FLOAT_DOMAIN, ) @@ -202,7 +202,7 @@ def make_centered( weights[center] = confidence - dist = DistributionOverDomain( + dist = TorchDistributionWithDomain( distribution=torch.distributions.Categorical(probs=weights), domain=domain, ) @@ -213,7 +213,7 @@ def make_centered( unit_center = domain.to_unit( torch.tensor(center, device=device, dtype=torch.float64) ) - dist = DistributionOverDomain( + dist = TorchDistributionWithDomain( distribution=TruncatedNormal( loc=unit_center, scale=(1 - confidence), @@ -254,7 +254,7 @@ class CenteredPrior(Prior): [`Prior.make_centered()`][neps.priors.Prior.make_centered]. """ - distributions: list[DistributionOverDomain] + distributions: list[TorchDistributionWithDomain] """Distributions along with the corresponding domains they sample from.""" _distribution_domains: list[Domain] = field(init=False, repr=False) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index 6802f6d7..c7456155 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -40,11 +40,9 @@ def sample( will be added with [`.ncols`][neps.samplers.Sampler.ncols]. For example, if `n = 5`, the output will be `(5, ncols)`. If `n = (5, 3)`, the output will be `(5, 3, ncols)`. - to: The domain or list of domains to cast the points to. - If a single domain, all points are cast to that domain, otherwise - each column `ndim_i` in (n, ndim) is cast to the corresponding domain - in `to`. As a result, the length of `to` must match the number of columns - from [`.ncols`][neps.samplers.Sampler.ncols]. + to: If a single domain, `.ncols` columns will be produced form that one + domain. If a list of domains, then it must have the same length as the + number of columns, with each column being in the corresponding domain. seed: The seed for the random number generator. device: The device to cast the samples to. @@ -58,7 +56,7 @@ def sobol(cls, ndim: int, *, scramble: bool = True, seed: int | None = None) -> """Create a Sobol sampler. Args: - ndim: The number of dimensions to sample for. + ndim: The number of columns to sample. scramble: Whether to scramble the Sobol sequence. seed: The seed for the Sobol sequence. @@ -82,6 +80,13 @@ class Sobol(Sampler): scramble: bool = True """Whether to scramble the Sobol sequence.""" + def __post_init__(self): + if self.ndim < 1: + raise ValueError( + "The number of dimensions must be at least 1." + f" Got {self.ndim} dimensions." 
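A rough standalone illustration of the centered construction above: a numerical default becomes a truncated normal on the unit interval whose spread is 1 - confidence, while a categorical default becomes a weight vector that places `confidence` mass on the favoured index. scipy.stats.truncnorm and the equal split of the remaining categorical mass are assumptions of this sketch, not the package's API.

import torch
from scipy.stats import truncnorm

confidence = 0.75

# Numerical parameter: default mapped to the unit interval, spread = 1 - confidence.
unit_center, scale = 0.6, 1.0 - confidence
numeric_prior = truncnorm(a=(0.0 - unit_center) / scale, b=(1.0 - unit_center) / scale,
                          loc=unit_center, scale=scale)
print(numeric_prior.rvs(size=5))   # samples concentrated around 0.6, truncated to [0, 1]

# Categorical parameter with four choices, favouring index 2.
n_choices, favoured = 4, 2
weights = torch.full((n_choices,), (1.0 - confidence) / (n_choices - 1))
weights[favoured] = confidence
print(torch.distributions.Categorical(probs=weights).sample((5,)))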
+ ) + @property @override def ncols(self) -> int: @@ -180,13 +185,13 @@ def sample( total_samples = reduce(lambda x, y: x * y, n) output_shape = (*n, self.ncols) - # Randomly select which prior to sample from for each of the total_samples - chosen_priors = torch.empty((total_samples,), device=device, dtype=torch.int64) - chosen_priors = torch.multinomial( + # Randomly select which sampler to sample from for each of the total_samples + chosen_samplers = torch.empty((total_samples,), device=device, dtype=torch.int64) + chosen_samplers = torch.multinomial( self.probabilities, total_samples, replacement=True, - out=chosen_priors, + out=chosen_samplers, ) # Create an empty tensor to hold all samples @@ -194,16 +199,16 @@ def sample( (total_samples, self.ncols), device=device, dtype=torch.float64 ) - # Loop through each prior and its associated indices - for i, prior in enumerate(self.samplers): - # Find indices where the chosen prior is i + # Loop through each sampler and its associated indices + for i, sampler in enumerate(self.samplers): + # Find indices where the chosen sampler is i _i = torch.tensor(i, dtype=torch.int64, device=device) - indices = torch.where(chosen_priors == _i)[0] + indices = torch.where(chosen_samplers == _i)[0] if len(indices) > 0: - # Sample from the prior for the required number of indices - samples_from_prior = prior.sample(len(indices), to=to, device=device) - output_samples[indices] = samples_from_prior + # Sample from the sampler for the required number of indices + samples_from_sampler = sampler.sample(len(indices), to=to, device=device) + output_samples[indices] = samples_from_sampler # Reshape to the output shape including ncols dimension output_samples = output_samples.view(output_shape) diff --git a/neps/search_spaces/distributions/__init__.py b/neps/search_spaces/distributions/__init__.py deleted file mode 100644 index 65151e66..00000000 --- a/neps/search_spaces/distributions/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.distributions.truncnorm import TruncNormDistribution -from neps.search_spaces.distributions.uniform_float import UniformFloatDistribution -from neps.search_spaces.distributions.uniform_int import UniformIntDistribution -from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution - -UNIT_UNIFORM = UniformFloatDistribution.new(0.0, 1.0) - -__all__ = [ - "Distribution", - "TruncNormDistribution", - "UniformFloatDistribution", - "UniformIntDistribution", - "UNIT_UNIFORM", - "WeightedIntsDistribution", -] diff --git a/neps/search_spaces/distributions/distribution.py b/neps/search_spaces/distributions/distribution.py deleted file mode 100644 index 7ab4dd6f..00000000 --- a/neps/search_spaces/distributions/distribution.py +++ /dev/null @@ -1,21 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, TypeVar -from typing_extensions import Protocol - -V = TypeVar("V", int, float) - - -if TYPE_CHECKING: - from torch import Generator, Tensor - - from neps.search_spaces.domain import Domain - - -class Distribution(Protocol[V]): - @property - def domain(self) -> Domain[V]: ... - - def sample(self, n: int, to: Domain, *, seed: Generator) -> Tensor: ... - - def likelihood(self, value: Tensor) -> Tensor: ... 
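The WeightedSampler.sample hunk above draws, for each requested point, which member sampler to use via torch.multinomial and then fills the output tensor row by row. A compact standalone sketch of that pattern; the two lambda samplers are purely illustrative:

import torch

def sample_mixture(samplers, probabilities: torch.Tensor, n: int, ncols: int) -> torch.Tensor:
    """Draw n points, each from one member sampler chosen with the given probabilities."""
    chosen = torch.multinomial(probabilities, n, replacement=True)  # (n,) sampler indices
    out = torch.empty((n, ncols), dtype=torch.float64)
    for i, sampler in enumerate(samplers):
        idx = torch.where(chosen == i)[0]
        if len(idx) > 0:
            out[idx] = sampler(len(idx))                            # fill only those rows
    return out

samplers = [
    lambda k: torch.rand(k, 2, dtype=torch.float64),                              # "uniform" member
    lambda k: (0.5 + 0.05 * torch.randn(k, 2, dtype=torch.float64)).clamp(0, 1),  # "prior-like" member
]
points = sample_mixture(samplers, torch.tensor([0.3, 0.7]), n=8, ncols=2)
print(points.shape)                                                 # torch.Size([8, 2])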
diff --git a/neps/search_spaces/distributions/truncnorm.py b/neps/search_spaces/distributions/truncnorm.py deleted file mode 100644 index 3938cf1c..00000000 --- a/neps/search_spaces/distributions/truncnorm.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from functools import lru_cache -from typing import TYPE_CHECKING, Any -from typing_extensions import override - -import torch -from torch import Tensor - -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.domain import Domain - -if TYPE_CHECKING: - from neps.utils.types import Number - -INT_HIGH = 1_000_000 - - -@lru_cache -def _truncnorm(a: float, b: float, loc: float, scale: float) -> Any: - from scipy.stats import truncnorm - - return truncnorm(a=a, b=b, loc=loc, scale=scale) - - -@dataclass(frozen=True) -class TruncNormDistribution(Distribution[float]): - domain: Domain[float] - center: float - std: float - truncnorm: Any - - @override - def sample(self, n: int, seed: torch.Generator) -> Tensor: - random_state = torch.randint(INT_HIGH, size=(1,), generator=seed) - rv = self.truncnorm.rvs(size=n, random_state=random_state.item()) - return torch.tensor(rv, dtype=self.domain.dtype) - - @override - def likelihood(self, value: Tensor) -> Tensor: - return self.truncnorm.pdf(value.numpy()) - - def normalize(self) -> TruncNormDistribution: - # Send to unit domain - center = float(self.domain.from_unit(torch.tensor(self.center)).item()) - std = self.std / self.domain.length - - return TruncNormDistribution( - domain=Domain.unit_float(), - center=center, - std=std, - truncnorm=_truncnorm( - a=(0 - center) / std, - b=(1 - center) / std, - loc=center, - scale=std, - ), - ) - - def with_center_and_confidence( - self, - center: Number, - confidence: float, - ) -> TruncNormDistribution: - assert 0 <= confidence <= 1 - assert self.domain.lower <= center <= self.domain.upper - std = 1 - confidence - center = float(center) - return TruncNormDistribution( - domain=self.domain, - center=center, - std=std, - truncnorm=_truncnorm( - a=(self.domain.lower - center) / std, - b=(self.domain.upper - center) / std, - loc=center, - scale=std, - ), - ) - - @classmethod - def new( - cls, - lower: Number, - center: Number, - upper: Number, - *, - std: Number, - std_is_normalized: bool, - ) -> TruncNormDistribution: - assert lower <= center <= upper, f"{lower} <= {center} <= {upper}" - center = float(center) - - if std_is_normalized: - assert 0 <= std <= 1 - std = float((upper - lower) * std) - else: - assert std > 0 - std = float(std) - - return cls( - domain=Domain.float(float(lower), float(upper)), - center=center, - std=std, - truncnorm=_truncnorm( - a=(lower - center) / std, - b=(upper - center) / std, - loc=center, - scale=std, - ), - ) diff --git a/neps/search_spaces/distributions/uniform_float.py b/neps/search_spaces/distributions/uniform_float.py deleted file mode 100644 index bdb43ee8..00000000 --- a/neps/search_spaces/distributions/uniform_float.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing_extensions import override - -import torch -from torch import Tensor - -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain - -INT_HIGH = 1_000_000 - - -@dataclass(frozen=True) -class UniformFloatDistribution(Distribution[float]): - domain: Domain[float] - _pdf: float = field(repr=False) - - @override - 
def sample(self, n: int, to: Domain, seed: torch.Generator) -> Tensor: - # This creates samples in a unit float domain, rather than - # the `.domain` attribute of this distribution. Rather than scale - # up twice, we just scale directly form the UNIT_FLOAT_DOMAIN - # We still however need the `.domain` attribute for `likelihood` - unit_samples = torch.rand(n, generator=seed) - return to.cast(unit_samples, UNIT_FLOAT_DOMAIN) - - @override - def likelihood(self, value: Tensor) -> Tensor: - return torch.where( - (value >= self.domain.lower) & (value <= self.domain.upper), - self._pdf, - 0.0, - ) - - @classmethod - def new(cls, lower: int | float, upper: int | float) -> UniformFloatDistribution: - _pdf = 1.0 / (upper - lower) - return cls(Domain.float(lower, upper), _pdf=_pdf) - - @classmethod - def unit_distribution(cls) -> UniformFloatDistribution: - return UNIT_UNIFORM_FLOAT - - -UNIT_UNIFORM_FLOAT = UniformFloatDistribution.new(0.0, 1.0) diff --git a/neps/search_spaces/distributions/uniform_int.py b/neps/search_spaces/distributions/uniform_int.py deleted file mode 100644 index 8fd7b043..00000000 --- a/neps/search_spaces/distributions/uniform_int.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import TYPE_CHECKING -from typing_extensions import override - -import torch -from torch import Tensor - -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.domain import Domain - -if TYPE_CHECKING: - from neps.utils.types import Number - - -@dataclass(frozen=True) -class UniformIntDistribution(Distribution[int]): - domain: Domain[int] - _pdf: float = field(repr=False) - - @override - def sample(self, n: int, to: Domain, *, seed: torch.Generator) -> Tensor: - samples = torch.randint( - self.domain.lower, - self.domain.upper, - size=(n,), - generator=seed, - ) - return to.cast(samples, frm=self.domain) - - @override - def likelihood(self, value: Tensor) -> Tensor: - return torch.where( - (value >= self.domain.lower) & (value <= self.domain.upper), - self._pdf, - 0.0, - ) - - @classmethod - def indices(cls, n: int) -> UniformIntDistribution: - return cls(Domain.int(0, n - 1), _pdf=1.0 / n) - - @classmethod - def new(cls, lower: Number, upper: Number) -> UniformIntDistribution: - return cls(Domain.int(lower, upper), _pdf=1.0 / (upper - lower)) diff --git a/neps/search_spaces/distributions/weighted_ints.py b/neps/search_spaces/distributions/weighted_ints.py deleted file mode 100644 index 3c8c60c5..00000000 --- a/neps/search_spaces/distributions/weighted_ints.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import annotations - -import warnings -from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Sequence -from typing_extensions import override - -import torch -from torch import Tensor - -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.domain import Domain - -if TYPE_CHECKING: - from neps.utils.types import Number - - -@dataclass(frozen=True) -class WeightedIntsDistribution(Distribution[int]): - # NOTE: Having a Million weights is very resource intense and super slow - # for sampling, especially given our common use case is to have only one weight - # with the rest being uniform. 100 is well out of scope for what was intended, - # as this is mostly intended for categoricals. 
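The deleted UniformFloatDistribution.sample above leans on a common trick: draw once in the unit interval and rescale into the requested domain rather than sampling in the parameter's own range. A two-line sketch of that cast, where the linear rescaling stands in for Domain.cast:

import torch

unit = torch.rand(5, generator=torch.Generator().manual_seed(0))  # samples in [0, 1)
lower, upper = 1e-4, 1e-1                                         # illustrative target domain
print(lower + (upper - lower) * unit)                             # linear cast into [lower, upper)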
- # If we need this, then we should make a more efficient implementation, - # such as one that uniform samples and then with probability `weight` - # replaces the value with the favoured value. - LIMIT_FOR_WEIGHTED_INTS: ClassVar[int] = 200 - - domain: Domain[int] - weights: Tensor - - @override - def sample(self, n: int, to: Domain, *, seed: torch.Generator) -> Tensor: - rand_tensor = torch.multinomial( - self.weights, - n, - replacement=True, - generator=seed, - ) - return to.cast(rand_tensor, frm=self.domain) - - @override - def likelihood(self, value: Tensor) -> Tensor: - valid_indices = torch.logical_and( - value >= self.domain.lower, value <= self.domain.upper - ) - psuedo_indices = torch.where(valid_indices, value, 0) - probs = self.weights[psuedo_indices] - return torch.where(valid_indices, probs, 0) - - @classmethod - def new(cls, weights: Sequence[Number] | Tensor) -> WeightedIntsDistribution: - if len(weights) > cls.LIMIT_FOR_WEIGHTED_INTS: - raise ValueError( - f"Having {len(weights)} weights is very resource intense and slow" - " for sampling. Consider using a more efficient implementation" - " if you need this many weights.", - ) - return cls( - weights=torch.as_tensor(weights, dtype=torch.float64), - domain=Domain.indices(len(weights)), - ) - - @classmethod - def with_favoured( - cls, - n: int, - favoured: int, - confidence: float, - ) -> WeightedIntsDistribution: - if n > cls.LIMIT_FOR_WEIGHTED_INTS: - raise ValueError( - f"Having {n} weights is very resource intense and slow" - " for sampling. Consider using a more efficient implementation" - " if you need this many weights.", - ) - - assert 0.0 <= confidence <= 1.0 - remaining = 1.0 - confidence - rest = remaining / (n - 1) - if confidence < rest: - warnings.warn( - f"Weight {confidence} is less than the rest {rest}." 
- " This will make the favoured value less likely to be sampled" - " than the rest of the values.", - UserWarning, - stacklevel=2, - ) - dist = torch.full(size=(n,), fill_value=rest, dtype=torch.float64) - dist[favoured] = confidence - return cls(weights=dist, domain=Domain.indices(n)) diff --git a/neps/search_spaces/samplers/__init__.py b/neps/search_spaces/samplers/__init__.py deleted file mode 100644 index 784b5aa4..00000000 --- a/neps/search_spaces/samplers/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from neps.search_spaces.samplers.prior import PriorSampler -from neps.search_spaces.samplers.sampler import Sampler -from neps.search_spaces.samplers.uniform import UniformSampler - -__all__ = [ - "Sampler", - "UniformSampler", - "PriorSampler", -] diff --git a/neps/search_spaces/samplers/model.py b/neps/search_spaces/samplers/model.py deleted file mode 100644 index c413b6bf..00000000 --- a/neps/search_spaces/samplers/model.py +++ /dev/null @@ -1,186 +0,0 @@ -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Any, Mapping - -import numpy as np - -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, -) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping -from neps.search_spaces.samplers.sampler import Sampler -from neps.search_spaces.samplers.uniform import UniformSampler -from neps.utils.common import instance_from_map - -logger = logging.getLogger(__name__) - -if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) - from neps.search_spaces import SearchSpace - from neps.utils.types import Number - - -class ModelPolicy(Sampler): - """A policy for sampling configuration, i.e. the default for SH / hyperband. - - Args: - SamplingPolicy ([type]): [description] - """ - - def __init__( - self, - *, - space: SearchSpace, - surrogate_model: str | Any = "gp", - surrogate_model_args: Mapping[str, Any] | None = None, - domain_se_kernel: str | None = None, - graph_kernels: list | None = None, - hp_kernels: list | None = None, - acquisition: str | BaseAcquisition | type[BaseAcquisition] = "EI", - acquisition_sampler: ( - str | AcquisitionSampler | type[AcquisitionSampler] - ) = "random", - patience: int = 100, - ): - surrogate_model_args = dict(surrogate_model_args) if surrogate_model_args else {} - - graph_kernels, hp_kernels = get_kernels( - pipeline_space=space, - domain_se_kernel=domain_se_kernel, - graph_kernels=graph_kernels, - hp_kernels=hp_kernels, - optimal_assignment=False, - ) - - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = None - - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - - if not surrogate_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - - if "vectorial_features" not in surrogate_model_args: - # TODO: Graph gets ignored? 
- surrogate_model_args["vectorial_features"] = { - "continuous": len(space.numericals), - "categorical": len(space.categoricals), - } - - # TODO: What the hell type is this - self.surrogate_model: Any = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) - - self.acquisition: BaseAcquisition = instance_from_map( - AcquisitionMapping, - acquisition, # type: ignore - name="acquisition function", - ) - - self.acquisition_sampler: AcquisitionSampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, # type: ignore - name="acquisition sampler function", - kwargs={"patience": patience, "pipeline_space": space}, - ) - self.uniform_sampler = UniformSampler.new(space) - - def _fantasize_pending(self, train_x, train_y, pending_x): - if len(pending_x) == 0: - return train_x, train_y - - self.surrogate_model.fit(train_x, train_y) - # hallucinating: predict for the pending evaluations - _y, _ = self.surrogate_model.predict(pending_x) - _y = _y.detach().numpy().tolist() - # appending to training data - train_x.extend(pending_x) - train_y.extend(_y) - return train_x, train_y - - def update_model(self, train_x, train_y, pending_x, decay_t=None): - if decay_t is None: - decay_t = len(train_x) - train_x, train_y = self._fantasize_pending(train_x, train_y, pending_x) - self.surrogate_model.fit(train_x, train_y) - self.acquisition.set_state(self.surrogate_model, decay_t=decay_t) - # TODO: set_state should generalize to all options - # no needed to set state of sampler when using `random` - # self.acquisition_sampler.set_state(x=train_x, y=train_y) - - def sample( - self, - n: int, - *, - active_max_fidelity: Mapping[str, Number] | None = None, - fidelity: Mapping[str, Number] | None = None, - seed: np.random.Generator, - ) -> SearchSpace: - """Performs the equivalent of optimizing the acquisition function. - - Performs 2 strategies as per the arguments passed: - * If fidelity is not None, triggers the case when the surrogate has been - trained jointly with the fidelity dimension, i.e., all observations ever - recorded. In this case, the EI for random samples is evaluated at the - `fidelity` where the new sample will be evaluated. The top-10 are selected, - and the EI for them is evaluated at the target/mmax fidelity. - * If active_max_fidelity is not None, triggers the case when a surrogate is - trained per fidelity. In this case, all samples have their fidelity - variable set to the same value. This value is same as that of the fidelity - value of the configs in the training data. 
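The _fantasize_pending method above is the usual "hallucinate pending points" trick: fit the surrogate on finished trials, predict the still-running ones, and append those predictions as if they were real observations before refitting, so the next acquisition step avoids re-proposing the same regions. A hedged BoTorch sketch of the same idea, with SingleTaskGP and toy arrays standing in for the policy's surrogate:

import torch
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_x = torch.rand(12, 2, dtype=torch.float64)
train_y = train_x.sum(dim=-1, keepdim=True)
pending_x = torch.rand(3, 2, dtype=torch.float64)   # configs still being evaluated

gp = SingleTaskGP(train_x, train_y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(gp.likelihood, gp))

# Hallucinate outcomes for the pending configs with the posterior mean, then refit
# on the augmented data.
with torch.no_grad():
    fantasy_y = gp.posterior(pending_x).mean
gp_aug = SingleTaskGP(torch.cat([train_x, pending_x]), torch.cat([train_y, fantasy_y]))
fit_gpytorch_mll(ExactMarginalLogLikelihood(gp_aug.likelihood, gp_aug))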
- """ - logger.info("Acquiring...") - - # sampling random configurations - samples = [ - self.space.sample(user_priors=False, ignore_fidelity=True) - for _ in range(SAMPLE_THRESHOLD) - ] - - if fidelity is not None: - # w/o setting this flag, the AF eval will set all fidelities to max - self.acquisition.optimize_on_max_fidelity = False - _inc_copy = self.acquisition.incumbent - # TODO: better design required, for example, not import torch - # right now this case handles the 2-step acquisition in `sample` - if "incumbent" in kwargs: - # sets the incumbent to the best score at the required fidelity for - # correct computation of EI scores - self.acquisition.incumbent = torch.tensor(kwargs["incumbent"]) - # updating the fidelity of the sampled configurations - samples = list(map(update_fidelity, samples, [fidelity] * len(samples))) - # computing EI at the given `fidelity` - eis = self.acquisition.eval(x=samples, asscalar=True) - # extracting the 10 highest scores - _ids = np.argsort(eis)[-TOP_EI_SAMPLE_COUNT:] - samples = pd.Series(samples).iloc[_ids].values.tolist() - # setting the fidelity to the maximum fidelity - self.acquisition.optimize_on_max_fidelity = True - self.acquisition.incumbent = _inc_copy - - if active_max_fidelity is not None: - # w/o setting this flag, the AF eval will set all fidelities to max - self.acquisition.optimize_on_max_fidelity = False - fidelity = active_max_fidelity - samples = list(map(update_fidelity, samples, [fidelity] * len(samples))) - - # computes the EI for all `samples` - eis = self.acquisition.eval(x=samples, asscalar=True) - # extracting the highest scored sample - return samples[np.argmax(eis)] - # TODO: can generalize s.t. sampler works for all types, currently, - # random sampler in NePS does not do what is required here - # return self.acquisition_sampler.sample(self.acquisition) diff --git a/neps/search_spaces/samplers/prior.py b/neps/search_spaces/samplers/prior.py deleted file mode 100644 index 65165cae..00000000 --- a/neps/search_spaces/samplers/prior.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Mapping -from typing_extensions import Self, override - -from neps.search_spaces.config import Config -from neps.search_spaces.distributions.uniform_int import UniformIntDistribution -from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution -from neps.search_spaces.samplers.sampler import Sampler - -if TYPE_CHECKING: - import numpy as np - - from neps.search_spaces.distributions.distribution import Distribution - from neps.search_spaces.search_space import SearchSpace - - -@dataclass -class PriorSampler(Sampler): - search_space: SearchSpace - - _numerical_distributions: Mapping[str, Distribution] - _categorical_distributions: Mapping[str, Distribution] - - @override - def sample_configs( - self, - n: int, - *, - fidelity: Mapping[str, float] | None, - seed: np.random.Generator, - with_constants: bool = True, - ) -> list[Config]: - numerical_samples = {} - for k, dist in self._numerical_distributions.items(): - param = self.search_space.numericals[k] - numerical_samples[k] = dist.sample(n, to=param.domain, seed=seed) - - categorical_samples = {} - for k, dist in self._categorical_distributions.items(): - cat = self.search_space.categoricals[k] - domain = cat.domain - samples = dist.sample(n, to=domain, seed=seed) - choices = cat.lookup(samples) - categorical_samples[k] = choices - - graph_samples = {} - for k, v in 
self.search_space.graphs.items(): - graph_samples[k] = [v.sample() for _ in range(n)] - - _constants = self.search_space.constants if with_constants else {} - - return [ - Config( - values={ - **{k: samples[i] for k, samples in numerical_samples.items()}, - **{k: samples[i] for k, samples in categorical_samples.items()}, - **{k: samples[i] for k, samples in graph_samples.items()}, - **_constants, - }, - fidelity=fidelity, - ) - for i in range(n) - ] - - @classmethod - def new( - cls, - space: SearchSpace, - prior: Mapping[str, tuple[Any, float]], - *, - replace_missing_with_uniform: bool = True, - ) -> Self: - missing = set(space.hyperparameters) - set(prior.keys()) - if not replace_missing_with_uniform and any(missing): - raise ValueError( - "If `replace_missing_with_uniform` is False, the prior must be defined" - f" for all parameters. Missing prior for: {missing}" - ) - - numerical_distributions = { - hp_name: ( - hp.domain.truncnorm_distribution(center=p[0], confidence=p[1]) - if (p := prior.get(hp_name)) - else hp.domain.uniform_distribution() - ) - for hp_name, hp in space.numericals.items() - } - # NOTE: It would be nice to somehow check if the prior given for - # a categorical was an index or a value in the categorical. - # Since it's much more efficient to hold on to the index, we will - # assume that for now. - categorical_distribution = { - hp_name: ( - WeightedIntsDistribution.with_favoured( - n=cat.size, - favoured=cat.index(p[0]), - confidence=p[1], - ) - if (p := prior.get(hp_name)) - else UniformIntDistribution.indices(cat.size) - ) - for hp_name, cat in space.categoricals.items() - } - return cls( - space, - _numerical_distributions=numerical_distributions, - _categorical_distributions=categorical_distribution, - ) diff --git a/neps/search_spaces/samplers/sampler.py b/neps/search_spaces/samplers/sampler.py deleted file mode 100644 index f104a3a5..00000000 --- a/neps/search_spaces/samplers/sampler.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING, Mapping -from typing_extensions import Protocol - -if TYPE_CHECKING: - import numpy as np - - from neps.search_spaces.config import Config - from neps.utils.types import Number - - -@dataclass -class Sampler(Protocol): - def sample_configs( - self, - n: int, - *, - fidelity: Mapping[str, Number] | None, - seed: np.random.Generator, - ) -> list[Config]: ... 
diff --git a/neps/search_spaces/samplers/uniform.py b/neps/search_spaces/samplers/uniform.py deleted file mode 100644 index 88060932..00000000 --- a/neps/search_spaces/samplers/uniform.py +++ /dev/null @@ -1,79 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING, Mapping -from typing_extensions import Self, override - -from neps.search_spaces.config import Config -from neps.search_spaces.distributions.uniform_int import UniformIntDistribution -from neps.search_spaces.samplers.sampler import Sampler - -if TYPE_CHECKING: - import numpy as np - - from neps.search_spaces.distributions.distribution import Distribution - from neps.search_spaces.search_space import SearchSpace - - -@dataclass -class UniformSampler(Sampler): - search_space: SearchSpace - - _numerical_distributions: Mapping[str, Distribution] - _categorical_distributions: Mapping[str, Distribution] - - @override - def sample_configs( - self, - n: int, - *, - fidelity: Mapping[str, float] | None = None, - seed: np.random.Generator, - with_constants: bool = True, - ) -> list[Config]: - numerical_samples = {} - for k, dist in self._numerical_distributions.items(): - param = self.search_space.numericals[k] - numerical_samples[k] = dist.sample(n, to=param.domain, seed=seed) - - categorical_samples = {} - for k, dist in self._categorical_distributions.items(): - cat = self.search_space.categoricals[k] - domain = cat.domain - samples = dist.sample(n, to=domain, seed=seed) - choices = cat.lookup(samples) - categorical_samples[k] = choices - - graph_samples = {} - for k, v in self.search_space.graphs.items(): - graph_samples[k] = [v.sample() for _ in range(n)] - - _constants = self.search_space.constants if with_constants else {} - - return [ - Config( - { - **{k: samples[i] for k, samples in numerical_samples.items()}, - **{k: samples[i] for k, samples in categorical_samples.items()}, - **{k: samples[i] for k, samples in graph_samples.items()}, - **_constants, - }, - fidelity=fidelity, - ) - for i in range(n) - ] - - @classmethod - def new(cls, space: SearchSpace) -> Self: - numerical_distributions = { - k: p.domain.uniform_distribution() for k, p in space.numericals.items() - } - categorical_distribution = { - k: UniformIntDistribution.indices(p.size) - for k, p in space.categoricals.items() - } - return cls( - space, - _numerical_distributions=numerical_distributions, - _categorical_distributions=categorical_distribution, - ) diff --git a/neps/search_spaces/samplers/weighted_sampler.py b/neps/search_spaces/samplers/weighted_sampler.py deleted file mode 100644 index 32e51908..00000000 --- a/neps/search_spaces/samplers/weighted_sampler.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Mapping -from typing_extensions import Self, override - -import numpy as np - -from neps.search_spaces.samplers.sampler import Sampler -from neps.utils.types import Arr, Number, f64 - -if TYPE_CHECKING: - from neps.search_spaces.config import Config - - -@dataclass -class WeightedSampler(Sampler): - weights: dict[str, float] - samplers: dict[str, Sampler] - - _probabilities: Arr[f64] = field(init=False, repr=False, compare=False) - _samplers: Arr[np.str_] = field(init=False, repr=False, compare=False) - - def __post_init__(self): - probs = np.array(list(self.weights.values()), dtype=f64) - probs /= probs.sum() - self._probabilities = probs - self._samplers = np.asarray(sorted(self.samplers.keys()), 
dtype=np.str_) - - @override - def sample_configs( - self, - n: int, - *, - fidelity: Mapping[str, Number] | None, - seed: np.random.Generator, - ) -> list[Config]: - choices = seed.choice(self._samplers, size=n, p=self._probabilities) - keys, counts = np.unique(choices, return_counts=True) - - configs: list[Config] = [] - for key, count in zip(keys, counts): - sampler = self.samplers[key] - config_samples = sampler.sample_configs(count, fidelity=fidelity, seed=seed) - configs.extend(config_samples) - - return configs - - @classmethod - def equally_weighted(cls, samples: dict[str, Sampler]) -> Self: - return cls(weights={k: 1.0 for k in samples}, samplers=samples) From 03729ca18fe1e2940a5fb581139ca6b49d386522 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 18:56:32 +0200 Subject: [PATCH 26/63] fix: Return acquisition_functions still in use --- neps/{sampling => }/distributions.py | 2 +- .../acquisition_functions/_ehvi.py | 213 ++++++++++++++++++ .../acquisition_functions/base_acquisition.py | 17 ++ .../acquisition_functions/ei.py | 120 ++++++++++ .../acquisition_functions/mf_ei.py | 35 ++- .../acquisition_functions/prior_weighted.py | 111 +++++++++ .../acquisition_functions/ucb.py | 60 +++++ 7 files changed, 537 insertions(+), 21 deletions(-) rename neps/{sampling => }/distributions.py (99%) create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/ei.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py diff --git a/neps/sampling/distributions.py b/neps/distributions.py similarity index 99% rename from neps/sampling/distributions.py rename to neps/distributions.py index fb552949..2361e191 100644 --- a/neps/sampling/distributions.py +++ b/neps/distributions.py @@ -225,6 +225,6 @@ def log_prob(self, value): @dataclass -class TorchDistributionWithDomain: +class DistributionOverDomain: distribution: Distribution domain: Domain diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py new file mode 100644 index 00000000..8722c545 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py @@ -0,0 +1,213 @@ +# from abc import ABC, abstractmethod +from itertools import product + +import torch +from torch import Tensor +from torch.distributions import Normal +from torch.nn import Module + +# class MultiObjectiveBaseAcqusition(ABC): +# def __init__(self, surrogate_models: dict): +# self.surrogate_models = surrogate_models +# +# def propose_location(self, *args): +# """Propose new locations for subsequent sampling +# This method should be overriden by respective acquisition function implementations.""" +# raise NotImplementedError +# +# def optimize(self): +# """This is the method that user should call for the Bayesian optimisation main loop.""" +# raise NotImplementedError +# +# @abstractmethod +# def eval(self, x, asscalar: bool = False): +# """Evaluate the acquisition function at point x2. 
This should be overridden by respective acquisition +# function implementations""" +# raise NotImplementedError +# +# def __call__(self, *args, **kwargs): +# return self.eval(*args, **kwargs) +# +# def reset_surrogate_model(self, surrogate_models: dict): +# for objective, surrogate_model in surrogate_models.items(): +# self.surrogate_models[objective] = surrogate_model +# + + +class ExpectedHypervolumeImprovement(Module): # , MultiObjectiveBaseAcqusition): + def __init__( + self, + model, + ref_point, + partitioning, + ) -> None: + r"""Expected Hypervolume Improvement supporting m>=2 outcomes. + + Implementation from BOtorch, adapted from + https://github.com/pytorch/botorch/blob/353f37649fa8d90d881e8ea20c11986b15723ef1/botorch/acquisition/multi_objective/analytic.py#L78 + + This implements the computes EHVI using the algorithm from [Yang2019]_, but + additionally computes gradients via auto-differentiation as proposed by + [Daulton2020qehvi]_. + + Note: this is currently inefficient in two ways due to the binary partitioning + algorithm that we use for the box decomposition: + + - We have more boxes in our decomposition + - If we used a box decomposition that used `inf` as the upper bound for + the last dimension *in all hypercells*, then we could reduce the number + of terms we need to compute from 2^m to 2^(m-1). [Yang2019]_ do this + by using DKLV17 and LKF17 for the box decomposition. + + TODO: Use DKLV17 and LKF17 for the box decomposition as in [Yang2019]_ for + greater efficiency. + + TODO: Add support for outcome constraints. + + Example: + >>> model = SingleTaskGP(train_X, train_Y) + >>> ref_point = [0.0, 0.0] + >>> EHVI = ExpectedHypervolumeImprovement(model, ref_point, partitioning) + >>> ehvi = EHVI(test_X) + + Args: + model: A fitted model. + ref_point: A list with `m` elements representing the reference point (in the + outcome space) w.r.t. to which compute the hypervolume. This is a + reference point for the objective values (i.e. after applying + `objective` to the samples). + partitioning: A `NondominatedPartitioning` module that provides the non- + dominated front and a partitioning of the non-dominated space in hyper- + rectangles. + objective: An `AnalyticMultiOutputObjective`. + """ + # TODO: we could refactor this __init__ logic into a + # HypervolumeAcquisitionFunction Mixin + if len(ref_point) != partitioning.num_outcomes: + raise ValueError( + "The length of the reference point must match the number of outcomes. " + f"Got ref_point with {len(ref_point)} elements, but expected " + f"{partitioning.num_outcomes}." + ) + ref_point = torch.tensor( + ref_point, + dtype=partitioning.pareto_Y.dtype, + device=partitioning.pareto_Y.device, + ) + better_than_ref = (partitioning.pareto_Y > ref_point).all(dim=1) + if not better_than_ref.any() and partitioning.pareto_Y.shape[0] > 0: + raise ValueError( + "At least one pareto point must be better than the reference point." 
+ ) + super().__init__() + self.model = model + self.register_buffer("ref_point", ref_point) + self.partitioning = partitioning + cell_bounds = self.partitioning.get_hypercell_bounds() + self.register_buffer("cell_lower_bounds", cell_bounds[0]) + self.register_buffer("cell_upper_bounds", cell_bounds[1]) + # create indexing tensor of shape `2^m x m` + self._cross_product_indices = torch.tensor( + list(product(*[[0, 1] for _ in range(ref_point.shape[0])])), + dtype=torch.long, + device=ref_point.device, + ) + self.normal = Normal(0, 1) + + def psi(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> None: + r"""Compute Psi function. + + For each cell i and outcome k: + + Psi(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = ( + sigma_k * PDF((upper_{i,k} - mu_k) / sigma_k) + ( + mu_k - lower_{i,k} + ) * (1 - CDF(upper_{i,k} - mu_k) / sigma_k) + ) + + See Equation 19 in [Yang2019]_ for more details. + + Args: + lower: A `num_cells x m`-dim tensor of lower cell bounds + upper: A `num_cells x m`-dim tensor of upper cell bounds + mu: A `batch_shape x 1 x m`-dim tensor of means + sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped). + + Returns: + A `batch_shape x num_cells x m`-dim tensor of values. + """ + u = (upper - mu) / sigma + return sigma * self.normal.log_prob(u).exp() + (mu - lower) * ( + 1 - self.normal.cdf(u) + ) + + def nu(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> None: + r"""Compute Nu function. + + For each cell i and outcome k: + + nu(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = ( + upper_{i,k} - lower_{i,k} + ) * (1 - CDF((upper_{i,k} - mu_k) / sigma_k)) + + See Equation 25 in [Yang2019]_ for more details. + + Args: + lower: A `num_cells x m`-dim tensor of lower cell bounds + upper: A `num_cells x m`-dim tensor of upper cell bounds + mu: A `batch_shape x 1 x m`-dim tensor of means + sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped). + + Returns: + A `batch_shape x num_cells x m`-dim tensor of values. + """ + return (upper - lower) * (1 - self.normal.cdf((upper - mu) / sigma)) + + def forward(self, X: Tensor) -> Tensor: + posterior = [[_m.predict(_x) for _m in self.model] for _x in X] + mu = torch.tensor([[_m[0].item() for _m in _p] for _p in posterior])[:, None, :] + sigma = torch.tensor([[_s[1].item() for _s in _p] for _p in posterior])[ + :, None, : + ] + + # clamp here, since upper_bounds will contain `inf`s, which + # are not differentiable + cell_upper_bounds = self.cell_upper_bounds.clamp_max(1e8) + # Compute psi(lower_i, upper_i, mu_i, sigma_i) for i=0, ... m-2 + psi_lu = self.psi( + lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma + ) + # Compute psi(lower_m, lower_m, mu_m, sigma_m) + psi_ll = self.psi( + lower=self.cell_lower_bounds, + upper=self.cell_lower_bounds, + mu=mu, + sigma=sigma, + ) + # Compute nu(lower_m, upper_m, mu_m, sigma_m) + nu = self.nu( + lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma + ) + # compute the difference psi_ll - psi_lu + psi_diff = psi_ll - psi_lu + + # this is batch_shape x num_cells x 2 x (m-1) + stacked_factors = torch.stack([psi_diff, nu], dim=-2) + + # Take the cross product of psi_diff and nu across all outcomes + # e.g. 
for m = 2 + # for each batch and cell, compute + # [psi_diff_0, psi_diff_1] + # [nu_0, psi_diff_1] + # [psi_diff_0, nu_1] + # [nu_0, nu_1] + # this tensor has shape: `batch_shape x num_cells x 2^m x m` + all_factors_up_to_last = stacked_factors.gather( + dim=-2, + index=self._cross_product_indices.expand( + stacked_factors.shape[:-2] + self._cross_product_indices.shape + ), + ) + # compute product for all 2^m terms, + # sum across all terms and hypercells + return all_factors_up_to_last.prod(dim=-1).sum(dim=-1).sum(dim=-1) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py new file mode 100644 index 00000000..7249c0fd --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py @@ -0,0 +1,17 @@ +from abc import ABC, abstractmethod + + +class BaseAcquisition(ABC): + def __init__(self): + self.surrogate_model = None + + @abstractmethod + def eval(self, x, asscalar: bool = False): + """Evaluate the acquisition function at point x2.""" + raise NotImplementedError + + def __call__(self, *args, **kwargs): + return self.eval(*args, **kwargs) + + def set_state(self, surrogate_model, **kwargs): + self.surrogate_model = surrogate_model diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py new file mode 100644 index 00000000..1a4e24d0 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Sequence + +import torch +from torch.distributions import Normal + +from .base_acquisition import BaseAcquisition + +if TYPE_CHECKING: + import numpy as np + + from neps.search_spaces import SearchSpace + + +class ComprehensiveExpectedImprovement(BaseAcquisition): + def __init__( + self, + augmented_ei: bool = False, + xi: float = 0.0, + in_fill: str = "best", + log_ei: bool = False, + optimize_on_max_fidelity: bool = True, + ): + """This is the graph BO version of the expected improvement + key differences are: + + 1. The input x2 is a networkx graph instead of a vectorial input + + 2. The search space (a collection of x1_graphs) is discrete, so there is no + gradient-based optimisation. Instead, we compute the EI at all candidate points + and empirically select the best position during optimisation + + Args: + augmented_ei: Using the Augmented EI heuristic modification to the standard + expected improvement algorithm according to Huang (2006). + xi: manual exploration-exploitation trade-off parameter. + in_fill: the criterion to be used for in-fill for the determination of mu_star + 'best' means the empirical best observation so far (but could be + susceptible to noise), 'posterior' means the best *posterior GP mean* + encountered so far, and is recommended for optimization of more noisy + functions. Defaults to "best". + log_ei: log-EI if true otherwise usual EI. 
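For reference, the closed-form criterion that the eval method below implements, written for minimization with incumbent f*, exploration offset xi, and phi / Phi the standard normal pdf and cdf:

\[
u(x) = \frac{f^{*} - \mu(x) - \xi}{\sigma(x)}, \qquad
\mathrm{EI}(x) = \sigma(x)\,\varphi\bigl(u(x)\bigr) + \bigl(f^{*} - \mu(x) - \xi\bigr)\,\Phi\bigl(u(x)\bigr)
\]

The log_ei branch evaluates the analogous quantity for targets modelled in log space, and the augmented variant scales the result by a factor involving the likelihood noise.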
+ """ + super().__init__() + + if in_fill not in ["best", "posterior"]: + raise ValueError(f"Invalid value for in_fill ({in_fill})") + self.augmented_ei = augmented_ei + self.xi = xi + self.in_fill = in_fill + self.log_ei = log_ei + self.incumbent = None + self.optimize_on_max_fidelity = optimize_on_max_fidelity + + def eval( + self, + x: Sequence[SearchSpace], + asscalar: bool = False, + ) -> np.ndarray | torch.Tensor | float: + """Return the negative expected improvement at the query point x2.""" + assert self.incumbent is not None, "EI function not fitted on model" + + if x[0].has_fidelity and self.optimize_on_max_fidelity: + _x = [e.clone() for e in x] + for e in _x: + e.set_to_max_fidelity() + else: + _x = x + + mu, cov = self.surrogate_model.predict(_x) + + std = torch.sqrt(torch.diag(cov)) + mu_star = self.incumbent + + gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device)) + # u = (mu - mu_star - self.xi) / std + # ei = std * updf + (mu - mu_star - self.xi) * ucdf + if self.log_ei: + # we expect that f_min is in log-space + f_min = mu_star - self.xi + v = (f_min - mu) / std + ei = torch.exp(f_min) * gauss.cdf(v) - torch.exp( + 0.5 * torch.diag(cov) + mu + ) * gauss.cdf(v - std) + else: + u = (mu_star - mu - self.xi) / std + try: + ucdf = gauss.cdf(u) + except ValueError as e: + print(f"u: {u}") # noqa: T201 + print(f"mu_star: {mu_star}") # noqa: T201 + print(f"mu: {mu}") # noqa: T201 + print(f"std: {std}") # noqa: T201 + print(f"diag: {cov.diag()}") # noqa: T201 + raise e + updf = torch.exp(gauss.log_prob(u)) + ei = std * updf + (mu_star - mu - self.xi) * ucdf + if self.augmented_ei: + sigma_n = self.surrogate_model.likelihood + ei *= 1.0 - torch.sqrt(torch.tensor(sigma_n, device=mu.device)) / torch.sqrt( + sigma_n + torch.diag(cov) + ) + if isinstance(_x, list) and asscalar: + return ei.detach().numpy() + if asscalar: + ei = ei.detach().numpy().item() + return ei + + def set_state(self, surrogate_model, **kwargs): + super().set_state(surrogate_model, **kwargs) + + # Compute incumbent + if self.in_fill == "best": + self.incumbent = torch.min(self.surrogate_model.y_) + else: + x = self.surrogate_model.x + mu_train, _ = self.surrogate_model.predict(x) + incumbent_idx = torch.argmin(mu_train) + self.incumbent = self.surrogate_model.y_[incumbent_idx] diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py index c8502ca1..3d19040d 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py @@ -1,28 +1,22 @@ -# Left in as reference for now. 
# type: ignore -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, Iterable +from typing import Any, Iterable, Tuple, Union import numpy as np import pandas as pd import torch from torch.distributions import Normal -from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids - +from ....optimizers.utils import map_real_hyperparameters_from_tabular_ids +from ....search_spaces.search_space import SearchSpace +from ...multi_fidelity.utils import MFObservedData from .ei import ComprehensiveExpectedImprovement -if TYPE_CHECKING: - from neps.optimizers.multi_fidelity.utils import MFObservedData - from neps.search_spaces.search_space import SearchSpace - class MFEI(ComprehensiveExpectedImprovement): def __init__( self, pipeline_space: SearchSpace, - surrogate_model_name: str | None = None, + surrogate_model_name: str = None, augmented_ei: bool = False, xi: float = 0.0, in_fill: str = "best", @@ -38,7 +32,7 @@ def __init__( def get_budget_level(self, config) -> int: return int((config.fidelity.value - config.fidelity.lower) / self.b_step) - def preprocess(self, x: pd.Series) -> tuple[Iterable, Iterable]: + def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]: """Prepares the configurations for appropriate EI calculation. Takes a set of points and computes the budget and incumbent for each point, as @@ -71,7 +65,7 @@ def preprocess(self, x: pd.Series) -> tuple[Iterable, Iterable]: budget_list.append(self.get_budget_level(config)) # Drop unused configs - x = x.drop(labels=indices_to_drop) + x.drop(labels=indices_to_drop, inplace=True) performances = self.observations.get_best_performance_for_each_budget() inc_list = [] @@ -84,11 +78,11 @@ def preprocess(self, x: pd.Series) -> tuple[Iterable, Iterable]: return x, torch.Tensor(inc_list) - def preprocess_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: + def preprocess_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: x, inc_list = self.preprocess(x) return x.values.tolist(), inc_list - def preprocess_deep_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: + def preprocess_deep_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: x, inc_list = self.preprocess(x) x_lcs = [] for idx in x.index: @@ -103,7 +97,7 @@ def preprocess_deep_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: self.surrogate_model.set_prediction_learning_curves(x_lcs) return x.values.tolist(), inc_list - def preprocess_pfn(self, x: Iterable) -> tuple[Iterable, Iterable, Iterable]: + def preprocess_pfn(self, x: Iterable) -> Tuple[Iterable, Iterable, Iterable]: """Prepares the configurations for appropriate EI calculation. 
Takes a set of points and computes the budget and incumbent for each point, as @@ -120,7 +114,7 @@ def preprocess_pfn(self, x: Iterable) -> tuple[Iterable, Iterable, Iterable]: ) / self.b_step return _x_tok, _x, inc_list - def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Series]: + def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Series]: # _x = x.copy() # preprocessing needs to change the reference x Series so we don't copy here if self.surrogate_model_name == "pfn": _x_tok, _x, inc_list = self.preprocess_pfn( @@ -149,7 +143,7 @@ def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Ser def eval_pfn_ei( self, x: Iterable, inc_list: Iterable - ) -> np.ndarray | torch.Tensor | float: + ) -> Union[np.ndarray, torch.Tensor, float]: """PFN-EI modified to preprocess samples and accept list of incumbents.""" # x, inc_list = self.preprocess(x) # IMPORTANT change from vanilla-EI # _x = x.copy() @@ -160,7 +154,7 @@ def eval_pfn_ei( def eval_gp_ei( self, x: Iterable, inc_list: Iterable - ) -> np.ndarray | torch.Tensor | float: + ) -> Union[np.ndarray, torch.Tensor, float]: """Vanilla-EI modified to preprocess samples and accept list of incumbents.""" # x, inc_list = self.preprocess(x) # IMPORTANT change from vanilla-EI _x = x.copy() @@ -200,7 +194,7 @@ def set_state( pipeline_space: SearchSpace, surrogate_model: Any, observations: MFObservedData, - b_step: int | float, + b_step: Union[int, float], **kwargs, ): # overload to select incumbent differently through observations @@ -208,3 +202,4 @@ def set_state( self.surrogate_model = surrogate_model self.observations = observations self.b_step = b_step + return diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py new file mode 100644 index 00000000..8a735d58 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterable +from typing_extensions import override + +import numpy as np +import torch +from botorch.acquisition import MCAcquisitionFunction + +from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, +) + +if TYPE_CHECKING: + from neps.priors import Prior + + +class PiboAcquisition(MCAcquisitionFunction): + """Compute a prior weighted acquisition function according to PiBO. + + * https://arxiv.org/pdf/2204.11051 + """ + + def __init__( + self, + acq_fn: MCAcquisitionFunction, + prior: Prior, + beta: float, + n: float, + ): + """Initialize the acquisition function. + + Args: + acq_fn: The acquisition function to be weighted. + prior: The prior distribution to be used for weighting. + beta: The beta parameter for weighting. + n: The denominator for the beta parameter. + """ + self._log = self.acq_fn._log + self.acq_fn = acq_fn + + self.beta = beta + self.n = n + self.prior = prior + + @override + def forward(self, X: torch.Tensor) -> torch.Tensor: + weight = self.beta / self.n + acq = self.acq_fn(X) + + # The weight is shown as being applied to the pdf and not the log_pdf + values = acq * self.prior.prob(X) * weight + + # However, if the base acq function advertises as being log, + # i.e. 
self._log, then we should return the log of the values + return torch.log(values) if self._log else values + + +class DecayingPriorWeightedAcquisition(BaseAcquisition): + def __init__( + self, + base_acquisition, + pibo_beta=10, + log: bool = False, + ): + super().__init__() + self.pibo_beta = pibo_beta + self.base_acquisition = base_acquisition + self.log = log + self.decay_t = 0.0 + + def eval( + self, + x: Iterable, + **base_acquisition_kwargs, + ) -> np.ndarray | torch.Tensor | float: + acquisition = self.base_acquisition(x, **base_acquisition_kwargs) + + if self.log: + min_acq_val = abs(min(acquisition)) if min(acquisition) < 0 else 0 + + for i, candidate in enumerate(x): + prior_weight = candidate.compute_prior(log=self.log) + if prior_weight != 1.0: + if self.log: + # for log -> the smaller the prior_weight, + # the more unlikely it is from the prior + # also shift acquisition values to avoid negativ values + acquisition[i] = ( + np.log(acquisition[i] + min_acq_val + 1e-12) + + (self.pibo_beta / self.decay_t) * prior_weight + ) + else: + acquisition[i] *= np.power( + prior_weight + 1e-12, self.pibo_beta / self.decay_t + ) + return acquisition + + def set_state(self, surrogate_model, **kwargs): + if "decay_t" in kwargs: + decay_t = kwargs.pop("decay_t") + else: + train_x = surrogate_model.x + if train_x[0].has_fidelity: + decay_t = np.sum( + [float(_x.fidelity.value >= _x.fidelity.upper) for _x in train_x] + ) + else: + decay_t = len(train_x) + self.decay_t = decay_t + self.base_acquisition.set_state(surrogate_model, **kwargs) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py new file mode 100644 index 00000000..adf57266 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py @@ -0,0 +1,60 @@ +from typing import Iterable, Union + +import numpy as np +import torch + +from .base_acquisition import BaseAcquisition + + +class UpperConfidenceBound(BaseAcquisition): + def __init__(self, beta: float=1.0, maximize: bool=False): + """Upper Confidence Bound (UCB) acquisition function. + + Args: + beta: Controls the balance between exploration and exploitation. + maximize: If True, maximize the given model, else minimize. + DEFAULT=False, assumes minimzation. 
+ """ + super().__init__() + self.beta = beta # can be updated as part of the state for dynamism or a schedule + self.maximize = maximize + + # to be initialized as part of the state + self.surrogate_model = None + + def set_state(self, surrogate_model, **kwargs): + super().set_state(surrogate_model) + self.surrogate_model = surrogate_model + if "beta" in kwargs: + if not isinstance(kwargs["beta"], (list, np.array)): + self.beta = kwargs["beta"] + else: + self.logger.warning("Beta is a list, not updating beta value!") + + def eval( + self, x: Iterable, asscalar: bool = False + ) -> Union[np.ndarray, torch.Tensor, float]: + try: + mu, cov = self.surrogate_model.predict(x) + std = torch.sqrt(torch.diag(cov)) + except ValueError as e: + raise e + sign = 1 if self.maximize else -1 # LCB is performed if minimize=True + ucb_scores = mu + sign * np.sqrt(self.beta) * std + # if LCB, minimize acquisition, or maximize -acquisition + ucb_scores = ucb_scores.detach().numpy() * sign + + return ucb_scores + + +class MF_UCB(UpperConfidenceBound): + + def preprocess(self, x: Iterable) -> Iterable: + performances = self.observations.get_best_performance_for_each_budget() + pass + + def eval( + self, x: Iterable, asscalar: bool = False + ) -> Union[np.ndarray, torch.Tensor, float]: + x = self.preprocess(x) + return self.eval(x, asscalar=asscalar) From 16c27f8e518e584643d75f5c0d5bd729969acbec Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 19:40:40 +0200 Subject: [PATCH 27/63] fix: Some minor cleanup fixes --- neps/optimizers/__init__.py | 10 +- neps/{ => sampling}/distributions.py | 2 +- neps/state/neps_state.py | 140 +++++++++++++-------------- 3 files changed, 73 insertions(+), 79 deletions(-) rename neps/{ => sampling}/distributions.py (99%) diff --git a/neps/optimizers/__init__.py b/neps/optimizers/__init__.py index 31cb4c4a..518952cd 100644 --- a/neps/optimizers/__init__.py +++ b/neps/optimizers/__init__.py @@ -1,11 +1,9 @@ from __future__ import annotations from functools import partial -from typing import Callable, Mapping +from typing import TYPE_CHECKING, Callable, Mapping from .base_optimizer import BaseOptimizer -from .bayesian_optimization.cost_cooling import CostCooling -from .bayesian_optimization.mf_tpe import MultiFidelityPriorWeightedTreeParzenEstimator from .bayesian_optimization.optimizer import BayesianOptimization from .grid_search.optimizer import GridSearch from .multi_fidelity.dyhpo import MFEIBO @@ -26,13 +24,14 @@ from .random_search.optimizer import RandomSearch from .regularized_evolution.optimizer import RegularizedEvolution +if TYPE_CHECKING: + from .base_optimizer import BaseOptimizer + # TODO: Rename Searcher to Optimizer... 
SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = { "bayesian_optimization": BayesianOptimization, "pibo": partial(BayesianOptimization, disable_priors=False), - "cost_cooling_bayesian_optimization": CostCooling, "random_search": RandomSearch, - "cost_cooling": CostCooling, "regularized_evolution": RegularizedEvolution, "assisted_regularized_evolution": partial(RegularizedEvolution, assisted=True), "grid_search": GridSearch, @@ -41,7 +40,6 @@ "asha": AsynchronousSuccessiveHalving, "hyperband": Hyperband, "asha_prior": AsynchronousSuccessiveHalvingWithPriors, - "multifidelity_tpe": MultiFidelityPriorWeightedTreeParzenEstimator, "hyperband_custom_default": HyperbandCustomDefault, "priorband": PriorBand, "mobster": MOBSTER, diff --git a/neps/distributions.py b/neps/sampling/distributions.py similarity index 99% rename from neps/distributions.py rename to neps/sampling/distributions.py index 2361e191..fb552949 100644 --- a/neps/distributions.py +++ b/neps/sampling/distributions.py @@ -225,6 +225,6 @@ def log_prob(self, value): @dataclass -class DistributionOverDomain: +class TorchDistributionWithDomain: distribution: Distribution domain: Domain diff --git a/neps/state/neps_state.py b/neps/state/neps_state.py index 163679d8..8afaee62 100644 --- a/neps/state/neps_state.py +++ b/neps/state/neps_state.py @@ -32,75 +32,6 @@ Loc = TypeVar("Loc") T = TypeVar("T") -def sample_trial( - neps_state, - optimizer: BaseOptimizer, - *, - worker_id: str, - _sample_hooks: list[Callable] | None = None, -) -> Trial: - """Sample a new trial from the optimizer. - - Args: - optimizer: The optimizer to sample the trial from. - worker_id: The worker that is sampling the trial. - _sample_hooks: A list of hooks to apply to the optimizer before sampling. - - Returns: - The new trial. - """ - with neps_state._optimizer_state.acquire() as ( - opt_state, - put_opt, - ), neps_state._seed_state.acquire() as (seed_state, put_seed_state): - trials: dict[Trial.ID, Trial] = {} - for trial_id, shared_trial in neps_state._trials.all().items(): - trial = shared_trial.synced() - trials[trial_id] = trial - - seed_state.set_as_global_seed_state() - - # TODO: Not sure if any existing pre_load hooks required - # it to be done after `load_results`... I hope not. - if _sample_hooks is not None: - for hook in _sample_hooks: - optimizer = hook(optimizer) - - # NOTE: We don't want optimizers mutating this before serialization - budget = opt_state.budget.clone() if opt_state.budget is not None else None - sampled_config, new_opt_state = optimizer.ask( - trials=trials, - budget_info=budget, - optimizer_state=opt_state.shared_state, - ) - - if sampled_config.previous_config_id is not None: - previous_trial = trials.get(sampled_config.previous_config_id) - if previous_trial is None: - raise ValueError( - f"Previous trial '{sampled_config.previous_config_id}' not found." 
- ) - previous_trial_location = previous_trial.metadata.location - else: - previous_trial_location = None - - trial = Trial.new( - trial_id=sampled_config.id, - location="", # HACK: This will be set by the `TrialRepo` - config=sampled_config.config, - previous_trial=sampled_config.previous_config_id, - previous_trial_location=previous_trial_location, - time_sampled=time.time(), - worker_id=worker_id, - ) - shared_trial = neps_state._trials.put_new(trial) - seed_state.recapture() - put_seed_state(seed_state) - put_opt( - OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) - ) - - return trial @dataclass class NePSState(Generic[Loc]): @@ -140,10 +71,75 @@ def get_trials_by_ids(self, trial_ids: list[str], /) -> dict[str, Trial | None]: for _id, shared_trial in self._trials.get_by_ids(trial_ids).items() } - def get_optimizer_instance(self) -> BaseOptimizer: - """Get the optimizer instance.""" - raise NotImplementedError + def sample_trial( + self, + optimizer: BaseOptimizer, + *, + worker_id: str, + _sample_hooks: list[Callable] | None = None, + ) -> Trial: + """Sample a new trial from the optimizer. + + Args: + optimizer: The optimizer to sample the trial from. + worker_id: The worker that is sampling the trial. + _sample_hooks: A list of hooks to apply to the optimizer before sampling. + Returns: + The new trial. + """ + with self._optimizer_state.acquire() as ( + opt_state, + put_opt, + ), self._seed_state.acquire() as (seed_state, put_seed_state): + trials: dict[Trial.ID, Trial] = {} + for trial_id, shared_trial in self._trials.all().items(): + trial = shared_trial.synced() + trials[trial_id] = trial + + seed_state.set_as_global_seed_state() + + # TODO: Not sure if any existing pre_load hooks required + # it to be done after `load_results`... I hope not. + if _sample_hooks is not None: + for hook in _sample_hooks: + optimizer = hook(optimizer) + + # NOTE: We don't want optimizers mutating this before serialization + budget = opt_state.budget.clone() if opt_state.budget is not None else None + sampled_config, new_opt_state = optimizer.ask( + trials=trials, + budget_info=budget, + optimizer_state=opt_state.shared_state, + ) + + if sampled_config.previous_config_id is not None: + previous_trial = trials.get(sampled_config.previous_config_id) + if previous_trial is None: + raise ValueError( + f"Previous trial '{sampled_config.previous_config_id}' not found." 
+ ) + previous_trial_location = previous_trial.metadata.location + else: + previous_trial_location = None + + trial = Trial.new( + trial_id=sampled_config.id, + location="", # HACK: This will be set by the `TrialRepo` + config=sampled_config.config, + previous_trial=sampled_config.previous_config_id, + previous_trial_location=previous_trial_location, + time_sampled=time.time(), + worker_id=worker_id, + ) + shared_trial = self._trials.put_new(trial) + seed_state.recapture() + put_seed_state(seed_state) + put_opt( + OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) + ) + + return trial def report_trial_evaluation( self, From da4f376e20c64fa338f6e28564ee17d12cf73ce2 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 19:40:59 +0200 Subject: [PATCH 28/63] optim: Switch to just additive kernel --- .../bayesian_optimization/models/gp.py | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 307f806b..d8709c2a 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -5,12 +5,12 @@ from functools import reduce from typing import TYPE_CHECKING, Any, Mapping, TypeVar +from botorch.models import MultiTaskGP import gpytorch import gpytorch.constraints import torch from botorch.acquisition.analytic import SingleTaskGP -from botorch.models import MixedSingleTaskGP -from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood +from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood, MixedSingleTaskGP from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import MaternKernel, ScaleKernel @@ -190,31 +190,29 @@ def default_single_obj_gp( return gp, likelihood # Mixed - def cont_kernel_factory( - batch_shape: torch.Size, - ard_num_dims: int, - active_dims: list[int], - ) -> ScaleKernel: - lengthscale_prior, lengthscale_constraint = default_lengthscale_prior( - ard_num_dims - ) - return ScaleKernel( - MaternKernel( - nu=2.5, - batch_shape=batch_shape, - ard_num_dims=ard_num_dims, - active_dims=active_dims, - lengthscale_prior=lengthscale_prior, - lengthscale_constraint=lengthscale_constraint, - ), - ) + numeric_kernel = default_matern_kernel(len(numerics), active_dims=tuple(numerics)) + cat_kernel = default_categorical_kernel( + len(categoricals), active_dims=tuple(categoricals) + ) + + # WARNING: I previously tried SingleTaskMixedGp which does the following: + # + # x K((x1, c1), (x2, c2)) = + # x K_cont_1(x1, x2) + K_cat_1(c1, c2) + + # x K_cont_2(x1, x2) * K_cat_2(c1, c2) + # + # In a toy example with a single binary categorical which acted like F * {0, 1}, + # the model collapsed to always predicting `0`. Causing all parameters defining F + # to essentially be guess at random. This is a lot more stable while testing... + # TODO: Figure out why... 
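# Sketch of the contrast described in the warning above (notation illustrative only):
#   MixedSingleTaskGP:  K((x1, c1), (x2, c2)) = K_num(x1, x2) + K_cat(c1, c2)
#                                               + K_num'(x1, x2) * K_cat'(c1, c2)
#   this model:         K((x1, c1), (x2, c2)) = K_num(x1, x2) + K_cat(c1, c2)
# i.e. only the additive term is kept, which was observed to be more stable in testing.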
+ kernel = numeric_kernel + cat_kernel - gp = MixedSingleTaskGP( + gp = SingleTaskGP( train_X=x.tensor, train_Y=y, - cat_dims=categoricals, + mean_module=default_mean(), likelihood=likelihood, - cont_kernel_factory=cont_kernel_factory, + covar_module=kernel, outcome_transform=Standardize(m=1), ) return gp, likelihood @@ -232,8 +230,8 @@ def optimize_acq( ) -> tuple[torch.Tensor, torch.Tensor]: acq_options = acq_options or {} - lower = [domain.lower for domain in encoder.domains.values()] - upper = [domain.upper for domain in encoder.domains.values()] + lower = [domain.lower for domain in encoder.domains] + upper = [domain.upper for domain in encoder.domains] bounds = torch.tensor([lower, upper], dtype=torch.float) cat_transformers = { From 5f76a7e3ffe642cb8092c403721ce604a16dba66 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 30 Aug 2024 15:46:01 +0200 Subject: [PATCH 29/63] feat: Stable pibo implementation --- neps/optimizers/__init__.py | 5 +- .../acquisition_functions/pibo.py | 10 +- .../weighted_acquisition.py | 9 +- .../bayesian_optimization/models/gp.py | 6 +- .../bayesian_optimization/optimizer.py | 97 +++++++++++-------- .../bayesian_optimization.yaml | 1 - neps/optimizers/default_searchers/pibo.yaml | 1 - neps/runtime.py | 2 - neps/sampling/distributions.py | 64 ++++++++++-- neps/sampling/priors.py | 86 ++++++++++------ neps/search_spaces/domain.py | 2 +- .../hyperparameters/categorical.py | 2 + neps_examples/basic_usage/hyperparameters.py | 27 ++++-- 13 files changed, 210 insertions(+), 102 deletions(-) diff --git a/neps/optimizers/__init__.py b/neps/optimizers/__init__.py index 518952cd..74421687 100644 --- a/neps/optimizers/__init__.py +++ b/neps/optimizers/__init__.py @@ -19,7 +19,6 @@ SuccessiveHalving, SuccessiveHalvingWithPriors, ) -from .multi_fidelity_prior.async_priorband import PriorBandAsha, PriorBandAshaHB from .multi_fidelity_prior.priorband import PriorBand from .random_search.optimizer import RandomSearch from .regularized_evolution.optimizer import RegularizedEvolution @@ -29,8 +28,8 @@ # TODO: Rename Searcher to Optimizer... 
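# Usage sketch (hypothetical `space`): every value in the mapping defined just below
# is either an optimizer class or a `functools.partial` with pre-bound flags, so both
# are resolved and instantiated the same way.
builder = SearcherMapping["pibo"]           # partial(BayesianOptimization, use_priors=True)
optimizer = builder(pipeline_space=space)   # remaining arguments supplied by the caller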
SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = { - "bayesian_optimization": BayesianOptimization, - "pibo": partial(BayesianOptimization, disable_priors=False), + "bayesian_optimization": partial(BayesianOptimization, use_priors=False), + "pibo": partial(BayesianOptimization, use_priors=True), "random_search": RandomSearch, "regularized_evolution": RegularizedEvolution, "assisted_regularized_evolution": partial(RegularizedEvolution, assisted=True), diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py index 0f1668f1..76499ba1 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py @@ -39,9 +39,15 @@ def apply_pibo_acquisition_weight( x_domain: Domain | list[Domain], prior_exponent: float, ): + import rich + + rich.print(prior_exponent) if acq._log: - return acq_values + prior.log_prob(X, frm=x_domain) * prior_exponent - return acq_values * prior.prob(X, frm=x_domain).pow(prior_exponent) + weighted_log_probs = prior.log_prob(X, frm=x_domain) * prior_exponent + return acq_values + weighted_log_probs + + weighted_probs = prior.prob(X, frm=x_domain).pow(prior_exponent) + return acq_values * weighted_probs def pibo_acquisition( diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py index 488c57f4..eadf9207 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -115,7 +115,7 @@ def __init__( # NOTE: We remove the X_pending from the base acquisition function as we will get # it in our own forward with `@concatenate_pending_points` and pass that forward. 
# This avoids possible duplicates - self.acq.set_X_pending(None) + acq.set_X_pending(None) self.set_X_pending(X_pending) self.apply_weight = apply_weight self.acq = acq @@ -136,10 +136,11 @@ def forward(self, X: Tensor) -> Tensor: """ if isinstance(self.acq, SampleReducingMCAcquisitionFunction): # shape: mc_samples x batch x q-candidates - acq_values = self.acq._non_reduce_forward(X) + acq_values = self.acq._non_reduced_forward(X) weighted_acq_values = self.apply_weight(acq_values, X, self.acq) - vals = self.acq._sample_reduction(self.acq._q_reduction(weighted_acq_values)) - return vals.squeeze(-1) + q_reduced_acq = self.acq._q_reduction(weighted_acq_values) + sample_reduced_acq = self.acq._sample_reduction(q_reduced_acq) + return sample_reduced_acq.squeeze(-1) # shape: batch x q-candidates acq_values = self.acq(X).unsqueeze(-1) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index d8709c2a..39e81cba 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -5,12 +5,14 @@ from functools import reduce from typing import TYPE_CHECKING, Any, Mapping, TypeVar -from botorch.models import MultiTaskGP import gpytorch import gpytorch.constraints import torch from botorch.acquisition.analytic import SingleTaskGP -from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood, MixedSingleTaskGP +from botorch.models.gp_regression_mixed import ( + CategoricalKernel, + Likelihood, +) from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import MaternKernel, ScaleKernel diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 2c4e5eeb..7d57e936 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -4,10 +4,7 @@ from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping import torch -from botorch.acquisition import ( - LinearMCObjective, - qLogExpectedImprovement, -) +from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement from botorch.fit import fit_gpytorch_mll from gpytorch import ExactMarginalLogLikelihood @@ -95,25 +92,34 @@ def _missing_cost_strategy(cost: torch.Tensor) -> torch.Tensor: return _missing_fill_strategy(cost, strategy="3std", lower_is_better=True) -def _pibo_exp_term(n_sampled_already: int, ndims: int, budget_info: BudgetInfo) -> float: - if budget_info.max_evaluations is not None: - # From the PIBO paper (Section 4.1) - # https://arxiv.org/pdf/2204.11051 - n = n_sampled_already - beta = budget_info.max_evaluations / 10 - elif budget_info.max_cost_budget is not None: - # This might not work well if cost number is high - # early on, but it will start to normalize. - n = budget_info.used_cost_budget - beta = budget_info.max_cost_budget / 10 - else: - # Otherwise, just some random heuristic based on the number - # of trials and dimensionality of the search space - # TODO: Think about and evaluate this more. - n = n_sampled_already - beta = ndims**2 / 10 - - return beta / n +def _pibo_exp_term( + n_sampled_already: int, + ndims: int, + initial_design_size: int, +) -> float: + # pibo paper + # https://arxiv.org/pdf/2204.11051 + # + # they use some constant determined from max problem budget. seems impractical, + # given we might not know the final budget (i.e. 
imagine you iteratively increase + # the budget as you go along). + # + # instead, we base it on the fact that in lower dimensions, we don't to rely + # on the prior for too long as the amount of space you need to cover around the + # prior is fairly low. effectively, since the gp needs little samples to + # model pretty effectively in low dimension, we can derive the utility from + # the prior pretty quickly. + # + # however, for high dimensional settings, we want to rely longer on the prior + # for longer as the number of samples needed to model the area around the prior + # is much larger, and deriving the utility will take longer. + # + # in the end, we would like some curve going from 1->0 as n->inf, where `n` is + # the number of samples we have done so far. + # the easiest function that does this is `exp(-n)`, with some discounting of `n` + # dependant on the number of dimensions. + n_bo_samples = n_sampled_already - initial_design_size + return math.exp(-n_bo_samples / ndims) def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float: @@ -214,12 +220,6 @@ def __init__( # noqa: D417 raise NotImplementedError("Only supports flat search spaces for now!") super().__init__(pipeline_space=pipeline_space) - if initial_design_size is None: - N = len(pipeline_space.hyperparameters) - initial_design_size = int(max(2, math.log(N) ** 2)) - elif initial_design_size < 1: - raise ValueError("Initial_design_size to be at least 1") - params: dict[str, CategoricalParameter | FloatParameter | IntegerParameter] = { **pipeline_space.numerical, **pipeline_space.categoricals, @@ -227,6 +227,14 @@ def __init__( # noqa: D417 if treat_fidelity_as_hyperparameters: params.update(pipeline_space.fidelities) + if initial_design_size is None: + # As we have fairly regularized GPs, who start with a more smooth landscape + # model, we don't need a high level of initial samples. + ndims = len(params) + initial_design_size = max(2, int(math.log(ndims) ** 2)) + elif initial_design_size < 1: + raise ValueError("Initial_design_size to be at least 1") + self.encoder = TensorEncoder.default(params) if encoder is None else encoder self.use_cost = use_cost self.prior = _make_prior(params) if use_priors is True else None @@ -251,9 +259,9 @@ def ask( "Seed is not yet implemented for BayesianOptimization" ) - n_trials = len(trials) + n_trials_completed = len(trials) space = self.pipeline_space - config_id = str(n_trials + 1) + config_id = str(n_trials_completed + 1) # Fill intitial design data if we don't have any... if self.initial_design_ is None: @@ -279,8 +287,8 @@ def ask( self.initial_design_.extend(configs) # If we havn't passed the intial design phase - if n_trials <= len(self.initial_design_): - config = self.initial_design_[n_trials - 1] + if n_trials_completed < len(self.initial_design_): + config = self.initial_design_[n_trials_completed] sample = SampledConfig(id=config_id, config=config, previous_config_id=None) return sample, optimizer_state @@ -331,14 +339,25 @@ def ask( # If we should use the prior, weight the acquisition function by # the probability of it being sampled from the prior. 
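# Minimal worked sketch (hypothetical numbers, not from a real run): how the weight
# returned by `_pibo_exp_term` above decays once BO takes over from the initial
# design, here for a 5-dimensional space with an initial design of 5 configurations.
for n_completed in (5, 10, 20, 40):
    w = _pibo_exp_term(n_completed, ndims=5, initial_design_size=5)
    print(n_completed, round(w, 4))  # 5 -> 1.0, 10 -> 0.3679, 20 -> 0.0498, 40 -> 0.0009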
if self.prior: - acq = pibo_acquisition( - acq, - prior=self.prior, - prior_exponent=_pibo_exp_term(n_trials, self.encoder.ncols, budget_info), - x_domain=self.encoder.domains, - X_pending=maybe_x_pending_tensor, + pibo_exp_term = _pibo_exp_term( + n_trials_completed, + self.encoder.ncols, + self.n_initial_design, ) + # If the amount of weight derived from the pibo exponent becomes + # insignificant, we don't use it as it as it adds extra computational + # burden and introduces more chance of numerical instability. + significant_lower_bound = 1e-4 + if pibo_exp_term > significant_lower_bound: + acq = pibo_acquisition( + acq, + prior=self.prior, + prior_exponent=pibo_exp_term, + x_domain=self.encoder.domains, + X_pending=maybe_x_pending_tensor, + ) + # If we should use cost, weight the acquisition function by the cost # of the configurations. if self.use_cost: diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml index cf3717ab..fb43f97b 100644 --- a/neps/optimizers/default_searchers/bayesian_optimization.yaml +++ b/neps/optimizers/default_searchers/bayesian_optimization.yaml @@ -1,6 +1,5 @@ strategy: bayesian_optimization # Arguments that can be modified by the user -initial_design_size: 10 surrogate_model: gp # or {"gp_hierarchy"} acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml index 9c386069..8b514ba8 100644 --- a/neps/optimizers/default_searchers/pibo.yaml +++ b/neps/optimizers/default_searchers/pibo.yaml @@ -1,6 +1,5 @@ strategy: pibo # Arguments that can be modified by the user -initial_design_size: 10 surrogate_model: gp # or {"gp_hierarchy"} acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false diff --git a/neps/runtime.py b/neps/runtime.py index b102b153..b234a479 100644 --- a/neps/runtime.py +++ b/neps/runtime.py @@ -519,8 +519,6 @@ def _launch_runtime( # noqa: PLR0913 max_evaluations=max_evaluations_total, used_evaluations=0, ) - if max_cost_total is not None - else None ), shared_state={}, # TODO: Unused for the time being... 
), diff --git a/neps/sampling/distributions.py b/neps/sampling/distributions.py index fb552949..6b557e5a 100644 --- a/neps/sampling/distributions.py +++ b/neps/sampling/distributions.py @@ -9,18 +9,22 @@ from typing_extensions import override import torch -from torch.distributions import Distribution, constraints +from torch.distributions import Distribution, Uniform, constraints from torch.distributions.utils import broadcast_all +from neps.search_spaces.domain import Domain + if TYPE_CHECKING: from neps.search_spaces.architecture.cfg_variants.constrained_cfg import Constraint - from neps.search_spaces.domain import Domain -CONST_SQRT_2 = math.sqrt(2) -CONST_INV_SQRT_2PI = 1 / math.sqrt(2 * math.pi) -CONST_INV_SQRT_2 = 1 / math.sqrt(2) -CONST_LOG_INV_SQRT_2PI = math.log(CONST_INV_SQRT_2PI) -CONST_LOG_SQRT_2PI_E = 0.5 * math.log(2 * math.pi * math.e) +CONST_SQRT_2 = torch.tensor(math.sqrt(2), dtype=torch.float64) +CONST_INV_SQRT_2PI = torch.tensor(1 / math.sqrt(2 * math.pi), dtype=torch.float64) +CONST_INV_SQRT_2 = torch.tensor(1 / math.sqrt(2), dtype=torch.float64) +CONST_LOG_INV_SQRT_2PI = torch.tensor(math.log(CONST_INV_SQRT_2PI), dtype=torch.float64) +CONST_LOG_SQRT_2PI_E = torch.tensor( + 0.5 * math.log(2 * math.pi * math.e), + dtype=torch.float64, +) # from https://github.com/toshas/torch_truncnorm @@ -224,7 +228,53 @@ def log_prob(self, value): return super().log_prob(value) - self._log_scale +class UniformWithUpperBound(Uniform): + """Uniform distribution with upper bound inclusive. + + This is mostly a hack because torch's version of Uniform does not include + the upper bound which only causes a problem when considering the log_prob. + Otherwise the upper bound works with every other method. + """ + + # OPTIM: This could probably be optimized a lot but I'm not sure how it effects + # gradients. Could probably do a different path depending on if `value` requires + # gradients or not. + @override + def log_prob(self, value: torch.Tensor) -> torch.Tensor: + if self._validate_args: + self._validate_sample(value) + + lb = self.low.le(value).type_as(self.low) + ub = self.high.ge(value).type_as(self.low) # The main change, is `gt` in original + return torch.log(lb.mul(ub)) - torch.log(self.high - self.low) + + @dataclass class TorchDistributionWithDomain: distribution: Distribution domain: Domain + + +UNIT_UNIFORM_DIST = TorchDistributionWithDomain( + distribution=UniformWithUpperBound(0, 1), + domain=Domain.unit_float(), +) + +if __name__ == "__main__": + loc = 0.95 + for confidence in torch.linspace(0.0, 0.8, 8): + scale = 1 - confidence + dist = TruncatedNormal( + loc=loc, + scale=scale, + a=0.0, + b=1.0, + ) + xs = torch.linspace(0, 1, 100) + ys = dist.log_prob(xs) + import matplotlib.pyplot as plt + + plt.plot(xs, ys, label=f"confidence={confidence}") + plt.plot(loc, dist.log_prob(torch.tensor(loc)), "ro") + plt.legend() + plt.show() diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 03b64122..f2373a68 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -15,7 +15,11 @@ import torch -from neps.sampling.distributions import TorchDistributionWithDomain, TruncatedNormal +from neps.sampling.distributions import ( + UNIT_UNIFORM_DIST, + TorchDistributionWithDomain, + TruncatedNormal, +) from neps.sampling.samplers import Sampler, WeightedSampler from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain @@ -160,21 +164,11 @@ def make_centered( f" Got {confidence}." 
) - for name in domains: - if name not in centers: - raise ValueError( - f"Center for {name} is missing. " - f"Please provide a center for all domains." - ) - distributions: list[TorchDistributionWithDomain] = [] for name, domain in domains.items(): center_confidence = centers.get(name) if center_confidence is None: - dist = TorchDistributionWithDomain( - distribution=torch.distributions.Uniform(0.0, 1.0), - domain=UNIT_FLOAT_DOMAIN, - ) + distributions.append(UNIT_UNIFORM_DIST) continue center, confidence = center_confidence @@ -203,7 +197,9 @@ def make_centered( weights[center] = confidence dist = TorchDistributionWithDomain( - distribution=torch.distributions.Categorical(probs=weights), + distribution=torch.distributions.Categorical( + probs=weights, validate_args=False + ), domain=domain, ) distributions.append(dist) @@ -213,13 +209,17 @@ def make_centered( unit_center = domain.to_unit( torch.tensor(center, device=device, dtype=torch.float64) ) + scale = torch.tensor(1 - confidence, device=device, dtype=torch.float64) + a = torch.tensor(0.0, device=device, dtype=torch.float64) + b = torch.tensor(1.0, device=device, dtype=torch.float64) dist = TorchDistributionWithDomain( distribution=TruncatedNormal( loc=unit_center, - scale=(1 - confidence), - a=0.0, - b=1.0, + scale=scale, + a=a, + b=b, device=device, + validate_args=False, ), domain=UNIT_FLOAT_DOMAIN, ) @@ -257,11 +257,29 @@ class CenteredPrior(Prior): distributions: list[TorchDistributionWithDomain] """Distributions along with the corresponding domains they sample from.""" - _distribution_domains: list[Domain] = field(init=False, repr=False) + _distribution_domains: list[Domain] = field(init=False) + + # OPTIM: These are used for an optimization in `log_prob` + _meaningful_ixs: list[int] = field(init=False) + _meaningful_doms: list[Domain] = field(init=False) + _meaningful_dists: list[Distribution] = field(init=False) def __post_init__(self): self._distribution_domains = [dist.domain for dist in self.distributions] + rest: list[tuple[int, Domain, Distribution]] = [] + for i, dist in enumerate(self.distributions): + if dist != UNIT_UNIFORM_DIST: + rest.append((i, dist.domain, dist.distribution)) + + if len(rest) == 0: + self._meaningful_ixs = [] + self._meaningful_doms = [] + self._meaningful_dists = [] + return + + self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip(*rest) + @property @override def ncols(self) -> int: @@ -275,23 +293,31 @@ def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> torch.Tens if x.ndim == 1: x = x.unsqueeze(0) + # OPTIM: We can actually just skip elements that are distributed uniformly as + # **assuming** they are all correctly in bounds, their log_pdf will be 0 and + # contribute nothing. + # It also helps numeric stability to avoid useless computations. + if len(self._meaningful_ixs) == 0: + return torch.zeros(x.shape[:-1], dtype=torch.float64, device=x.device) + + frm = frm if isinstance(frm, Domain) else [frm[i] for i in self._meaningful_ixs] + # Cast all values from the value domains to the domain of the sampler. - sample_domain_tensor = Domain.translate( - x, + translated_x = Domain.translate( + x[..., self._meaningful_ixs], frm=frm, - to=self._distribution_domains, + to=self._meaningful_doms, ) # Calculate the log probabilities of the sample domain tensors under their # respective distributions. 
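# Quick numerical check of the shortcut described above (illustrative values): a
# unit-uniform log-pdf is 0 everywhere inside [0, 1], so dimensions backed by
# UNIT_UNIFORM_DIST add nothing to the summed log probability.
import torch
from torch.distributions import Uniform
x = torch.tensor([0.2, 0.7, 0.95])
assert torch.allclose(Uniform(0.0, 1.0).log_prob(x), torch.zeros(3))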
- itr = enumerate(self.distributions) + itr = iter(zip(self._meaningful_ixs, self._meaningful_dists)) first_i, first_dist = next(itr) + log_probs = first_dist.log_prob(translated_x[..., first_i]) - log_probs = first_dist.distribution.log_prob(sample_domain_tensor[..., first_i]) for i, dist in itr: - log_probs = log_probs + dist.distribution.log_prob( - sample_domain_tensor[..., i] - ) + log_probs = log_probs + dist.log_prob(translated_x[..., i]) + return log_probs @override @@ -330,15 +356,11 @@ class UniformPrior(Prior): ncols: int """The number of columns in the tensor to sample from.""" - _unit_uniform: Distribution = field(init=False, repr=False) - - def __post_init__(self): - self._unit_uniform = torch.distributions.Uniform(0.0, 1.0) - @override def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: - sample_domain_tensor = Domain.translate(x, frm=frm, to=UNIT_FLOAT_DOMAIN) - return torch.sum(self._unit_uniform.log_prob(sample_domain_tensor), dim=-1) + # NOTE: We just assume everything is in bounds... + shape = x.shape[:-1] + return torch.zeros(shape, dtype=torch.float64, device=x.device) @override def sample( diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 5d92fac3..7342a136 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -271,7 +271,7 @@ def cast(self, x: Tensor, frm: Domain) -> Tensor: if same_bounds and same_log_bounds and (self.bins is None or same_bins): if self.round: x = torch.round(x) - return x.type(self.dtype) + return x.type(self.dtype) if x.dtype != self.dtype else x # Shortcut 2. (From normalized) # The domain we are coming from is already normalized, we only need to lift diff --git a/neps/search_spaces/hyperparameters/categorical.py b/neps/search_spaces/hyperparameters/categorical.py index b6a1fe27..6756e828 100644 --- a/neps/search_spaces/hyperparameters/categorical.py +++ b/neps/search_spaces/hyperparameters/categorical.py @@ -17,6 +17,7 @@ import numpy.typing as npt from more_itertools import all_unique +from neps.search_spaces.domain import Domain from neps.search_spaces.parameter import MutatableParameter, ParameterWithPrior if TYPE_CHECKING: @@ -110,6 +111,7 @@ def __init__( self._default_index: int | None = ( self.choices.index(default) if default is not None else None ) + self.domain = Domain.indices(len(self.choices)) @override def clone(self) -> Self: diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index a89c9bcc..6345b1c5 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -7,12 +7,23 @@ import neps +from rich import print -def run_pipeline(float1, float2, float3, categorical, integer1, integer2): +PRINT = False + + +def run_pipeline(float1, float2, float3, integer1, integer2): + if PRINT: + print("float1:", float1) + print("float2:", float2) + print("float3:", float3) + # print("categorical:", categorical) + print("integer1:", integer1) + print("integer2:", integer2) loss = -float( np.sum( [ - (float1 * float2 / (float3 + 1)) * int(categorical), + (float1 * float2 / (float3 + 1)) * 1, # ,(int(categorical) + 1), integer1, math.log(integer2), ] @@ -23,12 +34,12 @@ def run_pipeline(float1, float2, float3, categorical, integer1, integer2): pipeline_space = dict( - float1=neps.FloatParameter(lower=0, upper=1), - float2=neps.FloatParameter(lower=0, upper=20), - float3=neps.FloatParameter(lower=0, upper=5), - categorical=neps.CategoricalParameter(choices=[0, 
1]), - integer1=neps.IntegerParameter(lower=0, upper=1), - integer2=neps.IntegerParameter(lower=1, upper=1000, log=True), + float1=neps.FloatParameter(lower=0, upper=1, default=0.95), + float2=neps.FloatParameter(lower=0, upper=20, default=19.5), + float3=neps.FloatParameter(lower=0, upper=5, default=0.5), + # categorical=neps.CategoricalParameter(choices=[0, 1]), + integer1=neps.IntegerParameter(lower=0, upper=1, default=1), + integer2=neps.IntegerParameter(lower=1, upper=1000, log=True, default=950), ) logging.basicConfig(level=logging.INFO) From b2d3e15af9829868741820a98d09f5114b6a4908 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 30 Aug 2024 15:59:46 +0200 Subject: [PATCH 30/63] fix: Remove stray prints --- .../bayesian_optimization/acquisition_functions/pibo.py | 3 --- neps_examples/basic_usage/hyperparameters.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py index 76499ba1..db3120e7 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py @@ -39,9 +39,6 @@ def apply_pibo_acquisition_weight( x_domain: Domain | list[Domain], prior_exponent: float, ): - import rich - - rich.print(prior_exponent) if acq._log: weighted_log_probs = prior.log_prob(X, frm=x_domain) * prior_exponent return acq_values + weighted_log_probs diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index 6345b1c5..a137fabf 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -30,7 +30,7 @@ def run_pipeline(float1, float2, float3, integer1, integer2): ) ) # Random noise # time.sleep(0.7) # For demonstration purposes - return loss + return {"loss": loss, "cost": math.log(integer2)} pipeline_space = dict( From f209f4773c315e99f8ce9cd7b2cc80824d4add79 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 30 Aug 2024 16:09:47 +0200 Subject: [PATCH 31/63] optim: Lengthscale has more wiggle-room in high d --- neps/optimizers/bayesian_optimization/models/gp.py | 4 ++-- neps/optimizers/bayesian_optimization/optimizer.py | 8 ++++---- neps_examples/basic_usage/hyperparameters.py | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 39e81cba..b3c50112 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -42,7 +42,7 @@ def default_likelihood_with_prior() -> gpytorch.likelihoods.GaussianLikelihood: # even a 0.01% noise, we need that all the way up to 1e-2. Hence # # If we had 10% noise and we allow the noise to easily optimize towards - # 1e-8, then the lengthscales are forced to beome very small, essentially + # 1e-8, then the lengthscales are forced to become very small, essentially # overfitting. If we have 0% noise and we don't allow it to easily get low # then we will drastically underfit. 
# A guiding principle here is that we should allow the noise to be just @@ -90,7 +90,7 @@ def default_lengthscale_prior( # of the dimension and number of samples lengthscale_prior = gpytorch.priors.LogNormalPrior( loc=math.sqrt(2.0) + math.log(N) / 2, - scale=math.sqrt(3.0), + scale=math.sqrt(3.0) * math.log(N), ) # NOTE: It's possible to just specify `GreaterThan`, however # digging through the code, if this ends up at botorch's optimize, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 7d57e936..eb916fb6 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -308,8 +308,8 @@ def ask( trial.report.loss if trial.report.loss is not None else torch.nan ) if self.use_cost: - cost = trial.report.cost - costs.append(cost if cost is not None else torch.nan) + cost_z_score = trial.report.cost + costs.append(cost_z_score if cost_z_score is not None else torch.nan) x = self.encoder.pack(x_configs, device=self.device) maybe_x_pending_tensor = None @@ -362,12 +362,12 @@ def ask( # of the configurations. if self.use_cost: cost = torch.tensor(costs, dtype=torch.float64, device=self.device) - cost = _missing_cost_strategy(cost) + cost_z_score = _missing_cost_strategy(cost) # TODO: We might want a different model for cost estimation... one reason # is that cost estimates are likely to be a lot noisier than the likelihood # we have by default. - cost_model, cost_likelihood = self._get_model(x, cost) + cost_model, cost_likelihood = self._get_model(x, cost_z_score) # Optimize the cost model fit_gpytorch_mll( diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index a137fabf..e28cf585 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -45,6 +45,7 @@ def run_pipeline(float1, float2, float3, integer1, integer2): logging.basicConfig(level=logging.INFO) neps.run( run_pipeline=run_pipeline, + searcher="pibo", pipeline_space=pipeline_space, root_directory="results/hyperparameters_example", post_run_summary=True, From c8151634e847da075c01c33c55f0ac3ab1362538 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Sun, 8 Sep 2024 18:43:25 +0200 Subject: [PATCH 32/63] feat: Cost cooling --- .../acquisition_functions/cost_cooling.py | 29 +++++----- .../weighted_acquisition.py | 1 + .../bayesian_optimization/models/gp.py | 54 ++++++++++++------- .../bayesian_optimization/optimizer.py | 51 ++++++++++-------- neps_examples/basic_usage/hyperparameters.py | 12 ++--- pyproject.toml | 2 +- 6 files changed, 87 insertions(+), 62 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py index 4741705f..a32baebe 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +import torch from botorch.acquisition.logei import partial from neps.optimizers.bayesian_optimization.acquisition_functions.weighted_acquisition import ( @@ -9,10 +10,8 @@ ) if TYPE_CHECKING: - import torch from botorch.acquisition import AcquisitionFunction - from botorch.models.gp_regression import Likelihood - from botorch.models.model import Model + from botorch.acquisition.analytic import GPyTorchModel from torch import Tensor @@ 
-20,23 +19,28 @@ def apply_cost_cooling( acq_values: Tensor, X: Tensor, acq: AcquisitionFunction, - cost_model: Model, - likelihood: Likelihood, + cost_model: GPyTorchModel, alpha: float, ) -> Tensor: - posterior = likelihood(cost_model(X)) - cost = posterior.mean + # NOTE: We expect **positive** costs from model + cost = cost_model.posterior(X).mean + cost = cost.squeeze(dim=-1) if cost_model.num_outputs == 1 else cost.sum(dim=-1) if acq._log: - # can derive from eq log(x) = log(acq / cost^alpha) - return acq_values - alpha * cost.log() - return acq_values / cost.pow(alpha) + # Take log of both sides, acq is already log scaled + # -- x = acq / cost^alpha + # -- log(x) = log(acq) - alpha * log(cost) + w = alpha * cost.log() + return acq_values - w + + # https://github.com/pytorch/botorch/discussions/2194 + w = cost.pow(alpha) + return torch.where(acq_values > 0, acq_values / w, acq_values * w) def cost_cooled_acq( acq_fn: AcquisitionFunction, - model: Model, - likelihood: Likelihood, + model: GPyTorchModel, used_budget_percentage: float, X_pending: torch.Tensor | None = None, ) -> WeightedAcquisition: @@ -46,7 +50,6 @@ def cost_cooled_acq( apply_weight=partial( apply_cost_cooling, cost_model=model, - likelihood=likelihood, alpha=1 - used_budget_percentage, ), X_pending=X_pending, diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py index eadf9207..fa7ca176 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -119,6 +119,7 @@ def __init__( self.set_X_pending(X_pending) self.apply_weight = apply_weight self.acq = acq + self._log = acq._log # Taken from PiBO implementation in botorch (PriorGuidedAcquisitionFunction). @concatenate_pending_points diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index b3c50112..7dd29dd7 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,3 +1,5 @@ +"""Gaussian Process models for Bayesian Optimization.""" + from __future__ import annotations import logging @@ -12,6 +14,7 @@ from botorch.models.gp_regression_mixed import ( CategoricalKernel, Likelihood, + OutcomeTransform, ) from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed @@ -33,7 +36,12 @@ T = TypeVar("T") -def default_likelihood_with_prior() -> gpytorch.likelihoods.GaussianLikelihood: +def likelihood_with_prior_on_log_scale( + mean: float = 1e-2, + std: float = math.sqrt(3), + bounds: tuple[float, float] = (1e-6, 1), +) -> gpytorch.likelihoods.GaussianLikelihood: + """Default Gaussian likelihood with priors for the noise.""" # The effect of the likelihood of noise is pretty crucial w.r.t. 
# whether we are going to overfit every point by overfitting with # the lengthscale, or whether we smooth through and assume variation @@ -54,25 +62,21 @@ def default_likelihood_with_prior() -> gpytorch.likelihoods.GaussianLikelihood: # TOOD: We may want to move the likelihood inside the GP and decay the # amount the GP can attribute to noise (reduce std and mean) relative # to samples seen, effectively reducing the smoothness of the GP overtime - noise_mean = 1e-2 - noise_std = math.sqrt(3) - _noise_prior = gpytorch.priors.LogNormalPrior( - math.log(noise_mean) + noise_std**2, - noise_std, - ) + _noise_prior = gpytorch.priors.LogNormalPrior(math.log(mean) + std**2, std) return gpytorch.likelihoods.GaussianLikelihood( noise_prior=_noise_prior, # Going below 1e-6 could introduuce a lot of numerical instability in the # kernels, even if it's a noiseless function noise_constraint=gpytorch.constraints.Interval( - lower_bound=1e-6, - upper_bound=1, - initial_value=noise_mean, + lower_bound=bounds[0], + upper_bound=bounds[1], + initial_value=mean, ), ) def default_signal_variance_prior() -> gpytorch.priors.NormalPrior: + """Default prior for the signal variance.""" # The outputscale prior is a bit more tricky. Essentially # it describes how much we expect the function to move # around the mean (0 as we normalize the `ys`) @@ -85,6 +89,7 @@ def default_signal_variance_prior() -> gpytorch.priors.NormalPrior: def default_lengthscale_prior( N: int, ) -> tuple[gpytorch.priors.LogNormalPrior, gpytorch.constraints.Interval]: + """Default prior for the lengthscale.""" # Based on `Vanilla GP work great in High Dimensions` by Carl Hvafner # TODO: I'm not convinced entirely that the `std` is independant # of the dimension and number of samples @@ -107,6 +112,7 @@ def default_lengthscale_prior( def default_mean() -> gpytorch.means.ConstantMean: + """Default mean for the GP.""" return gpytorch.means.ConstantMean( constant_prior=gpytorch.priors.NormalPrior(0, 0.2), constant_constraint=gpytorch.constraints.Interval( @@ -121,6 +127,7 @@ def default_matern_kernel( N: int, active_dims: tuple[int, ...] | None = None, ) -> ScaleKernel: + """Default Matern kernel for the GP.""" lengthscale_prior, lengthscale_constraint = default_lengthscale_prior(N) return ScaleKernel( @@ -138,6 +145,7 @@ def default_categorical_kernel( N: int, active_dims: tuple[int, ...] | None = None, ) -> ScaleKernel: + """Default Categorical kernel for the GP.""" # Following BoTorches implementation of the MixedSingleTaskGP return ScaleKernel( CategoricalKernel( @@ -151,9 +159,16 @@ def default_categorical_kernel( def default_single_obj_gp( x: TensorPack, y: torch.Tensor, + *, + outcome_transform: OutcomeTransform | None = None, ) -> tuple[SingleTaskGP, Likelihood]: + """Default GP for single objective optimization.""" if y.ndim == 1: y = y.unsqueeze(-1) + + if outcome_transform is None: + outcome_transform = Standardize(m=1) + encoder = x.encoder numerics: list[int] = [] categoricals: list[int] = [] @@ -163,7 +178,9 @@ def default_single_obj_gp( else: numerics.append(encoder.index_of[hp_name]) - likelihood = default_likelihood_with_prior() + # TODO: If we have a low cardinality integer, we should consider + # just treating it as a categorical... 
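# Usage note (illustrative values): the call below relies on the factory's defaults
# (~1% expected noise, constrained to [1e-6, 1]); a known-noisier objective could
# instead widen the prior, e.g.
#   likelihood = likelihood_with_prior_on_log_scale(mean=1e-1, bounds=(1e-6, 1.0))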
+ likelihood = likelihood_with_prior_on_log_scale() # Purely vectorial if len(categoricals) == 0: @@ -174,7 +191,7 @@ def default_single_obj_gp( likelihood=likelihood, # Only matern kernel covar_module=default_matern_kernel(len(numerics)), - outcome_transform=Standardize(m=1), + outcome_transform=outcome_transform, ) return gp, likelihood @@ -187,7 +204,7 @@ def default_single_obj_gp( likelihood=likelihood, # Only categorical kernel covar_module=default_categorical_kernel(len(categoricals)), - outcome_transform=Standardize(m=1), + outcome_transform=outcome_transform, ) return gp, likelihood @@ -215,7 +232,7 @@ def default_single_obj_gp( mean_module=default_mean(), likelihood=likelihood, covar_module=kernel, - outcome_transform=Standardize(m=1), + outcome_transform=outcome_transform, ) return gp, likelihood @@ -226,15 +243,16 @@ def optimize_acq( *, n_candidates_required: int = 1, num_restarts: int = 20, - n_intial_start_points: int = 512, + n_intial_start_points: int = 256, acq_options: Mapping[str, Any] | None = None, maximum_allowed_categorical_combinations: int = 30, ) -> tuple[torch.Tensor, torch.Tensor]: + """Optimize the acquisition function.""" acq_options = acq_options or {} lower = [domain.lower for domain in encoder.domains] upper = [domain.upper for domain in encoder.domains] - bounds = torch.tensor([lower, upper], dtype=torch.float) + bounds = torch.tensor([lower, upper], dtype=torch.float64) cat_transformers = { name: t @@ -281,8 +299,8 @@ def optimize_acq( else: fixed_cats = [dict(zip(cats.keys(), combo)) for combo in product(*cats.values())] - # TODO: we should deterministicall shuffle the fixed_categoricals as the - # underlying function does not. + # TODO: we should deterministically shuffle the fixed_categoricals + # as the underlying function does not. 
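# Minimal sketch (hypothetical column indices and cardinalities) of how the
# `fixed_cats` list built above enumerates every categorical combination that
# `optimize_acqf_mixed` then receives as fixed features.
from itertools import product
cats = {2: [0, 1], 3: [0, 1, 2]}  # encoded column -> allowed category indices
fixed = [dict(zip(cats, combo)) for combo in product(*cats.values())]
# fixed == [{2: 0, 3: 0}, {2: 0, 3: 1}, {2: 0, 3: 2},
#           {2: 1, 3: 0}, {2: 1, 3: 1}, {2: 1, 3: 2}]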
return optimize_acqf_mixed( acq_function=acq_fn, bounds=bounds, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index eb916fb6..d2b58c19 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,11 +1,15 @@ from __future__ import annotations import math -from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal, Mapping import torch -from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement +from botorch.acquisition import ( + LinearMCObjective, +) +from botorch.acquisition.logei import qLogNoisyExpectedImprovement from botorch.fit import fit_gpytorch_mll +from botorch.models.transforms.outcome import ChainedOutcomeTransform, Log, Standardize from gpytorch import ExactMarginalLogLikelihood from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig @@ -20,13 +24,10 @@ optimize_acq, ) from neps.sampling import Prior, Sampler -from neps.search_spaces.encoding import TensorEncoder, TensorPack +from neps.search_spaces.encoding import TensorEncoder from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: - from botorch.models.gp_regression_mixed import Likelihood - from botorch.models.model import Model - from neps.search_spaces import ( SearchSpace, ) @@ -174,9 +175,6 @@ def __init__( # noqa: D417 pipeline_space: SearchSpace, *, initial_design_size: int | None = None, - surrogate_model: ( - Literal["gp"] | Callable[[TensorPack, torch.Tensor], tuple[Model, Likelihood]] - ) = "gp", use_priors: bool = False, use_cost: bool = False, sample_default_first: bool = False, @@ -192,8 +190,6 @@ def __init__( # noqa: D417 initial_design_size: Number of samples used before using the surrogate model. If None, it will take `int(log(N) ** 2)` samples where `N` is the number of parameters in the search space. - surrogate_model: Surrogate model, either a known model str or a callable - that takes in the training data and returns a model fitted to (X, y). use_priors: Whether to use priors set on the hyperparameters during search. use_cost: Whether to consider reported "cost" from configurations in decision making. If True, the optimizer will weigh potential candidates by how much @@ -241,10 +237,6 @@ def __init__( # noqa: D417 self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - self._get_model = ( - default_single_obj_gp if surrogate_model == "gp" else surrogate_model - ) - self.initial_design_: list[dict[str, Any]] | None = None def ask( @@ -321,14 +313,21 @@ def ask( y = _missing_y_strategy(y) # Now fit our model - y_model, y_likelihood = self._get_model(x, y) + y_model, y_likelihood = default_single_obj_gp(x, y) fit_gpytorch_mll( ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) ) - acq = qLogExpectedImprovement( + # NOTE: We use: + # * q - allows accounting for pending points, normally used to get a batch + # of points. + # * log - More numerically stable + # * Noisy - In Deep-Learning, we shouldn't take f.min() incase it was a noise + # spike. This accounts for noise in objective. + # * ExpectedImprovement - Cause ya know, the default. 
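# Sketch of the transformation mentioned just below (assumed form, not verified
# against the full diff): BoTorch acquisition functions maximise, so minimising the
# reported loss is done by negating it through the MC objective, e.g.
#   objective=LinearMCObjective(weights=torch.tensor([-1.0]))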
+ acq = qLogNoisyExpectedImprovement( y_model, - best_f=y.min(), + X_baseline=x.tensor, X_pending=maybe_x_pending_tensor, # Unfortunatly, there's no option to indicate that we minimize # the AcqFunction so we need to do some kind of transformation. @@ -364,10 +363,16 @@ def ask( cost = torch.tensor(costs, dtype=torch.float64, device=self.device) cost_z_score = _missing_cost_strategy(cost) - # TODO: We might want a different model for cost estimation... one reason - # is that cost estimates are likely to be a lot noisier than the likelihood - # we have by default. - cost_model, cost_likelihood = self._get_model(x, cost_z_score) + cost_model, cost_likelihood = default_single_obj_gp( + x, + cost_z_score, + outcome_transform=ChainedOutcomeTransform( + # TODO: Maybe some way for a user to specify their cost + # is on a log scale? + log=Log(), + standardize=Standardize(m=1), + ), + ) # Optimize the cost model fit_gpytorch_mll( @@ -376,8 +381,8 @@ def ask( acq = cost_cooled_acq( acq_fn=acq, model=cost_model, - likelihood=cost_likelihood, used_budget_percentage=_cost_used_budget_percentage(budget_info), + X_pending=maybe_x_pending_tensor, ) # Finally, optimize the acquisition function to get a configuration diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index e28cf585..eb7e095b 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -7,8 +7,6 @@ import neps -from rich import print - PRINT = False @@ -21,16 +19,16 @@ def run_pipeline(float1, float2, float3, integer1, integer2): print("integer1:", integer1) print("integer2:", integer2) loss = -float( - np.sum( + integer2 + * np.sum( [ - (float1 * float2 / (float3 + 1)) * 1, # ,(int(categorical) + 1), + (float1 * float2 / (float3 + 1)), # * (int(categorical) + 1), integer1, - math.log(integer2), ] ) ) # Random noise # time.sleep(0.7) # For demonstration purposes - return {"loss": loss, "cost": math.log(integer2)} + return {"loss": loss, "cost": float(integer2)} pipeline_space = dict( @@ -45,7 +43,7 @@ def run_pipeline(float1, float2, float3, integer1, integer2): logging.basicConfig(level=logging.INFO) neps.run( run_pipeline=run_pipeline, - searcher="pibo", + searcher="bayesian_optimization", pipeline_space=pipeline_space, root_directory="results/hyperparameters_example", post_run_summary=True, diff --git a/pyproject.toml b/pyproject.toml index 27e49fa2..cd808439 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ grakel = "^0.1" numpy = "^1" pandas = "^2" networkx = "^2.6.3" -nltk = "^3.6.4" +nltk = "^3" scipy = "^1" torch = ">1.7.0,!=2.0.1, !=2.1.0" matplotlib = "^3" From da47d701bf0f7907d447cc389ae3d0830ce32337 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Sun, 8 Sep 2024 18:55:53 +0200 Subject: [PATCH 33/63] optim: Scale initial sample points for acq. 
opt based on ndims --- neps/optimizers/bayesian_optimization/models/gp.py | 13 +++++++++++-- neps/optimizers/bayesian_optimization/optimizer.py | 8 +++++++- neps_examples/basic_usage/hyperparameters.py | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 7dd29dd7..8436310c 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -243,7 +243,7 @@ def optimize_acq( *, n_candidates_required: int = 1, num_restarts: int = 20, - n_intial_start_points: int = 256, + n_intial_start_points: int | None = None, acq_options: Mapping[str, Any] | None = None, maximum_allowed_categorical_combinations: int = 30, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -260,6 +260,15 @@ def optimize_acq( if isinstance(t, CategoricalToIntegerTransformer) } if not any(cat_transformers): + # Small heuristic to increase the number of candidates as our dimensionality + # increases... we apply a cap. + if n_intial_start_points is None: + # TODO: Need to investigate how num_restarts is used in botorch to inform + # this proxy. + + # Cap out at 4096 when len(bounds) >= 8 + n_intial_start_points = min(64 * len(bounds) ** 2, 4096) + return optimize_acqf( acq_function=acq_fn, bounds=bounds, @@ -304,7 +313,7 @@ def optimize_acq( return optimize_acqf_mixed( acq_function=acq_fn, bounds=bounds, - num_restarts=num_restarts, + num_restarts=min(num_restarts // n_combos, 2), raw_samples=n_intial_start_points, q=n_candidates_required, fixed_features_list=fixed_cats, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index d2b58c19..5a6593ba 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -313,7 +313,13 @@ def ask( y = _missing_y_strategy(y) # Now fit our model - y_model, y_likelihood = default_single_obj_gp(x, y) + y_model, y_likelihood = default_single_obj_gp( + x, + y, + # TODO: We should consider applying some heurisitc to see if this should + # also include a log transform, similar as we do to cost if using `use_cost`. 
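The raw-sample heuristic introduced in the gp.py hunk above is a capped quadratic in the dimensionality. A small sketch, treating len(bounds) as the number of tuned dimensions d as the surrounding comment suggests; the helper name is made up for the illustration only.

    def n_initial_start_points(d: int, cap: int = 4096) -> int:
        # 64 * d**2 raw samples for seeding the acquisition optimisation, capped at 4096.
        return min(64 * d**2, cap)

    print([n_initial_start_points(d) for d in (1, 2, 4, 8, 16)])
    # [64, 256, 1024, 4096, 4096] -> the cap takes over from d = 8 onwards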
+ outcome_transform=Standardize(m=1), + ) fit_gpytorch_mll( ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) ) diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index eb7e095b..6ea897f8 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -48,4 +48,5 @@ def run_pipeline(float1, float2, float3, integer1, integer2): root_directory="results/hyperparameters_example", post_run_summary=True, max_evaluations_total=50, + use_prior=True, ) From 5a710303711e90c52deaadf6e3c78b889bbe4da1 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 12:57:43 +0200 Subject: [PATCH 34/63] fix: Remove old model --- neps/optimizers/bayesian_optimization/models/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 789b2b05..35ae2120 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,9 +1,5 @@ -from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP -from neps.utils.common import MissingDependencyError - from .ftpfn import FTPFNSurrogate SurrogateModelMapping = { - "gp": ComprehensiveGP, "ftpfn": FTPFNSurrogate, } From afc904d89482afa21de8084b1cd19851fc10c3e4 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 12:59:20 +0200 Subject: [PATCH 35/63] ci: Update ruff version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 775823dc..9b670189 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ torchvision = ">=0.8.0" ifbo = ">=0.3.10" [tool.poetry.group.dev.dependencies] -ruff = "^0.4" +ruff = "*" pre-commit = "^3" mypy = "^1" pytest = "^7" From f0ec81ed21bf2543ff8ef296c18fde6c8ad8679a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 13:00:46 +0200 Subject: [PATCH 36/63] fix: Update pre-commit --- .pre-commit-config.yaml | 6 +- neps/optimizers/__init__.py | 10 +- neps/optimizers/base_optimizer.py | 31 +-- .../acquisition_functions/__init__.py | 5 +- .../acquisition_functions/_ehvi.py | 2 + .../acquisition_functions/base_acquisition.py | 2 + .../acquisition_functions/ei.py | 3 +- .../acquisition_functions/mf_pi.py | 105 +++++----- .../acquisition_functions/prior_weighted.py | 3 +- .../acquisition_functions/ucb.py | 20 +- .../weighted_acquisition.py | 3 +- .../acquisition_samplers/base_acq_sampler.py | 4 +- .../acquisition_samplers/evolution_sampler.py | 21 +- .../freeze_thaw_sampler.py | 16 +- .../acquisition_samplers/mutation_sampler.py | 11 +- .../acquisition_samplers/random_sampler.py | 8 +- .../bayesian_optimization/kernels/__init__.py | 4 +- .../kernels/get_kernels.py | 14 +- .../kernels/grakel_replace/edge_histogram.py | 154 +++++++------- .../kernels/grakel_replace/utils.py | 18 +- .../grakel_replace/vertex_histogram.py | 198 +++++++++--------- .../grakel_replace/weisfeiler_lehman.py | 193 ++++++++--------- .../bayesian_optimization/kernels/utils.py | 18 +- .../bayesian_optimization/models/ftpfn.py | 10 +- .../bayesian_optimization/models/gp.py | 8 +- .../bayesian_optimization/optimizer.py | 3 +- neps/optimizers/grid_search/optimizer.py | 12 +- neps/optimizers/info.py | 20 +- neps/optimizers/multi_fidelity/hyperband.py | 163 +++++++------- neps/optimizers/multi_fidelity/ifbo.py | 69 +++--- 
neps/optimizers/multi_fidelity/mf_bo.py | 62 +++--- .../multi_fidelity/promotion_policy.py | 16 +- .../multi_fidelity/sampling_policy.py | 74 ++++--- .../multi_fidelity/successive_halving.py | 63 +++--- neps/optimizers/multi_fidelity/utils.py | 77 +++---- .../multi_fidelity_prior/async_priorband.py | 122 +++++------ .../multi_fidelity_prior/priorband.py | 85 ++++---- neps/optimizers/multi_fidelity_prior/utils.py | 23 +- .../prototype_optimizer.py | 14 +- neps/optimizers/random_search/optimizer.py | 12 +- .../regularized_evolution/optimizer.py | 36 ++-- neps/optimizers/utils.py | 22 +- neps/plot/tensorboard_eval.py | 5 +- neps/sampling/distributions.py | 3 +- neps/sampling/priors.py | 11 +- neps/sampling/samplers.py | 3 +- neps/search_spaces/domain.py | 5 +- neps/search_spaces/encoding.py | 10 +- neps/state/__init__.py | 2 +- 49 files changed, 907 insertions(+), 872 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92ff2356..aa81c5bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: files: '^src/.*\.py$' - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.1 + rev: v1.11.2 hooks: - id: mypy files: | @@ -42,7 +42,7 @@ repos: - "--show-traceback" - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.29.1 + rev: 0.29.2 hooks: - id: check-github-workflows files: '^github/workflows/.*\.ya?ml$' @@ -51,7 +51,7 @@ repos: files: '^\.github/dependabot\.ya?ml$' - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.5.5 + rev: v0.6.5 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix, --no-cache] diff --git a/neps/optimizers/__init__.py b/neps/optimizers/__init__.py index a6c0d5f9..c96022f6 100644 --- a/neps/optimizers/__init__.py +++ b/neps/optimizers/__init__.py @@ -1,18 +1,17 @@ - - +from collections.abc import Callable, Mapping from functools import partial -from typing import TYPE_CHECKING, Callable, Mapping +from typing import TYPE_CHECKING from .base_optimizer import BaseOptimizer from .bayesian_optimization.optimizer import BayesianOptimization from .grid_search.optimizer import GridSearch -from .multi_fidelity.ifbo import IFBO from .multi_fidelity.hyperband import ( MOBSTER, AsynchronousHyperband, Hyperband, HyperbandCustomDefault, ) +from .multi_fidelity.ifbo import IFBO from .multi_fidelity.successive_halving import ( AsynchronousSuccessiveHalving, AsynchronousSuccessiveHalvingWithPriors, @@ -24,9 +23,6 @@ from .random_search.optimizer import RandomSearch from .regularized_evolution.optimizer import RegularizedEvolution -if TYPE_CHECKING: - from .base_optimizer import BaseOptimizer - # TODO: Rename Searcher to Optimizer... 
SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = { "bayesian_optimization": partial(BayesianOptimization, use_priors=False), diff --git a/neps/optimizers/base_optimizer.py b/neps/optimizers/base_optimizer.py index c5b5f83f..a80f9f75 100644 --- a/neps/optimizers/base_optimizer.py +++ b/neps/optimizers/base_optimizer.py @@ -1,15 +1,18 @@ - +from __future__ import annotations import logging from abc import abstractmethod -from typing import Any, Mapping - +from collections.abc import Mapping from dataclasses import asdict, dataclass -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig, ERROR, ResultDict -from neps.search_spaces.search_space import SearchSpace -from neps.utils.data_loading import _get_cost, _get_learning_curve, _get_loss +from typing import TYPE_CHECKING, Any + from neps.state.trial import Trial +from neps.utils.data_loading import _get_cost, _get_learning_curve, _get_loss +from neps.utils.types import ERROR, ConfigResult, RawConfig, ResultDict + +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo @dataclass @@ -58,7 +61,7 @@ def load_optimization_state( @abstractmethod def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: - """Sample a new configuration + """Sample a new configuration. Returns: config: serializable object representing the configuration @@ -74,7 +77,7 @@ def ask( budget_info: BudgetInfo | None, optimizer_state: dict[str, Any], ) -> tuple[SampledConfig, dict[str, Any]]: - """Sample a new configuration + """Sample a new configuration. !!! note @@ -145,8 +148,8 @@ def get_loss( self, result: ERROR | ResultDict | float | Trial.Report ) -> float | ERROR: """Calls result.utils.get_loss() and passes the error handling through. - Please use self.get_loss() instead of get_loss() in all optimizer classes.""" - + Please use self.get_loss() instead of get_loss() in all optimizer classes. + """ # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call. We can also then just use `None` instead of the string `"error"` @@ -163,7 +166,8 @@ def get_cost( self, result: ERROR | ResultDict | float | Trial.Report ) -> float | ERROR: """Calls result.utils.get_cost() and passes the error handling through. - Please use self.get_cost() instead of get_cost() in all optimizer classes.""" + Please use self.get_cost() instead of get_cost() in all optimizer classes. + """ # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call @@ -180,7 +184,8 @@ def get_learning_curve( self, result: str | dict | float | Trial.Report ) -> list[float] | Any: """Calls result.utils.get_loss() and passes the error handling through. - Please use self.get_loss() instead of get_loss() in all optimizer classes.""" + Please use self.get_loss() instead of get_loss() in all optimizer classes. 
+ """ # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py index 542664c4..a125997d 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py @@ -1,13 +1,10 @@ +from collections.abc import Callable from functools import partial -from typing import Callable from neps.optimizers.bayesian_optimization.acquisition_functions.ei import ( ComprehensiveExpectedImprovement, ) from neps.optimizers.bayesian_optimization.acquisition_functions.mf_pi import MFPI_Random -from neps.optimizers.bayesian_optimization.acquisition_functions.ucb import ( - UpperConfidenceBound, -) from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py index 8722c545..236a17e5 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py @@ -1,4 +1,6 @@ # from abc import ABC, abstractmethod +from __future__ import annotations + from itertools import product import torch diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py index 7249c0fd..b2f8783a 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from abc import ABC, abstractmethod diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py index 1a4e24d0..aadb76a0 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence +from collections.abc import Sequence +from typing import TYPE_CHECKING import torch from torch.distributions import Normal diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py index 71955820..ba2e886b 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py @@ -1,26 +1,33 @@ # type: ignore -from typing import Any, Iterable, Tuple, Union +from __future__ import annotations + +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any import numpy as np -import pandas as pd import torch -from copy import deepcopy - -from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids -from neps.search_spaces.search_space import SearchSpace +from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, +) from neps.optimizers.multi_fidelity.utils import ( - get_freeze_thaw_normalized_step, get_tokenized_data, 
MFObservedData + MFObservedData, + get_freeze_thaw_normalized_step, + get_tokenized_data, ) -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import BaseAcquisition +from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids +if TYPE_CHECKING: + import pandas as pd + + from neps.search_spaces.search_space import SearchSpace -class MFPI(BaseAcquisition): +class MFPI(BaseAcquisition): def __init__( self, pipeline_space: SearchSpace, - surrogate_model_name: str = None, + surrogate_model_name: str | None = None, ): super().__init__() self.pipeline_space = pipeline_space @@ -34,7 +41,7 @@ def set_state( pipeline_space: SearchSpace, surrogate_model: Any, observations: MFObservedData, - b_step: Union[int, float], + b_step: int | float, **kwargs, ): # overload to select incumbent differently through observations @@ -42,9 +49,8 @@ def set_state( self.surrogate_model = surrogate_model self.observations = observations self.b_step = b_step - return - def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: + def preprocess(self, x: pd.Series) -> tuple[pd.Series, torch.Tensor]: """Prepares the configurations for appropriate EI calculation. Takes a set of points and computes the budget and incumbent for each point, as @@ -52,7 +58,7 @@ def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: """ raise NotImplementedError - def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Series]: + def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Series]: # deepcopy # _x = pd.Series([deepcopy(x.loc[idx]) for idx in x.index.values], index=x.index) if self.surrogate_model_name == "ftpfn": @@ -64,18 +70,20 @@ def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Ser idx_mask = np.where(_idx > max(self.observations.seen_config_ids))[0] _idx[idx_mask] = 0 # normalizing steps - _steps = torch.Tensor([ - get_freeze_thaw_normalized_step( - _conf.fidelity.value, - self.pipeline_space.fidelity.lower, - self.pipeline_space.fidelity.upper, - self.b_step - ) - for _conf in _x - ]) - _x_tok = torch.hstack(( - (_idx).reshape(-1, 1), _steps.reshape(-1, 1), torch.Tensor(_x_tok) - )) + _steps = torch.Tensor( + [ + get_freeze_thaw_normalized_step( + _conf.fidelity.value, + self.pipeline_space.fidelity.lower, + self.pipeline_space.fidelity.upper, + self.b_step, + ) + for _conf in _x + ] + ) + _x_tok = torch.hstack( + ((_idx).reshape(-1, 1), _steps.reshape(-1, 1), torch.Tensor(_x_tok)) + ) pi = self.eval_pfn_pi(_x_tok, inc_list) else: raise ValueError( @@ -85,12 +93,11 @@ def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Ser pi = pi.cpu() if len(_x) > 1 and asscalar: return pi.detach().numpy(), _x - else: - return pi.detach().numpy().item(), _x + return pi.detach().numpy().item(), _x def eval_pfn_pi( self, x: Iterable, inc_list: Iterable - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: """PFN-PI modified to preprocess samples and accept list of incumbents.""" pi = self.surrogate_model.get_pi(x.to(self.surrogate_model.device), inc_list) if len(pi.shape) == 2: @@ -99,7 +106,6 @@ def eval_pfn_pi( class MFPI_Random(MFPI): - BUDGET = 1000 def __init__( @@ -107,7 +113,7 @@ def __init__( pipeline_space: SearchSpace, horizon: str = "random", threshold: str = "random", - surrogate_model_name: str = None, + surrogate_model_name: str | None = None, ): super().__init__(pipeline_space, surrogate_model_name) self.horizon = horizon @@ 
-118,35 +124,34 @@ def set_state( pipeline_space: SearchSpace, surrogate_model: Any, observations: MFObservedData, - b_step: Union[int, float], + b_step: int | float, **kwargs, ): # set RNG self.rng = np.random.RandomState(seed=42) - for i in range(len(observations.completed_runs)): - self.rng.uniform(-4,-1) - self.rng.randint(1,51) + for _i in range(len(observations.completed_runs)): + self.rng.uniform(-4, -1) + self.rng.randint(1, 51) return super().set_state(pipeline_space, surrogate_model, observations, b_step) def sample_horizon(self, steps_passed): - if self.horizon == 'random': + if self.horizon == "random": shortest = self.pipeline_space.fidelity.lower longest = min(self.pipeline_space.fidelity.upper, self.BUDGET - steps_passed) - return self.rng.randint(shortest, longest+1) - elif self.horizon == 'max': + return self.rng.randint(shortest, longest + 1) + if self.horizon == "max": return min(self.pipeline_space.fidelity.upper, self.BUDGET - steps_passed) - else: - return int(self.horizon) + return int(self.horizon) def sample_performance_threshold(self, f_inc): - if self.threshold == 'random': - lu = 10**self.rng.uniform(-4,-1) # % of gap closed + if self.threshold == "random": + lu = 10 ** self.rng.uniform(-4, -1) # % of gap closed else: lu = float(self.threshold) return f_inc * (1 - lu) - def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: + def preprocess(self, x: pd.Series) -> tuple[pd.Series, torch.Tensor]: """Prepares the configurations for appropriate EI calculation. Takes a set of points and computes the budget and incumbent for each point, as @@ -180,11 +185,13 @@ def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: indices_to_drop.append(i) else: # a candidate partial training run to continue - config.update_hp_values({ - config.fidelity_name: min( - config.fidelity.value + horizon, config.fidelity.upper - ) # if horizon exceeds max, query at max - }) + config.update_hp_values( + { + config.fidelity_name: min( + config.fidelity.value + horizon, config.fidelity.upper + ) # if horizon exceeds max, query at max + } + ) inc_list.append(inc_value) else: # a candidate new training run that we would need to start @@ -192,7 +199,7 @@ def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: inc_list.append(inc_value) # Drop unused configs - x.drop(labels=indices_to_drop, inplace=True) + x = x.drop(labels=indices_to_drop) assert len(inc_list) == len(x) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py index 8a735d58..2728d67a 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Iterable +from collections.abc import Iterable +from typing import TYPE_CHECKING from typing_extensions import override import numpy as np diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py index 11b592eb..3733b693 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py @@ -1,4 +1,6 @@ -from typing import Iterable, Union +from __future__ import annotations + +from collections.abc import Iterable import numpy as np import torch @@ -7,7 +9,7 @@ class 
UpperConfidenceBound(BaseAcquisition): - def __init__(self, beta: float=1.0, maximize: bool=False): + def __init__(self, beta: float = 1.0, maximize: bool = False): """Upper Confidence Bound (UCB) acquisition function. Args: @@ -18,7 +20,7 @@ def __init__(self, beta: float=1.0, maximize: bool=False): super().__init__() self.beta = beta # can be updated as part of the state for dynamism or a schedule self.maximize = maximize - + # to be initialized as part of the state self.surrogate_model = None @@ -26,14 +28,14 @@ def set_state(self, surrogate_model, **kwargs): super().set_state(surrogate_model) self.surrogate_model = surrogate_model if "beta" in kwargs: - if not isinstance(kwargs["beta"], (list, np.array)): + if not isinstance(kwargs["beta"], list | np.array): self.beta = kwargs["beta"] else: self.logger.warning("Beta is a list, not updating beta value!") - + def eval( self, x: Iterable, asscalar: bool = False - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: try: mu, cov = self.surrogate_model.predict(x) std = torch.sqrt(torch.diag(cov)) @@ -41,7 +43,5 @@ def eval( raise e sign = 1 if self.maximize else -1 # LCB is performed if minimize=True ucb_scores = mu + sign * np.sqrt(self.beta) * std - # if LCB, minimize acquisition, or maximize -acquisition - ucb_scores = ucb_scores.detach().numpy() * sign - - return ucb_scores + # if LCB, minimize acquisition, or maximize -acquisition + return ucb_scores.detach().numpy() * sign diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py index fa7ca176..f589298b 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -74,7 +74,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Callable, TypeVar +from collections.abc import Callable +from typing import TYPE_CHECKING, TypeVar from botorch.acquisition import SampleReducingMCAcquisitionFunction from botorch.acquisition.analytic import AcquisitionFunction, t_batch_mode_transform diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py index adf47b82..c0049a3f 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py @@ -1,11 +1,13 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Sequence, Callable +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING if TYPE_CHECKING: import numpy as np import torch + from neps.search_spaces.search_space import SearchSpace diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/evolution_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/evolution_sampler.py index 6a76dcfc..4aec84eb 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/evolution_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/evolution_sampler.py @@ -1,13 +1,17 @@ +from __future__ import annotations + import random from heapq import nlargest -from typing import List, Tuple +from typing import TYPE_CHECKING import numpy as np -from ....search_spaces.search_space import SearchSpace from 
.base_acq_sampler import AcquisitionSampler from .random_sampler import RandomSampler +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + class EvolutionSampler(AcquisitionSampler): def __init__( @@ -113,7 +117,7 @@ def evolution( acquisition_function, previous_samples: list, population_size: int, - batch_size: int = None, + batch_size: int | None = None, ): def inner_loop(population, fitness, X_max, acq_max): try: @@ -142,7 +146,7 @@ def inner_loop(population, fitness, X_max, acq_max): if not self.allow_isomorphism and self.check_isomorphism_history else [] ) - population: List[SearchSpace] = [] + population: list[SearchSpace] = [] remaining_patience = self.patience while ( population_size - len(previous_samples) > len(population) @@ -186,7 +190,10 @@ def inner_loop(population, fitness, X_max, acq_max): population, fitness, X_max, acq_max ) if all( - all(np.isclose(x, l) for l in list(zip(*iterations_best[-5:]))[j]) + all( + np.isclose(x, l) + for l in list(zip(*iterations_best[-5:], strict=False))[j] + ) for j, x in enumerate(acq_max) ): break @@ -195,8 +202,8 @@ def inner_loop(population, fitness, X_max, acq_max): return X_max, population, acq_max - def sample(self, acquisition_function) -> Tuple[list, list, list]: - population: List[SearchSpace] = [] + def sample(self, acquisition_function) -> tuple[list, list, list]: + population: list[SearchSpace] = [] if self.initial_history_last > 0 and len(self.x) >= self.initial_history_last: population = self.x[-self.initial_history_last :] if self.initial_history_best > 0 and len(self.x) >= self.initial_history_best: diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py index 93c7370f..3021bfe0 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py @@ -1,18 +1,20 @@ from __future__ import annotations -from typing import Callable import warnings +from collections.abc import Callable +from copy import deepcopy +from typing import TYPE_CHECKING import numpy as np import pandas as pd -from copy import deepcopy -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.multi_fidelity.utils import MFObservedData from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) +if TYPE_CHECKING: + from neps.optimizers.multi_fidelity.utils import MFObservedData + from neps.search_spaces.search_space import SearchSpace SAMPLES_TO_DRAW = ( 100 # number of random samples to draw for optimizing acquisition function @@ -131,7 +133,7 @@ def sample( # handles tabular data such that the entire unseen set of configs from the # table is considered to be the new set of candidates _partial_ids = {conf["id"].value for conf in partial_configs} - _all_ids = set(list(self.pipeline_space.custom_grid_table.keys())) + _all_ids = set(self.pipeline_space.custom_grid_table.keys()) # accounting for unseen configs only, samples remaining table if flag is set max_n = len(_all_ids) + 1 if self.sample_full_table else _n @@ -178,9 +180,7 @@ def sample( for config in new_configs: config.update_hp_values({config.fidelity_name: new_fid}) - configs = pd.concat([deepcopy(partial_configs), new_configs]) - - return configs # type: ignore + return pd.concat([deepcopy(partial_configs), new_configs]) def set_state( self, diff --git 
a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py index 81f79b96..51f10bfb 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Callable, Sequence +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING from typing_extensions import override import numpy as np @@ -119,9 +120,11 @@ def create_pool( n_best = len(self.x) if len(self.x) < self.n_best else self.n_best best_configs = [ - x for (_, x) in - sorted(zip(self.y, self.x), key=lambda pair: pair[0]) - ][:n_best] + x + for (_, x) in sorted( + zip(self.y, self.x, strict=False), key=lambda pair: pair[0] + ) + ][:n_best] seen: set[int] = set() diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/random_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/random_sampler.py index 5d783a3e..d5335731 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/random_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/random_sampler.py @@ -1,6 +1,12 @@ -from ....search_spaces.search_space import SearchSpace +from __future__ import annotations + +from typing import TYPE_CHECKING + from .base_acq_sampler import AcquisitionSampler +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + class RandomSampler(AcquisitionSampler): def __init__(self, pipeline_space: SearchSpace, patience: int = 100): diff --git a/neps/optimizers/bayesian_optimization/kernels/__init__.py b/neps/optimizers/bayesian_optimization/kernels/__init__.py index 3c922b17..7c7018d0 100644 --- a/neps/optimizers/bayesian_optimization/kernels/__init__.py +++ b/neps/optimizers/bayesian_optimization/kernels/__init__.py @@ -1,7 +1,5 @@ - - +from collections.abc import Callable from functools import partial -from typing import Callable from .vectorial_kernels import HammingKernel, Matern32Kernel, Matern52Kernel, RBFKernel from .weisfilerlehman import WeisfilerLehman diff --git a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py index 927e23c2..36add92e 100644 --- a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py @@ -1,9 +1,11 @@ -from neps.utils.common import instance_from_map -from ....search_spaces.architecture.core_graph_grammar import CoreGraphGrammar -from ....search_spaces.hyperparameters.categorical import CategoricalParameter -from ....search_spaces.hyperparameters.float import FloatParameter -from ....search_spaces.hyperparameters.integer import IntegerParameter -from ....utils.common import has_instance +from __future__ import annotations + +from neps.search_spaces.architecture.core_graph_grammar import CoreGraphGrammar +from neps.search_spaces.hyperparameters.categorical import CategoricalParameter +from neps.search_spaces.hyperparameters.float import FloatParameter +from neps.search_spaces.hyperparameters.integer import IntegerParameter +from neps.utils.common import has_instance, instance_from_map + from . 
import GraphKernelMapping, StationaryKernelMapping diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py index 1b0b37d6..12a83a19 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py @@ -1,5 +1,7 @@ """The Edge Histogram kernel as defined in :cite:`sugiyama2015halting`.""" +from __future__ import annotations + from collections import Counter from collections.abc import Iterable from warnings import warn @@ -22,7 +24,7 @@ class EdgeHistogram(VertexHistogram): If 'auto', uses a sparse matrix when the number of zeros is more than the half of the matrix size. In all cases if the dense matrix doesn't fit system memory, I sparse approach will be tried. - Attributes + Attributes: ---------- None. @@ -41,7 +43,7 @@ def parse_input(self, X: Iterable, **kwargs): node_labels and the third edge_labels (that fitting the given graph format). - Returns + Returns: ------- out : np.array, shape=(len(X), n_labels) A np array for frequency (cols) histograms for all Graphs (rows). @@ -49,82 +51,78 @@ def parse_input(self, X: Iterable, **kwargs): """ if not isinstance(X, Iterable): raise TypeError("input must be an iterable\n") - else: - rows, cols, data = list(), list(), list() - if self._method_calling in [1, 2]: - labels = dict() - self._labels = labels - elif self._method_calling == 3: - labels = dict(self._labels) - ni = 0 - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 3]: - if len(x) == 0: - warn("Ignoring empty element on index: " + str(i)) - continue - else: - # Our element is an iterable of at least 2 elements - L = x[2] - elif isinstance(x, Graph): - # get labels in any existing format - L = x.get_labels(purpose="any", label_type="edge") - else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) - - if L is None: - raise ValueError("Invalid graph entry at location " + str(i) + "!") - # construct the data input for the numpy array - for label, frequency in Counter(L.values()).items(): - # for the row that corresponds to that graph - rows.append(ni) - - # and to the value that this label is indexed - col_idx = labels.get(label, None) - if col_idx is None: - # if not indexed, add the new index (the next) - col_idx = len(labels) - labels[label] = col_idx - - # designate the certain column information - cols.append(col_idx) - - # as well as the frequency value to data - data.append(frequency) - ni += 1 + rows, cols, data = [], [], [] + if self._method_calling in [1, 2]: + labels = {} + self._labels = labels + elif self._method_calling == 3: + labels = dict(self._labels) + ni = 0 + for i, x in enumerate(iter(X)): + is_iter = isinstance(x, Iterable) + if is_iter: + x = list(x) + if is_iter and len(x) in [0, 3]: + if len(x) == 0: + warn("Ignoring empty element on index: " + str(i)) + continue + # Our element is an iterable of at least 2 elements + L = x[2] + elif isinstance(x, Graph): + # get labels in any existing format + L = x.get_labels(purpose="any", label_type="edge") + else: + raise TypeError( + "each element of X must be either a " + + "graph object or a list with at least " + + "a graph like object and node labels " + + "dict \n" + ) + if L is None: + raise 
ValueError("Invalid graph entry at location " + str(i) + "!") + # construct the data input for the numpy array + for label, frequency in Counter(L.values()).items(): + # for the row that corresponds to that graph + rows.append(ni) + + # and to the value that this label is indexed + col_idx = labels.get(label, None) + if col_idx is None: + # if not indexed, add the new index (the next) + col_idx = len(labels) + labels[label] = col_idx + + # designate the certain column information + cols.append(col_idx) + + # as well as the frequency value to data + data.append(frequency) + ni += 1 + + # Initialise the feature matrix + if self._method_calling in [1, 2]: + if self.sparse == "auto": + self.sparse_ = len(cols) / float(ni * len(labels)) <= 0.5 + else: + self.sparse_ = bool(self.sparse) + + if self.sparse_: + features = csr_matrix( + (data, (rows, cols)), shape=(ni, len(labels)), copy=False + ) + else: # Initialise the feature matrix - if self._method_calling in [1, 2]: - if self.sparse == "auto": - self.sparse_ = len(cols) / float(ni * len(labels)) <= 0.5 - else: - self.sparse_ = bool(self.sparse) - - if self.sparse_: - features = csr_matrix( - (data, (rows, cols)), shape=(ni, len(labels)), copy=False + try: + features = zeros(shape=(ni, len(labels))) + features[rows, cols] = data + except MemoryError: + warn("memory-error: switching to sparse") + self.sparse_, features = ( + True, + csr_matrix((data, (rows, cols)), shape=(ni, len(labels)), copy=False), ) - else: - # Initialise the feature matrix - try: - features = zeros(shape=(ni, len(labels))) - features[rows, cols] = data - except MemoryError: - warn("memory-error: switching to sparse") - self.sparse_, features = ( - True, - csr_matrix( - (data, (rows, cols)), shape=(ni, len(labels)), copy=False - ), - ) - - if ni == 0: - raise ValueError("parsed input is empty") - return features + + if ni == 0: + raise ValueError("parsed input is empty") + return features diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py index b34b2c79..e0ad94f3 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py @@ -1,11 +1,12 @@ +from __future__ import annotations + import torch def calculate_kernel_matrix_as_tensor( X, Y=None, oa=False, se_kernel=None, normalize=True ) -> torch.Tensor: - """ - Same as calculate kernel matrix, but in pytorch framework and uses autodiff to compute the gradient of + """Same as calculate kernel matrix, but in pytorch framework and uses autodiff to compute the gradient of the kernel function with respect to the feature vector. This function is taken out of the class to facilitate derivative computation. @@ -26,18 +27,14 @@ def calculate_kernel_matrix_as_tensor( normalize: bool: Whether to normalize the GP covariance matrix to the range of [0, 1]. Default is True. - Returns + Returns: ------- K: pytorch tensor, shape = [n_targets, n_inputs] dK_dY: pytorch tensor, of the same shape of K. The derivative of the value of the kernel function with respect to each of the X. If Y is None, the derivative is instead taken at the *training point* (i.e. X). 
""" - if Y is None: - if se_kernel is not None: - K = se_kernel.forward(X, X) - else: - K = X @ X.t() + K = se_kernel.forward(X, X) if se_kernel is not None else X @ X.t() if normalize: K_diag = torch.sqrt(torch.diag(K)) K_diag_outer = torch.ger(K_diag, K_diag) @@ -46,10 +43,7 @@ def calculate_kernel_matrix_as_tensor( assert Y.shape[1] == X.shape[1], ( "got Y shape " + str(Y.shape[1]) + " but X shape " + str(X.shape[1]) ) - if se_kernel is not None: - K = se_kernel.forward(X, Y) - else: - K = Y @ X.t() + K = se_kernel.forward(X, Y) if se_kernel is not None else Y @ X.t() if normalize: Kxx = calculate_kernel_matrix_as_tensor( X, X, oa=oa, se_kernel=se_kernel, normalize=False diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py index 4a4dfc79..a3a31bdf 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py @@ -108,7 +108,7 @@ def initialize(self): self._initialized["n_jobs"] = True if not self._initialized["sparse"]: if self.sparse not in ["auto", False, True]: - TypeError("sparse could be False, True or auto") + raise TypeError("sparse could be False, True or auto") self._initialized["sparse"] = True def parse_input(self, X, label_start_idx=0, label_end_idx=None): @@ -144,100 +144,98 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None): if not isinstance(X, Iterable): raise TypeError("input must be an iterable\n") - else: - rows, cols, data = [], [], [] - if self._method_calling in [0, 1, 2]: - labels = {} - self._labels = labels - elif self._method_calling == 3: - labels = dict(self._labels) - ni = 0 - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warn("Ignoring empty element on index: " + str(i)) - continue - else: - # Our element is an iterable of at least 2 elements - L = x[1] - elif isinstance(x, Graph): - # get labels in any existing format - L = x.get_labels(purpose="any") - else: - raise TypeError( - "each element of X must be either a " - "graph object or a list with at least " - "a graph like object and node labels " - "dict \n" - ) - - # construct the data input for the numpy array - for label, frequency in Counter(L.values()).items(): - # for the row that corresponds to that graph - rows.append(ni) - - # and to the value that this label is indexed - if self.require_ordered_features: - try: - col_idx = int(label) - label_start_idx # Offset - except ValueError: - logging.error( - "Failed to convert label to a valid integer. Check whether all labels are" - "numeric, and whether you called this kernel directly instead of from the" - "Weisfiler-Lehman kernel. Falling back to the default unordered feature" - "matrix." 
- ) - self.require_ordered_features = False - if not self.require_ordered_features: - col_idx = labels.get(label, None) - if col_idx is None: - # if not indexed, add the new index (the next) - col_idx = len(labels) - labels[label] = col_idx - - # designate the certain column information - cols.append(col_idx) - - # as well as the frequency value to data - data.append(frequency) - ni += 1 - - if self.require_ordered_features: - label_length = max(label_end_idx - label_start_idx, *cols) + 1 + rows, cols, data = [], [], [] + if self._method_calling in [0, 1, 2]: + labels = {} + self._labels = labels + elif self._method_calling == 3: + labels = dict(self._labels) + ni = 0 + for i, x in enumerate(iter(X)): + is_iter = isinstance(x, Iterable) + if is_iter: + x = list(x) + if is_iter and len(x) in [0, 2, 3]: + if len(x) == 0: + warn("Ignoring empty element on index: " + str(i)) + continue + # Our element is an iterable of at least 2 elements + L = x[1] + elif isinstance(x, Graph): + # get labels in any existing format + L = x.get_labels(purpose="any") else: - label_length = len(labels) + raise TypeError( + "each element of X must be either a " + "graph object or a list with at least " + "a graph like object and node labels " + "dict \n" + ) - if self._method_calling in [0, 1, 2]: - if self.sparse == "auto": - self.sparse_ = len(cols) / float(ni * label_length) <= 0.5 - else: - self.sparse_ = bool(self.sparse) + # construct the data input for the numpy array + for label, frequency in Counter(L.values()).items(): + # for the row that corresponds to that graph + rows.append(ni) + + # and to the value that this label is indexed + if self.require_ordered_features: + try: + col_idx = int(label) - label_start_idx # Offset + except ValueError: + logging.error( + "Failed to convert label to a valid integer. Check whether all labels are" + "numeric, and whether you called this kernel directly instead of from the" + "Weisfiler-Lehman kernel. Falling back to the default unordered feature" + "matrix." 
+ ) + self.require_ordered_features = False + if not self.require_ordered_features: + col_idx = labels.get(label, None) + if col_idx is None: + # if not indexed, add the new index (the next) + col_idx = len(labels) + labels[label] = col_idx + + # designate the certain column information + cols.append(col_idx) + + # as well as the frequency value to data + data.append(frequency) + ni += 1 - if self.sparse_: - features = csr_matrix( - (data, (rows, cols)), shape=(ni, label_length), copy=False - ) + if self.require_ordered_features: + label_length = max(label_end_idx - label_start_idx, *cols) + 1 + else: + label_length = len(labels) + + if self._method_calling in [0, 1, 2]: + if self.sparse == "auto": + self.sparse_ = len(cols) / float(ni * label_length) <= 0.5 else: - # Initialise the feature matrix - try: - features = zeros(shape=(ni, label_length)) - features[rows, cols] = data - - except MemoryError: - warn("memory-error: switching to sparse") - self.sparse_, features = ( - True, - csr_matrix( - (data, (rows, cols)), shape=(ni, label_length), copy=False - ), - ) - - if ni == 0: - raise ValueError("parsed input is empty") - return features + self.sparse_ = bool(self.sparse) + + if self.sparse_: + features = csr_matrix( + (data, (rows, cols)), shape=(ni, label_length), copy=False + ) + else: + # Initialise the feature matrix + try: + features = zeros(shape=(ni, label_length)) + features[rows, cols] = data + + except MemoryError: + warn("memory-error: switching to sparse") + self.sparse_, features = ( + True, + csr_matrix( + (data, (rows, cols)), shape=(ni, label_length), copy=False + ), + ) + + if ni == 0: + raise ValueError("parsed input is empty") + return features def _calculate_kernel_matrix(self, Y=None): """Calculate the kernel matrix given a target_graph and a kernel. @@ -282,8 +280,7 @@ def _calculate_kernel_matrix(self, Y=None): if self.sparse_: return K.toarray() - else: - return K + return K def diagonal(self, use_tensor=False): """Calculate the kernel matrix diagonal of the fitted data. 
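The feature matrices in these histogram kernels are assembled from the (data, (rows, cols)) triplet form accepted by scipy's csr_matrix, as in the hunks above. A tiny sketch with two hypothetical graphs and two distinct labels shows what the three lists encode.

    from scipy.sparse import csr_matrix

    rows = [0, 0, 1]  # which graph (row) each count belongs to
    cols = [0, 1, 0]  # which label column the count falls into
    data = [3, 1, 2]  # how often that label occurs in that graph
    features = csr_matrix((data, (rows, cols)), shape=(2, 2))
    print(features.toarray())
    # [[3 1]
    #  [2 0]]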
@@ -319,12 +316,11 @@ def diagonal(self, use_tensor=False): if use_tensor: Y_diag = torch.einsum("ij, ij->i", [self.Y_tensor, self.Y_tensor]) return self._X_diag, Y_diag + if self.sparse_: + Y_diag = squeeze(array(self._Y.multiply(self._Y).sum(axis=1))) else: - if self.sparse_: - Y_diag = squeeze(array(self._Y.multiply(self._Y).sum(axis=1))) - else: - Y_diag = einsum("ij,ij->i", self._Y, self._Y) - return self._X_diag, Y_diag + Y_diag = einsum("ij,ij->i", self._Y, self._Y) + return self._X_diag, Y_diag except NotFittedError: return self._X_diag @@ -360,8 +356,7 @@ def transform(self, X, return_embedding_only=False, **kwargs): # Input validation and parsing if X is None: raise ValueError("`transform` input cannot be None") - else: - Y = self.parse_input(X, **kwargs) + Y = self.parse_input(X, **kwargs) if return_embedding_only: return Y @@ -446,8 +441,7 @@ def fit(self, X, y=None, **kwargs): # Input validation and parsing if X is None: raise ValueError("`fit` input cannot be None") - else: - self.X = self.parse_input(X, **kwargs) + self.X = self.parse_input(X, **kwargs) # Return the transformer return self diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py index 8c4baf64..f62d0ca0 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py @@ -8,8 +8,8 @@ import warnings from ast import literal_eval from collections import OrderedDict +from collections.abc import Iterable from copy import deepcopy -from typing import Iterable import numpy as np import torch @@ -209,7 +209,7 @@ def parse_input( raise ValueError( "method call must be called either from fit " + "or fit-transform" ) - elif hasattr(self, "_X_diag"): + if hasattr(self, "_X_diag"): # Clean _X_diag value delattr(self, "_X_diag") @@ -221,57 +221,56 @@ def parse_input( # Input validation and parsing if not isinstance(X, collections.abc.Iterable): raise TypeError("input must be an iterable\n") - else: - nx = 0 - Gs_ed, L, distinct_values, extras = {}, {}, set(), {} - for idx, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and (len(x) == 0 or len(x) >= 2): - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(idx)) - continue - elif len(x) > 2: - extra = () - if len(x) > 3: - extra = tuple(x[3:]) - x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) - extra = ( - x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, - ), - *extra, - ) - else: - x = Graph(x[0], x[1], {}, graph_format=self._graph_format) - extra = () - - elif isinstance(x, Graph): - x.desired_format(self._graph_format) - el = x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, + nx = 0 + Gs_ed, L, distinct_values, extras = {}, {}, set(), {} + for idx, x in enumerate(iter(X)): + is_iter = isinstance(x, collections.abc.Iterable) + if is_iter: + x = list(x) + if is_iter and (len(x) == 0 or len(x) >= 2): + if len(x) == 0: + warnings.warn("Ignoring empty element on index: " + str(idx)) + continue + if len(x) > 2: + extra = () + if len(x) > 3: + extra = tuple(x[3:]) + x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) + extra = ( + x.get_labels( + purpose=self._graph_format, + label_type="edge", + return_none=True, + ), + *extra, ) - extra = () if el is None 
else (el,) - else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") - extras[nx] = extra - distinct_values |= set(L[nx].values()) - nx += 1 - if nx == 0: - raise ValueError("parsed input is empty") + x = Graph(x[0], x[1], {}, graph_format=self._graph_format) + extra = () + + elif isinstance(x, Graph): + x.desired_format(self._graph_format) + el = x.get_labels( + purpose=self._graph_format, + label_type="edge", + return_none=True, + ) + extra = () if el is None else (el,) + + else: + raise TypeError( + "each element of X must be either a " + + "graph object or a list with at least " + + "a graph like object and node labels " + + "dict \n" + ) + Gs_ed[nx] = x.get_edge_dictionary() + L[nx] = x.get_labels(purpose="dictionary") + extras[nx] = extra + distinct_values |= set(L[nx].values()) + nx += 1 + if nx == 0: + raise ValueError("parsed input is empty") # Save the number of "fitted" graphs. self._nx = nx @@ -411,9 +410,9 @@ def generate_graphs(label_count: int, WL_labels_inverse): if return_embedding_only: return K - elif self._method_calling == 1: + if self._method_calling == 1: return base_graph_kernel - elif self._method_calling == 2: + if self._method_calling == 2: if self.as_tensor: K = torch.stack(K, dim=0).sum(dim=0) return K, base_graph_kernel @@ -451,8 +450,7 @@ def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: di ] # Flush the feature dimensions if X is None: raise ValueError("transform input cannot be None") - else: - km, self.X = self.parse_input(X, gp_fit=gp_fit) + km, self.X = self.parse_input(X, gp_fit=gp_fit) return km @@ -487,40 +485,37 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): # Input validation and parsing if X is None: raise ValueError("transform input cannot be None") - elif not isinstance(X, collections.abc.Iterable): + if not isinstance(X, collections.abc.Iterable): raise ValueError("input must be an iterable\n") - else: - nx = 0 - distinct_values = set() - Gs_ed, L = {}, {} - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(i)) - continue - - elif len(x) in [2, 3]: - x = Graph(x[0], x[1], {}, self._graph_format) - elif isinstance(x, Graph): - x.desired_format("dictionary") - else: - raise ValueError( - "each element of X must have at " - + "least one and at most 3 elements\n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") + nx = 0 + distinct_values = set() + Gs_ed, L = {}, {} + for i, x in enumerate(iter(X)): + is_iter = isinstance(x, collections.abc.Iterable) + if is_iter: + x = list(x) + if is_iter and len(x) in [0, 2, 3]: + if len(x) == 0: + warnings.warn("Ignoring empty element on index: " + str(i)) + continue + + if len(x) in [2, 3]: + x = Graph(x[0], x[1], {}, self._graph_format) + elif isinstance(x, Graph): + x.desired_format("dictionary") + else: + raise ValueError( + "each element of X must have at " + + "least one and at most 3 elements\n" + ) + Gs_ed[nx] = x.get_edge_dictionary() + L[nx] = x.get_labels(purpose="dictionary") - # Hold all the distinct values - distinct_values |= { - v for v in L[nx].values() if v not in self._inv_labels[0] - } - nx += 1 - if nx == 0: - raise ValueError("parsed 
input is empty") + # Hold all the distinct values + distinct_values |= {v for v in L[nx].values() if v not in self._inv_labels[0]} + nx += 1 + if nx == 0: + raise ValueError("parsed input is empty") nl = len(self._inv_labels[0]) WL_labels_inverse = { @@ -690,8 +685,7 @@ def diagonal(self): Y_diag = torch.tensor(Y_diag) if self._is_transformed: return self._X_diag, Y_diag - else: - return self._X_diag + return self._X_diag @staticmethod def translate_label(curr_layer: dict, h: int, prev_layer: dict | None = None): @@ -706,17 +700,16 @@ def translate_label(curr_layer: dict, h: int, prev_layer: dict | None = None): """ if h == 0: return {v: str(k) for k, v in curr_layer.items()}, curr_layer - else: - assert prev_layer is not None - label_in_node_attr, inv_label_in_node_attr = OrderedDict(), OrderedDict() - for pattern, encoding in curr_layer.items(): - # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer - root, leaf = literal_eval(pattern) - root_ = prev_layer[root] - leaf_ = [prev_layer[i] for i in leaf] - label_in_node_attr.update({encoding: "~".join([root_, *leaf_])}) - inv_label_in_node_attr.update({"~".join([root_, *leaf_]): encoding}) - return label_in_node_attr, inv_label_in_node_attr + assert prev_layer is not None + label_in_node_attr, inv_label_in_node_attr = OrderedDict(), OrderedDict() + for pattern, encoding in curr_layer.items(): + # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer + root, leaf = literal_eval(pattern) + root_ = prev_layer[root] + leaf_ = [prev_layer[i] for i in leaf] + label_in_node_attr.update({encoding: "~".join([root_, *leaf_])}) + inv_label_in_node_attr.update({"~".join([root_, *leaf_]): encoding}) + return label_in_node_attr, inv_label_in_node_attr @staticmethod def _compute_feature_weight( diff --git a/neps/optimizers/bayesian_optimization/kernels/utils.py b/neps/optimizers/bayesian_optimization/kernels/utils.py index e134bfd0..6d94a25d 100644 --- a/neps/optimizers/bayesian_optimization/kernels/utils.py +++ b/neps/optimizers/bayesian_optimization/kernels/utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING import networkx as nx import numpy as np @@ -22,8 +22,8 @@ def transform_to_undirected(gr: list): return undirected_gr -def extract_configs(configs: list[SearchSpace]) -> Tuple[list, list]: - """Extracts graph & HPs from configs objects +def extract_configs(configs: list[SearchSpace]) -> tuple[list, list]: + """Extracts graph & HPs from configs objects. 
Args: configs (list): Object holding graph and/or HPs @@ -53,10 +53,7 @@ def extract_configs(configs: list[SearchSpace]) -> Tuple[list, list]: def graph_metrics(graph, metric=None, directed=True): - if directed: - G = graph - else: - G = graph.to_undirected() + G = graph if directed else graph.to_undirected() # global metrics if metric == "avg_path_length": @@ -75,14 +72,14 @@ def graph_metrics(graph, metric=None, directed=True): def extract_configs_hierarchy( configs: list, d_graph_features: int, hierarchy_consider=None -) -> Tuple[list, list]: +) -> tuple[list, list]: """Extracts graph & graph features from configs objects Args: configs (list): Object holding graph and/or graph features d_graph_features (int): Number of global graph features used; if d_graph_features=0, indicate not using global graph features hierarchy_consider (list or None): Specify graphs at which earlier hierarchical levels to be considered Returns: - Tuple[list, list]: list of graphs, list of HPs + Tuple[list, list]: list of graphs, list of HPs. """ N = len(configs) @@ -114,7 +111,8 @@ def extract_configs_hierarchy( for hierarchy_id in hierarchy_consider ] for g in combined_graphs - ] + ], + strict=False, ), ) ) diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 3831ec61..95b02ba0 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -1,9 +1,9 @@ from __future__ import annotations -from typing import Any from pathlib import Path -import torch +from typing import Any +import torch from ifbo import FTPFN @@ -136,23 +136,21 @@ def get_lcb( self, test_x: torch.Tensor, beta: float = (1 - 0.682) / 2 ) -> torch.Tensor: logits = self._get_logits(test_x) - lcb = self.ftpfn.model.criterion.ucb( + return self.ftpfn.model.criterion.ucb( logits=logits, best_f=None, rest_prob=beta, maximize=False, # IMPORTANT to be False, should calculate the LCB using the lower-bound ICDF as per beta ) - return lcb @torch.no_grad() def get_ucb( self, test_x: torch.Tensor, beta: float = (1 - 0.682) / 2 ) -> torch.Tensor: logits = self._get_logits(test_x) - lcb = self.ftpfn.model.criterion.ucb( + return self.ftpfn.model.criterion.ucb( logits=logits, best_f=None, rest_prob=beta, maximize=True, # IMPORTANT to be True, should calculate the UCB using the upper-bound ICDF as per beta ) - return lcb diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 8436310c..2ab0b897 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -4,8 +4,9 @@ import logging import math +from collections.abc import Mapping from functools import reduce -from typing import TYPE_CHECKING, Any, Mapping, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar import gpytorch import gpytorch.constraints @@ -306,7 +307,10 @@ def optimize_acq( col, choice_indices = next(iter(cats.items())) fixed_cats = [{col: i} for i in choice_indices] else: - fixed_cats = [dict(zip(cats.keys(), combo)) for combo in product(*cats.values())] + fixed_cats = [ + dict(zip(cats.keys(), combo, strict=False)) + for combo in product(*cats.values()) + ] # TODO: we should deterministically shuffle the fixed_categoricals # as the underlying function does not. 
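The fixed_cats expansion above enumerates every joint assignment of the categorical columns so that optimize_acqf_mixed can optimise the continuous dimensions for each one. A sketch with two hypothetical categorical columns (encoded at indices 3 and 5) shows the shape of that list.

    from itertools import product

    cats = {3: [0, 1], 5: [0, 1, 2]}  # encoded column index -> allowed integer codes
    fixed_cats = [
        dict(zip(cats.keys(), combo, strict=False)) for combo in product(*cats.values())
    ]
    print(fixed_cats[:3])   # [{3: 0, 5: 0}, {3: 0, 5: 1}, {3: 0, 5: 2}]
    print(len(fixed_cats))  # 6 = 2 * 3 dictionaries passed as fixed_features_list

The multiplicative growth of these combinations is why optimize_acq guards the mixed path with maximum_allowed_categorical_combinations.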
diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index b5d518bc..6fe20655 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,7 +1,8 @@ from __future__ import annotations import math -from typing import TYPE_CHECKING, Any, Literal, Mapping +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Literal import torch from botorch.acquisition import LinearMCObjective diff --git a/neps/optimizers/grid_search/optimizer.py b/neps/optimizers/grid_search/optimizer.py index e9f1d9a3..f548b197 100644 --- a/neps/optimizers/grid_search/optimizer.py +++ b/neps/optimizers/grid_search/optimizer.py @@ -1,12 +1,16 @@ +from __future__ import annotations + import random -from typing import Any +from typing import TYPE_CHECKING, Any from typing_extensions import override -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class GridSearch(BaseOptimizer): def __init__( diff --git a/neps/optimizers/info.py b/neps/optimizers/info.py index 40b08174..d3dfffd8 100644 --- a/neps/optimizers/info.py +++ b/neps/optimizers/info.py @@ -1,18 +1,18 @@ +from __future__ import annotations + import os import yaml class SearcherConfigs: - """ - This class provides methods to access default configuration details + """This class provides methods to access default configuration details for NePS optimizers. """ @staticmethod def _get_searchers_folder_path() -> str: - """ - Helper method to get the folder path for default searchers. + """Helper method to get the folder path for default searchers. Returns: str: The absolute path to the default searchers folder. @@ -22,8 +22,7 @@ def _get_searchers_folder_path() -> str: @staticmethod def get_searchers() -> list[str]: - """ - List all the searcher names that can be used in neps run. + """List all the searcher names that can be used in neps run. Returns: list[str]: A list of searcher names. @@ -40,8 +39,7 @@ def get_searchers() -> list[str]: @staticmethod def get_available_algorithms() -> list[str]: - """ - List all available algorithms used by NePS searchers. + """List all available algorithms used by NePS searchers. Returns: list[str]: A list of algorithm names. @@ -62,8 +60,7 @@ def get_available_algorithms() -> list[str]: @staticmethod def get_searcher_from_algorithm(algorithm: str) -> list[str]: - """ - Get all NePS searchers that use a specific searching algorithm. + """Get all NePS searchers that use a specific searching algorithm. Args: algorithm (str): The name of the algorithm needed for the search. @@ -86,8 +83,7 @@ def get_searcher_from_algorithm(algorithm: str) -> list[str]: @staticmethod def get_searcher_kwargs(searcher: str) -> str: - """ - Get the kwargs and algorithm setup for a specific searcher. + """Get the kwargs and algorithm setup for a specific searcher. Args: searcher (str): The name of the searcher to check the details of. 
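The import reshuffling repeated across these files follows one pattern: `from __future__ import annotations`, explicit `X | None = None` defaults, and type-only imports guarded by `TYPE_CHECKING`. A small illustrative sketch (the `describe` helper is made up) of what the guard buys at runtime:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers; importing this module no longer pulls in
    # the search-space machinery at runtime.
    from neps.search_spaces.search_space import SearchSpace


def describe(space: SearchSpace, budget: int | None = None) -> str:
    """Hypothetical helper: with postponed annotations the guarded import above
    is enough for static checking and costs nothing at import time."""
    return f"{type(space).__name__} with budget={budget}"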
diff --git a/neps/optimizers/multi_fidelity/hyperband.py b/neps/optimizers/multi_fidelity/hyperband.py index 510fb582..f6c445ac 100644 --- a/neps/optimizers/multi_fidelity/hyperband.py +++ b/neps/optimizers/multi_fidelity/hyperband.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import typing from copy import deepcopy from typing import Any, Literal @@ -5,15 +7,6 @@ import numpy as np -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.multi_fidelity.mf_bo import MFBOBase from neps.optimizers.multi_fidelity.promotion_policy import ( AsyncPromotionPolicy, @@ -31,6 +24,17 @@ SuccessiveHalvingBase, ) +if typing.TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class HyperbandBase(SuccessiveHalvingBase): """Implements a Hyperband procedure with a sampling and promotion policy.""" @@ -50,29 +54,29 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, ): - args = dict( - pipeline_space=pipeline_space, - budget=budget, - eta=eta, - early_stopping_rate=self.early_stopping_rate, # HB subsumes this param of SH - initial_design_type=initial_design_type, - use_priors=use_priors, - sampling_policy=sampling_policy, - promotion_policy=promotion_policy, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - prior_confidence=prior_confidence, - random_interleave_prob=random_interleave_prob, - sample_default_first=sample_default_first, - sample_default_at_target=sample_default_at_target, - ) + args = { + "pipeline_space": pipeline_space, + "budget": budget, + "eta": eta, + "early_stopping_rate": self.early_stopping_rate, # HB subsumes this param of SH + "initial_design_type": initial_design_type, + "use_priors": use_priors, + "sampling_policy": sampling_policy, + "promotion_policy": promotion_policy, + "loss_value_on_error": loss_value_on_error, + "cost_value_on_error": cost_value_on_error, + "ignore_errors": ignore_errors, + "logger": logger, + "prior_confidence": prior_confidence, + "random_interleave_prob": random_interleave_prob, + "sample_default_first": sample_default_first, + "sample_default_at_target": sample_default_at_target, + } super().__init__(**args) # stores the flattened sequence of SH brackets to loop over - the HB heuristic # for (n,r) pairing, i.e., (num. 
configs, fidelity) @@ -120,7 +124,6 @@ def _handle_promotions(self): # promotions are handled by the individual SH brackets which are explicitly # called in the _update_sh_bracket_state() function # overloaded function disables the need for retrieving promotions for HB overall - return @override def load_optimization_state( @@ -134,7 +137,7 @@ def load_optimization_state( previous_results=previous_results, pending_evaluations=pending_evaluations, budget_info=budget_info, - optimizer_state=optimizer_state + optimizer_state=optimizer_state, ) # important for the global HB to run the right SH self._update_sh_bracket_state() @@ -340,28 +343,28 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, ): - args = dict( - pipeline_space=pipeline_space, - budget=budget, - eta=eta, - initial_design_type=initial_design_type, - use_priors=use_priors, - sampling_policy=sampling_policy, - promotion_policy=promotion_policy, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - prior_confidence=prior_confidence, - random_interleave_prob=random_interleave_prob, - sample_default_first=sample_default_first, - sample_default_at_target=sample_default_at_target, - ) + args = { + "pipeline_space": pipeline_space, + "budget": budget, + "eta": eta, + "initial_design_type": initial_design_type, + "use_priors": use_priors, + "sampling_policy": sampling_policy, + "promotion_policy": promotion_policy, + "loss_value_on_error": loss_value_on_error, + "cost_value_on_error": cost_value_on_error, + "ignore_errors": ignore_errors, + "logger": logger, + "prior_confidence": prior_confidence, + "random_interleave_prob": random_interleave_prob, + "sample_default_first": sample_default_first, + "sample_default_at_target": sample_default_at_target, + } super().__init__(**args) # overwrite parent class SH brackets with Async SH brackets self.sh_brackets = {} @@ -402,8 +405,7 @@ def _get_bracket_to_run(self): self.eta ** (K - s) * (K + 1) / (K - s + 1) for s in range(self.max_rung + 1) ] bracket_probs = np.array(bracket_probs) / sum(bracket_probs) - bracket_next = np.random.choice(range(self.max_rung + 1), p=bracket_probs) - return bracket_next + return np.random.choice(range(self.max_rung + 1), p=bracket_probs) def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: """...and this is the method that decides which point to query. 
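The `_get_bracket_to_run` hunk above only changes the return to hand back the sampled bracket directly; the weighting itself is untouched. A quick worked example of those probabilities, with illustrative eta = 3 and max_rung = 3:

import numpy as np

eta, max_rung = 3, 3  # illustrative values
K = max_rung
bracket_probs = [eta ** (K - s) * (K + 1) / (K - s + 1) for s in range(max_rung + 1)]
bracket_probs = np.array(bracket_probs) / sum(bracket_probs)
print(bracket_probs.round(3))  # [0.551 0.245 0.122 0.082] -> lower brackets are favoured
bracket = np.random.choice(range(max_rung + 1), p=bracket_probs)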
@@ -477,50 +479,50 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, # new arguments for model model_policy: typing.Any = ModelPolicy, surrogate_model: str | Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", ): - hb_args = dict( - pipeline_space=pipeline_space, - budget=budget, - eta=eta, - initial_design_type=initial_design_type, - use_priors=use_priors, - sampling_policy=sampling_policy, - promotion_policy=promotion_policy, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - prior_confidence=prior_confidence, - random_interleave_prob=random_interleave_prob, - sample_default_first=sample_default_first, - sample_default_at_target=sample_default_at_target, - ) + hb_args = { + "pipeline_space": pipeline_space, + "budget": budget, + "eta": eta, + "initial_design_type": initial_design_type, + "use_priors": use_priors, + "sampling_policy": sampling_policy, + "promotion_policy": promotion_policy, + "loss_value_on_error": loss_value_on_error, + "cost_value_on_error": cost_value_on_error, + "ignore_errors": ignore_errors, + "logger": logger, + "prior_confidence": prior_confidence, + "random_interleave_prob": random_interleave_prob, + "sample_default_first": sample_default_first, + "sample_default_at_target": sample_default_at_target, + } super().__init__(**hb_args) self.pipeline_space.has_prior = self.use_priors - bo_args = dict( - surrogate_model=surrogate_model, - domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - surrogate_model_args=surrogate_model_args, - acquisition=acquisition, - log_prior_weighted=log_prior_weighted, - acquisition_sampler=acquisition_sampler, - ) + bo_args = { + "surrogate_model": surrogate_model, + "domain_se_kernel": domain_se_kernel, + "hp_kernels": hp_kernels, + "surrogate_model_args": surrogate_model_args, + "acquisition": acquisition, + "log_prior_weighted": log_prior_weighted, + "acquisition_sampler": acquisition_sampler, + } # counting non-fidelity dimensions in search space ndims = sum( 1 @@ -535,4 +537,5 @@ def __init__( sh.model_policy = self.model_policy sh.sample_new_config = self.sample_new_config + # TODO: TrulyAsyncHyperband diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index bf2bbc83..c1a87862 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,27 +1,31 @@ -from typing import Any +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING, Any from typing_extensions import override import numpy as np import pandas as pd -import warnings -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult -from neps.utils.common import instance_from_map -from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer from 
neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.multi_fidelity.mf_bo import FreezeThawModel, PFNSurrogate from neps.optimizers.multi_fidelity.utils import MFObservedData +from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace +from neps.utils.common import instance_from_map + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult class IFBO(BaseOptimizer): @@ -32,7 +36,7 @@ class IFBO(BaseOptimizer): def __init__( self, pipeline_space: SearchSpace, - budget: int = None, + budget: int | None = None, step_size: int | float = 1, optimal_assignment: bool = False, # pylint: disable=unused-argument use_priors: bool = False, @@ -45,18 +49,18 @@ def __init__( logger=None, # arguments for model surrogate_model: str | Any = "ftpfn", - surrogate_model_args: dict = None, - domain_se_kernel: str = None, - graph_kernels: list = None, - hp_kernels: list = None, + surrogate_model_args: dict | None = None, + domain_se_kernel: str | None = None, + graph_kernels: list | None = None, + hp_kernels: list | None = None, acquisition: str | BaseAcquisition = acquisition, - acquisition_args: dict = None, + acquisition_args: dict | None = None, acquisition_sampler: str | AcquisitionSampler = "freeze-thaw", - acquisition_sampler_args: dict = None, + acquisition_sampler_args: dict | None = None, model_policy: Any = PFNSurrogate, initial_design_size: int = 1, ): - """Initialise + """Initialise. Args: pipeline_space: Space in which to search @@ -187,18 +191,17 @@ def _adjust_fidelity_for_freeze_thaw_steps( f"Adjusted fidelity lower bound to {pipeline_space.fidelity.lower} " f"for equal-sized steps of {step_size}." 
) - print("New fidelity: ", pipeline_space.fidelity) return pipeline_space def _prep_model_args(self, hp_kernels, graph_kernels, pipeline_space): if self.surrogate_model_name in ["gp", "gp_hierarchy"]: # setup for GP implemented in NePS self.surrogate_model_args.update( - dict( + { # domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - graph_kernels=graph_kernels, - ) + "hp_kernels": hp_kernels, + "graph_kernels": graph_kernels, + } ) if not self.surrogate_model_args["hp_kernels"]: raise ValueError("No kernels are provided!") @@ -246,16 +249,10 @@ def total_budget_spent(self) -> int | float: n_configs = len(self.observed_configs.seen_config_ids) total_budget_level = sum(self.observed_configs.seen_budget_levels) total_initial_budget_spent = n_configs * self.pipeline_space.fidelity.lower - total_budget_spent = ( - total_initial_budget_spent + total_budget_level * self.step_size - ) - - return total_budget_spent + return total_initial_budget_spent + total_budget_level * self.step_size def is_init_phase(self) -> bool: - if self.num_train_configs < self._initial_design_size: - return True - return False + return self.num_train_configs < self._initial_design_size @property def num_train_configs(self): @@ -287,8 +284,8 @@ def load_optimization_state( self._handle_pending_evaluations(pending_evaluations) # an aesthetic choice more than a functional choice - self.observed_configs.df.sort_index( - level=self.observed_configs.df.index.names, inplace=True + self.observed_configs.df = self.observed_configs.df.sort_index( + level=self.observed_configs.df.index.names ) # TODO: can we do better than keeping a copy of the observed configs? # TODO: can we not hide this in load_results and have something that pops out @@ -322,7 +319,7 @@ def index_data_split(config_id: str, config_val): tuple(index_data_split(config_id, config_val)) for config_id, config_val in previous_results.items() ] - indices, rows = zip(*index_row) + indices, rows = zip(*index_row, strict=False) self.observed_configs.add_data(data=list(rows), index=list(indices)) def _handle_pending_evaluations(self, pending_evaluations): diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index ef31f9cc..790c833a 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -1,15 +1,20 @@ # type: ignore - +from __future__ import annotations from copy import deepcopy + import torch -from neps.utils.common import instance_from_map from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.multi_fidelity.utils import ( - get_tokenized_data, get_training_data_for_freeze_thaw + get_tokenized_data, + get_training_data_for_freeze_thaw, ) -from neps.optimizers.multi_fidelity_prior.utils import calc_total_resources_spent, update_fidelity +from neps.optimizers.multi_fidelity_prior.utils import ( + calc_total_resources_spent, + update_fidelity, +) +from neps.utils.common import instance_from_map class MFBOBase: @@ -20,7 +25,6 @@ class MFBOBase: def _fit_models(self): """Performs necessary procedures to build and use models.""" - if not self.model_based: # do nothing here if the algorithm has model-based search disabled return @@ -139,7 +143,7 @@ def is_init_phase(self) -> bool: def sample_new_config( self, - rung: int = None, + rung: int | None = None, **kwargs, # pylint: disable=unused-argument ): """Samples configuration from policies or random.""" @@ -186,7 +190,7 @@ def __init__( self, pipeline_space, surrogate_model: str = 
"ftpfn", - surrogate_model_args: dict = None, + surrogate_model_args: dict | None = None, step_size: int = 1, ): self.observed_configs = None @@ -228,7 +232,7 @@ def set_state( name="surrogate model", kwargs=self.surrogate_model_args, ) - + class PFNSurrogate(FreezeThawModel): """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" @@ -246,18 +250,20 @@ def update_model(self): self.observed_configs.perf_col, self.pipeline_space, step_size=self.step_size, - maximize=True # inverts performance since NePS minimizes + maximize=True, # inverts performance since NePS minimizes ) df_idxs = torch.Tensor(idxs) df_x = torch.Tensor(get_tokenized_data(configs)) df_steps = torch.Tensor(steps) - train_x = torch.hstack([ - df_idxs.reshape(df_steps.shape[0], 1), - df_steps.reshape(df_steps.shape[0], 1), - df_x - ]) + train_x = torch.hstack( + [ + df_idxs.reshape(df_steps.shape[0], 1), + df_steps.reshape(df_steps.shape[0], 1), + df_x, + ] + ) train_y = torch.Tensor(performance) - + # fit the model, on only completed runs self._fit(train_x, train_y) @@ -270,16 +276,18 @@ def update_model(self): self.observed_configs.perf_col, self.pipeline_space, step_size=self.step_size, - maximize=True # inverts performance since NePS minimizes + maximize=True, # inverts performance since NePS minimizes ) _df_x = torch.Tensor(get_tokenized_data(_configs)) _df_idxs = torch.Tensor(_idxs) _df_steps = torch.Tensor(_steps) - _test_x = torch.hstack([ - _df_idxs.reshape(_df_idxs.shape[0], 1), - _df_steps.reshape(_df_steps.shape[0], 1), - _df_x - ]) + _test_x = torch.hstack( + [ + _df_idxs.reshape(_df_idxs.shape[0], 1), + _df_steps.reshape(_df_steps.shape[0], 1), + _df_x, + ] + ) _performances = self._predict(_test_x) # returns maximizing metric # update the training data train_x = torch.vstack([train_x, _test_x]) @@ -294,14 +302,14 @@ def _fit(self, train_x: torch.Tensor, train_y: torch.Tensor): # pylint: disable self.surrogate_model.train_y = train_y def _predict(self, test_x: torch.Tensor) -> torch.Tensor: - assert self.surrogate_model.train_x is not None and self.surrogate_model.train_y is not None, "Model not trained yet!" + assert ( + self.surrogate_model.train_x is not None + and self.surrogate_model.train_y is not None + ), "Model not trained yet!" if self.surrogate_model_name == "ftpfn": mean = self.surrogate_model.get_mean_performance(test_x) if mean.is_cuda: mean = mean.cpu() return mean - else: - # check neps/optimizers/bayesian_optimization/models/__init__.py for options - raise ValueError( - f"Surrogate model {self.surrogate_model_name} not supported!" 
- ) + # check neps/optimizers/bayesian_optimization/models/__init__.py for options + raise ValueError(f"Surrogate model {self.surrogate_model_name} not supported!") diff --git a/neps/optimizers/multi_fidelity/promotion_policy.py b/neps/optimizers/multi_fidelity/promotion_policy.py index 102b7f82..8f6847ff 100644 --- a/neps/optimizers/multi_fidelity/promotion_policy.py +++ b/neps/optimizers/multi_fidelity/promotion_policy.py @@ -1,10 +1,12 @@ +from __future__ import annotations + from abc import ABC, abstractmethod import numpy as np class PromotionPolicy(ABC): - """Base class for implementing a sampling straregy for SH and its subclasses""" + """Base class for implementing a sampling straregy for SH and its subclasses.""" def __init__(self, eta: int): self.rung_members: dict = {} @@ -54,12 +56,12 @@ def set_state( self.config_map = config_map def retrieve_promotions(self) -> dict: - """Returns the top 1/eta configurations per rung if enough configurations seen""" + """Returns the top 1/eta configurations per rung if enough configurations seen.""" assert self.config_map is not None - self.rung_promotions = {rung: [] for rung in self.config_map.keys()} + self.rung_promotions = {rung: [] for rung in self.config_map} total_rung_evals = 0 - for rung in reversed(sorted(self.config_map.keys())): + for rung in sorted(self.config_map.keys(), reverse=True): total_rung_evals += len(self.rung_members[rung]) if ( total_rung_evals >= self.config_map[rung] @@ -93,7 +95,7 @@ def __init__(self, eta, **kwargs): super().__init__(eta, **kwargs) def retrieve_promotions(self) -> dict: - """Returns the top 1/eta configurations per rung if enough configurations seen""" + """Returns the top 1/eta configurations per rung if enough configurations seen.""" for rung in range(self.max_rung + 1): if rung == self.max_rung: # cease promotions for the highest rung (configs at max budget) @@ -102,6 +104,6 @@ def retrieve_promotions(self) -> dict: top_k = len(self.rung_members_performance[rung]) // self.eta _ordered_idx = np.argsort(self.rung_members_performance[rung]) self.rung_promotions[rung] = np.array(self.rung_members[rung])[_ordered_idx][ - :top_k - ].tolist() + :top_k + ].tolist() return self.rung_promotions diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index da564c6b..9208e4c3 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -1,32 +1,38 @@ # mypy: disable-error-code = assignment +from __future__ import annotations + import logging from abc import ABC, abstractmethod -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np import pandas as pd import torch -from neps.utils.common import instance_from_map -from ...search_spaces.search_space import SearchSpace -from ..bayesian_optimization.acquisition_functions import AcquisitionMapping -from ..bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from ..bayesian_optimization.acquisition_functions.prior_weighted import ( +from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping +from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) -from ..bayesian_optimization.acquisition_samplers import AcquisitionSamplerMapping -from ..bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, +from 
neps.optimizers.bayesian_optimization.acquisition_samplers import ( + AcquisitionSamplerMapping, ) -from ..bayesian_optimization.models import SurrogateModelMapping -from ..multi_fidelity_prior.utils import ( +from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping +from neps.optimizers.multi_fidelity_prior.utils import ( compute_config_dist, custom_crossover, local_mutation, update_fidelity, ) +from neps.utils.common import instance_from_map + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace TOLERANCE = 1e-2 # 1% SAMPLE_THRESHOLD = 1000 # num samples to be rejected for increasing hypersphere radius @@ -35,7 +41,7 @@ class SamplingPolicy(ABC): - """Base class for implementing a sampling strategy for SH and its subclasses""" + """Base class for implementing a sampling strategy for SH and its subclasses.""" def __init__(self, pipeline_space: SearchSpace, patience: int = 100, logger=None): self.pipeline_space = pipeline_space @@ -48,7 +54,7 @@ def sample(self, *args, **kwargs) -> SearchSpace: class RandomUniformPolicy(SamplingPolicy): - """A random policy for sampling configuration, i.e. the default for SH / hyperband + """A random policy for sampling configuration, i.e. the default for SH / hyperband. Args: SamplingPolicy ([type]): [description] @@ -80,7 +86,7 @@ def __init__( self.fraction_from_prior = fraction_from_prior def sample(self, *args, **kwargs) -> SearchSpace: - """Samples from the prior with a certain probabiliyu + """Samples from the prior with a certain probabiliyu. Returns: SearchSpace: [description] @@ -88,10 +94,9 @@ def sample(self, *args, **kwargs) -> SearchSpace: user_priors = False if np.random.uniform() < self.fraction_from_prior: user_priors = True - config = self.pipeline_space.sample( + return self.pipeline_space.sample( patience=self.patience, user_priors=user_priors, ignore_fidelity=True ) - return config class EnsemblePolicy(SamplingPolicy): @@ -151,9 +156,13 @@ def sample_neighbour(self, incumbent, distance, tolerance=TOLERANCE): return config def sample( - self, inc: SearchSpace = None, weights: dict[str, float] = None, *args, **kwargs + self, + inc: SearchSpace = None, + weights: dict[str, float] | None = None, + *args, + **kwargs, ) -> SearchSpace: - """Samples from the prior with a certain probability + """Samples from the prior with a certain probability. Returns: SearchSpace: [description] @@ -256,7 +265,7 @@ def sample( class ModelPolicy(SamplingPolicy): - """A policy for sampling configuration, i.e. the default for SH / hyperband + """A policy for sampling configuration, i.e. the default for SH / hyperband. 
Args: SamplingPolicy ([type]): [description] @@ -266,7 +275,7 @@ def __init__( self, pipeline_space: SearchSpace, surrogate_model: str | Any = "gp", - surrogate_model_args: dict = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", @@ -328,7 +337,10 @@ def update_model(self, train_x, train_y, pending_x, decay_t=None): # self.acquisition_sampler.set_state(x=train_x, y=train_y) def sample( - self, active_max_fidelity: int = None, fidelity: int = None, **kwargs + self, + active_max_fidelity: int | None = None, + fidelity: int | None = None, + **kwargs, ) -> SearchSpace: """Performs the equivalent of optimizing the acquisition function. @@ -381,11 +393,10 @@ def sample( # computes the EI for all `samples` eis = self.acquisition.eval(x=samples, asscalar=True) # extracting the highest scored sample - config = samples[np.argmax(eis)] + return samples[np.argmax(eis)] # TODO: can generalize s.t. sampler works for all types, currently, # random sampler in NePS does not do what is required here # return self.acquisition_sampler.sample(self.acquisition) - return config class BaseDynamicModelPolicy(SamplingPolicy): @@ -394,10 +405,10 @@ def __init__( pipeline_space: SearchSpace, observed_configs: Any = None, surrogate_model: str | Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - graph_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + graph_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", use_priors: bool = False, log_prior_weighted: bool = False, @@ -541,10 +552,9 @@ def sample(self, rand_promotion_prob=0.5, seed=777, is_promotion=False, **kwargs if is_promotion and promoted: return config_id - elif is_promotion: + if is_promotion: return None - else: - return config + return config # def sample(self, **kwargs): # return self._sample(is_promotion=False, **kwargs) diff --git a/neps/optimizers/multi_fidelity/successive_halving.py b/neps/optimizers/multi_fidelity/successive_halving.py index 6df62333..b87f6b77 100644 --- a/neps/optimizers/multi_fidelity/successive_halving.py +++ b/neps/optimizers/multi_fidelity/successive_halving.py @@ -4,20 +4,12 @@ import random import typing from copy import deepcopy +from typing import Literal +from typing_extensions import override import numpy as np import pandas as pd -from typing import Literal -from typing_extensions import override -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces import ( - CategoricalParameter, - ConstantParameter, - FloatParameter, - IntegerParameter, - SearchSpace, -) from neps.optimizers.base_optimizer import BaseOptimizer from neps.optimizers.multi_fidelity.promotion_policy import ( AsyncPromotionPolicy, @@ -27,6 +19,16 @@ FixedPriorPolicy, RandomUniformPolicy, ) +from neps.search_spaces import ( + CategoricalParameter, + ConstantParameter, + FloatParameter, + IntegerParameter, + SearchSpace, +) + +if typing.TYPE_CHECKING: + from neps.utils.types import ConfigResult, RawConfig CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) CUSTOM_FLOAT_CONFIDENCE_SCORES.update({"ultra": 0.05}) @@ -43,7 +45,7 @@ class SuccessiveHalvingBase(BaseOptimizer): def __init__( self, pipeline_space: SearchSpace, - budget: int = None, + budget: int | None = None, eta: int = 3, early_stopping_rate: int = 
0, initial_design_type: Literal["max_budget", "unique_configs"] = "max_budget", @@ -54,7 +56,7 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, @@ -142,9 +144,9 @@ def __init__( # crucial data structure used for determining promotion candidates self.observed_configs = pd.DataFrame([], columns=("config", "rung", "perf")) # stores which configs occupy each rung at any time - self.rung_members: dict = dict() # stores config IDs per rung - self.rung_members_performance: dict = dict() # performances recorded per rung - self.rung_promotions: dict = dict() # records a promotable config per rung + self.rung_members: dict = {} # stores config IDs per rung + self.rung_members_performance: dict = {} # performances recorded per rung + self.rung_promotions: dict = {} # records a promotable config per rung self.total_fevals = 0 # setup SH state counter @@ -178,7 +180,7 @@ def get_incumbent_score(self): def _get_rung_map(self, s: int = 0) -> dict: """Maps rungs (0,1,...,k) to a fidelity value based on fidelity bounds, eta, s.""" assert s <= self.stopping_rate_limit - new_min_budget = self.min_budget * (self.eta ** s) + new_min_budget = self.min_budget * (self.eta**s) nrungs = ( np.floor(np.log(self.max_budget / new_min_budget) / np.log(self.eta)).astype( int @@ -186,7 +188,7 @@ def _get_rung_map(self, s: int = 0) -> dict: + 1 ) _max_budget = self.max_budget - rung_map = dict() + rung_map = {} for i in reversed(range(nrungs)): rung_map[i + s] = ( int(_max_budget) @@ -197,9 +199,9 @@ def _get_rung_map(self, s: int = 0) -> dict: return rung_map def _get_config_map(self, s: int = 0) -> dict: - """Maps rungs (0,1,...,k) to the number of configs for each fidelity""" + """Maps rungs (0,1,...,k) to the number of configs for each fidelity.""" assert s <= self.stopping_rate_limit - new_min_budget = self.min_budget * (self.eta ** s) + new_min_budget = self.min_budget * (self.eta**s) nrungs = ( np.floor(np.log(self.max_budget / new_min_budget) / np.log(self.eta)).astype( int @@ -209,8 +211,8 @@ def _get_config_map(self, s: int = 0) -> dict: s_max = self.stopping_rate_limit + 1 _s = self.stopping_rate_limit - s # L2 from Alg 1 in https://arxiv.org/pdf/1603.06560.pdf - _n_config = np.floor(s_max / (_s + 1)) * self.eta ** _s - config_map = dict() + _n_config = np.floor(s_max / (_s + 1)) * self.eta**_s + config_map = {} for i in range(nrungs): config_map[i + s] = int(_n_config) _n_config //= self.eta @@ -253,7 +255,6 @@ def _load_previous_observations( # rung histories are collected only for `previous` and not `pending` configs self.rung_histories[int(_rung)]["config"].append(int(_config)) self.rung_histories[int(_rung)]["perf"].append(perf) - return def _handle_pending_evaluations( self, pending_evaluations: dict[str, ConfigResult] @@ -274,12 +275,11 @@ def _handle_pending_evaluations( else: self.observed_configs.at[int(_config), "rung"] = int(_rung) self.observed_configs.at[int(_config), "perf"] = np.nan - return def clean_rung_information(self): - self.rung_members = {k: [] for k in self.rung_map.keys()} - self.rung_members_performance = {k: [] for k in self.rung_map.keys()} - self.rung_promotions = {k: [] for k in self.rung_map.keys()} + self.rung_members = {k: [] for k in self.rung_map} + self.rung_members_performance = {k: [] 
for k in self.rung_map} + self.rung_promotions = {k: [] for k in self.rung_map} def _get_rungs_state(self, observed_configs=None): """Collects info on configs at a rung and their performance there.""" @@ -303,7 +303,6 @@ def _get_rungs_state(self, observed_configs=None): idxs = observed_configs.rung == _rung self.rung_members[_rung] = observed_configs.index[idxs].values self.rung_members_performance[_rung] = observed_configs.perf[idxs].values - return def _handle_promotions(self): self.promotion_policy.set_state( @@ -357,14 +356,12 @@ def load_optimization_state( # fit any model/surrogates self._fit_models() - return - def is_init_phase(self) -> bool: return True def sample_new_config( self, - rung: int = None, + rung: int | None = None, **kwargs, ): # Samples configuration from policy or random @@ -459,7 +456,7 @@ def _enhance_priors(self, confidence_score=None): for k, v in self.pipeline_space.items(): if v.is_fidelity or isinstance(v, ConstantParameter): continue - elif isinstance(v, (FloatParameter, IntegerParameter)): + if isinstance(v, FloatParameter | IntegerParameter): if confidence_score is None: confidence = CUSTOM_FLOAT_CONFIDENCE_SCORES[self.prior_confidence] else: @@ -478,7 +475,7 @@ def _enhance_priors(self, confidence_score=None): class SuccessiveHalving(SuccessiveHalvingBase): def _calc_budget_used_in_bracket(self, config_history: list[int]): budget = 0 - for rung in self.config_map.keys(): + for rung in self.config_map: count = sum(config_history == rung) # `range(min_rung, rung+1)` counts the black-box cost of promotions since # SH budgets assume each promotion involves evaluation from scratch @@ -598,7 +595,7 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, diff --git a/neps/optimizers/multi_fidelity/utils.py b/neps/optimizers/multi_fidelity/utils.py index f551e73f..0158fbdf 100644 --- a/neps/optimizers/multi_fidelity/utils.py +++ b/neps/optimizers/multi_fidelity/utils.py @@ -1,25 +1,28 @@ # type: ignore -from typing import Any, Sequence +from __future__ import annotations +from collections.abc import Sequence from copy import deepcopy +from typing import TYPE_CHECKING, Any + import numpy as np import pandas as pd -import torch -from neps.search_spaces.search_space import SearchSpace from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + def continuous_to_tabular( config: SearchSpace, categorical_space: SearchSpace ) -> SearchSpace: - """ - Convert the continuous parameters in the config into categorical ones based on - the categorical_space provided + """Convert the continuous parameters in the config into categorical ones based on + the categorical_space provided. 
""" result = config.clone() for hp_name, _ in config.items(): - if hp_name in categorical_space.keys(): + if hp_name in categorical_space: choices = np.array(categorical_space[hp_name].choices) diffs = choices - config[hp_name].value # NOTE: in case of a tie the first value in the choices array will be returned @@ -44,14 +47,13 @@ def get_tokenized_data( configs: list[SearchSpace], ignore_fidelity: bool = True, ) -> np.ndarray: # pd.Series: # tuple[np.ndarray, np.ndarray, np.ndarray]: - """Extracts configurations, indices and performances given a DataFrame + """Extracts configurations, indices and performances given a DataFrame. Tokenizes the given set of observations as required by a PFN surrogate model. """ - configs = np.array( + return np.array( [normalize_vectorize_config(c, ignore_fidelity=ignore_fidelity) for c in configs] ) - return configs def get_freeze_thaw_normalized_step( @@ -98,8 +100,7 @@ def get_training_data_for_freeze_thaw( class MFObservedData: - """ - (Under development) + """(Under development). This module is used to unify the data access across different Multi-Fidelity optimizers. It stores column names and index names. Possible optimizations @@ -176,8 +177,7 @@ def completed_runs_index(self) -> pd.Index | pd.MultiIndex: def next_config_id(self) -> int: if len(self.seen_config_ids): return max(self.seen_config_ids) + 1 - else: - return 0 + return 0 def add_data( self, @@ -185,9 +185,7 @@ def add_data( index: tuple[int, ...] | Sequence[tuple[int, ...]] | Sequence[int] | int, error: bool = False, ): - """ - Add data only if none of the indices are already existing in the DataFrame - """ + """Add data only if none of the indices are already existing in the DataFrame.""" # TODO: If index is only config_id extend it if not isinstance(index, list): index_list = [index] @@ -213,16 +211,11 @@ def update_data( index: tuple[int, ...] | Sequence[tuple[int, ...]] | Sequence[int] | int, error: bool = False, ): - """ - Update data if all the indices already exist in the DataFrame - """ - if not isinstance(index, list): - index_list = [index] - else: - index_list = index + """Update data if all the indices already exist in the DataFrame.""" + index_list = [index] if not isinstance(index, list) else index if self.df.index.isin(index_list).sum() == len(index_list): - column_names, data = zip(*data_dict.items()) - data = list(zip(*data)) + column_names, data = zip(*data_dict.items(), strict=False) + data = list(zip(*data, strict=False)) self.df.loc[index_list, list(column_names)] = data elif error: @@ -243,8 +236,7 @@ def all_configs_list(self) -> list[Any]: return self.df.loc[:, self.config_col].sort_index().values.tolist() def get_incumbents_for_budgets(self, maximize: bool = False): - """ - Returns a series object with the best partial configuration for each budget id + """Returns a series object with the best partial configuration for each budget id. 
Note: this will always map the best lowest ID if two configurations have the same performance at the same fidelity @@ -255,13 +247,14 @@ def get_incumbents_for_budgets(self, maximize: bool = False): else: config_ids = learning_curves.idxmin(axis=0) - indices = list(zip(config_ids.values.tolist(), config_ids.index.to_list())) + indices = list( + zip(config_ids.values.tolist(), config_ids.index.to_list(), strict=False) + ) partial_configs = self.df.loc[indices, self.config_col].to_list() return pd.Series(partial_configs, index=config_ids.index, name=self.config_col) def get_best_performance_for_each_budget(self, maximize: bool = False): - """ - Returns a series object with the best partial configuration for each budget id + """Returns a series object with the best partial configuration for each budget id. Note: this will always map the best lowest ID if two configurations has the same performance at the same fidelity @@ -280,12 +273,10 @@ def get_budget_level_for_best_performance(self, maximize: bool = False) -> int: y_star = self.get_best_seen_performance(maximize=maximize) # uses the minimum of the budget that see the maximum obseved score op = max if maximize else min - z_inc = int(op([_z for _z, _y in perf_per_z.items() if _y == y_star])) - return z_inc + return int(op([_z for _z, _y in perf_per_z.items() if _y == y_star])) def get_best_learning_curve_id(self, maximize: bool = False): - """ - Returns a single configuration id of the best observed performance + """Returns a single configuration id of the best observed performance. Note: this will always return the single best lowest ID if two configurations has the same performance @@ -293,25 +284,20 @@ def get_best_learning_curve_id(self, maximize: bool = False): learning_curves = self.get_learning_curves() if maximize: return learning_curves.max(axis=1).idxmax() - else: - return learning_curves.min(axis=1).idxmin() + return learning_curves.min(axis=1).idxmin() def get_best_seen_performance(self, maximize: bool = False): learning_curves = self.get_learning_curves() if maximize: return learning_curves.max(axis=1).max() - else: - return learning_curves.min(axis=1).min() + return learning_curves.min(axis=1).min() def add_budget_column(self): combined_df = self.df.reset_index(level=1) - combined_df.set_index( - keys=[self.budget_idx], drop=False, append=True, inplace=True - ) - return combined_df + return combined_df.set_index(keys=[self.budget_idx], drop=False, append=True) def reduce_to_max_seen_budgets(self): - self.df.sort_index(inplace=True) + self.df = self.df.sort_index() combined_df = self.add_budget_column() return combined_df.groupby(level=0).last() @@ -344,7 +330,7 @@ def extract_learning_curve( def get_best_performance_per_config(self, maximize: bool = False) -> pd.Series: """Returns the best score recorded per config across fidelities seen.""" op = np.max if maximize else np.min - perf = ( + return ( self.df.sort_values( "budget_id", ascending=False ) # sorts with largest budget first @@ -354,7 +340,6 @@ def get_best_performance_per_config(self, maximize: bool = False) -> pd.Series: op ) # finds the minimum over per-config learning curve ) - return perf def get_max_observed_fidelity_level_per_config(self) -> pd.Series: """Returns the highest fidelity level recorded per config seen.""" diff --git a/neps/optimizers/multi_fidelity_prior/async_priorband.py b/neps/optimizers/multi_fidelity_prior/async_priorband.py index ce2352cf..0a859dec 100644 --- a/neps/optimizers/multi_fidelity_prior/async_priorband.py +++ 
b/neps/optimizers/multi_fidelity_prior/async_priorband.py @@ -1,18 +1,11 @@ -import typing +from __future__ import annotations -import numpy as np +import typing from typing import Literal from typing_extensions import override -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) +import numpy as np + from neps.optimizers.multi_fidelity.mf_bo import MFBOBase from neps.optimizers.multi_fidelity.promotion_policy import AsyncPromotionPolicy from neps.optimizers.multi_fidelity.sampling_policy import EnsemblePolicy, ModelPolicy @@ -21,6 +14,17 @@ ) from neps.optimizers.multi_fidelity_prior.priorband import PriorBandBase +if typing.TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class PriorBandAsha(MFBOBase, PriorBandBase, AsynchronousSuccessiveHalvingWithPriors): """Implements a PriorBand on top of ASHA.""" @@ -50,12 +54,12 @@ def __init__( # arguments for model model_based: bool = False, # crucial argument to set to allow model-search modelling_type: str = "joint", # could also be {"rung"} - initial_design_size: int = None, + initial_design_size: int | None = None, model_policy: typing.Any = ModelPolicy, surrogate_model: str | typing.Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", @@ -95,15 +99,15 @@ def __init__( }, } - bo_args = dict( - surrogate_model=surrogate_model, - domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - surrogate_model_args=surrogate_model_args, - acquisition=acquisition, - log_prior_weighted=log_prior_weighted, - acquisition_sampler=acquisition_sampler, - ) + bo_args = { + "surrogate_model": surrogate_model, + "domain_se_kernel": domain_se_kernel, + "hp_kernels": hp_kernels, + "surrogate_model_args": surrogate_model_args, + "acquisition": acquisition, + "log_prior_weighted": log_prior_weighted, + "acquisition_sampler": acquisition_sampler, + } self.model_based = model_based self.modelling_type = modelling_type self.initial_design_size = initial_design_size @@ -128,10 +132,7 @@ def get_config_and_ids( [type]: [description] """ rung_to_promote = self.is_promotable() - if rung_to_promote is not None: - rung = rung_to_promote + 1 - else: - rung = self.min_rung + rung = rung_to_promote + 1 if rung_to_promote is not None else self.min_rung self.set_sampling_weights_and_inc(rung=rung) # performs standard ASHA but sampling happens as per the EnsemblePolicy return super().get_config_and_ids() @@ -166,43 +167,43 @@ def __init__( # arguments for model model_based: bool = False, # crucial argument to set to allow model-search modelling_type: str = "joint", # could also be {"rung"} - 
initial_design_size: int = None, + initial_design_size: int | None = None, model_policy: typing.Any = ModelPolicy, surrogate_model: str | typing.Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", ): # collecting arguments required by ASHA - args = dict( - pipeline_space=pipeline_space, - budget=budget, - eta=eta, - early_stopping_rate=self.early_stopping_rate, - initial_design_type=initial_design_type, - sampling_policy=sampling_policy, - promotion_policy=promotion_policy, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - prior_confidence=prior_confidence, - random_interleave_prob=random_interleave_prob, - sample_default_first=sample_default_first, - sample_default_at_target=sample_default_at_target, - ) - bo_args = dict( - surrogate_model=surrogate_model, - domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - surrogate_model_args=surrogate_model_args, - acquisition=acquisition, - log_prior_weighted=log_prior_weighted, - acquisition_sampler=acquisition_sampler, - ) + args = { + "pipeline_space": pipeline_space, + "budget": budget, + "eta": eta, + "early_stopping_rate": self.early_stopping_rate, + "initial_design_type": initial_design_type, + "sampling_policy": sampling_policy, + "promotion_policy": promotion_policy, + "loss_value_on_error": loss_value_on_error, + "cost_value_on_error": cost_value_on_error, + "ignore_errors": ignore_errors, + "logger": logger, + "prior_confidence": prior_confidence, + "random_interleave_prob": random_interleave_prob, + "sample_default_first": sample_default_first, + "sample_default_at_target": sample_default_at_target, + } + bo_args = { + "surrogate_model": surrogate_model, + "domain_se_kernel": domain_se_kernel, + "hp_kernels": hp_kernels, + "surrogate_model_args": surrogate_model_args, + "acquisition": acquisition, + "log_prior_weighted": log_prior_weighted, + "acquisition_sampler": acquisition_sampler, + } super().__init__( **args, prior_weight_type=prior_weight_type, @@ -257,7 +258,7 @@ def load_optimization_state( previous_results=previous_results, pending_evaluations=pending_evaluations, budget_info=budget_info, - optimizer_state=optimizer_state + optimizer_state=optimizer_state, ) # important for the global HB to run the right SH self._update_sh_bracket_state() @@ -278,8 +279,7 @@ def _get_bracket_to_run(self): self.eta ** (K - s) * (K + 1) / (K - s + 1) for s in range(self.max_rung + 1) ] bracket_probs = np.array(bracket_probs) / sum(bracket_probs) - bracket_next = np.random.choice(range(self.max_rung + 1), p=bracket_probs) - return bracket_next + return np.random.choice(range(self.max_rung + 1), p=bracket_probs) def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: """...and this is the method that decides which point to query. 
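The constructor hunks in this file convert the keyword collections handed to `super().__init__` from `dict(...)` calls to dict literals; behaviour is unchanged. A tiny self-contained sketch of the pattern (class names and parameters are illustrative):

class Base:
    def __init__(self, eta: int = 3, logger=None):
        self.eta = eta
        self.logger = logger


class Child(Base):
    def __init__(self, eta: int = 3, logger=None):
        # dict literal instead of dict(eta=eta, logger=logger)
        args = {"eta": eta, "logger": logger}
        super().__init__(**args)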
diff --git a/neps/optimizers/multi_fidelity_prior/priorband.py b/neps/optimizers/multi_fidelity_prior/priorband.py index be7b3151..bdf3a567 100644 --- a/neps/optimizers/multi_fidelity_prior/priorband.py +++ b/neps/optimizers/multi_fidelity_prior/priorband.py @@ -1,16 +1,10 @@ +from __future__ import annotations + import typing from typing import Literal import numpy as np -from neps.utils.types import RawConfig -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.multi_fidelity.hyperband import HyperbandCustomDefault from neps.optimizers.multi_fidelity.mf_bo import MFBOBase from neps.optimizers.multi_fidelity.promotion_policy import SyncPromotionPolicy @@ -22,6 +16,16 @@ get_prior_weight_for_decay, ) +if typing.TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace + from neps.utils.types import RawConfig + class PriorBandBase: """Class that defines essential properties needed by PriorBand. @@ -35,16 +39,14 @@ def find_all_distances_from_incumbent(self, incumbent): # computing distance of incumbent from all seen points in history distances = [dist(config) for config in self.observed_configs.config] # ensuring the distances exclude 0 or the distance from itself - distances = [d for d in distances if d > 0] - return distances + return [d for d in distances if d > 0] def find_1nn_distance_from_incumbent(self, incumbent): """Finds the distance to the nearest neighbour.""" distances = self.find_all_distances_from_incumbent(incumbent) - distance = min(distances) - return distance + return min(distances) - def find_incumbent(self, rung: int = None) -> SearchSpace: + def find_incumbent(self, rung: int | None = None) -> SearchSpace: """Find the best performing configuration seen so far.""" rungs = self.observed_configs.rung.values idxs = self.observed_configs.index.values @@ -120,7 +122,7 @@ def is_activate_inc(self) -> bool: continuation_resources = bracket.rung_map[bracket.min_rung] resources = bracket.config_map[bracket.min_rung] * continuation_resources for r in range(1, len(bracket.rung_map)): - rung = sorted(list(bracket.rung_map.keys()), reverse=False)[r] + rung = sorted(bracket.rung_map.keys(), reverse=False)[r] continuation_resources = bracket.rung_map[rung] - bracket.rung_map[rung - 1] resources += bracket.config_map[rung] * continuation_resources @@ -144,7 +146,7 @@ def calc_sampling_args(self, rung) -> dict: # scales weight of prior by eta raised to the current rung level # at the base rung thus w_prior = w_random # at the max rung r, w_prior = eta^r * w_random - _w_prior = (self.eta ** rung) * _w_random + _w_prior = (self.eta**rung) * _w_random elif self.prior_weight_type == "linear": _w_random = 1 w_prior_min_rung = 1 * _w_random @@ -174,12 +176,11 @@ def calc_sampling_args(self, rung) -> dict: w_inc = _w_inc * w_prior w_prior = _w_prior * w_prior - sampling_args = { + return { "prior": w_prior, "inc": w_inc, "random": w_random, } - return sampling_args def prior_to_incumbent_ratio(self) -> float | float: """Calculates the normalized weight distribution between prior and incumbent. 
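For the eta**rung prior weighting in `calc_sampling_args` above, the prior weight equals the random weight at the base rung and grows by a factor of eta per rung. Ignoring the prior/incumbent split the full method applies afterwards, a quick numeric check with an illustrative eta = 3:

eta = 3  # illustrative
for rung in range(4):
    w_random = 1
    w_prior = (eta**rung) * w_random
    total = w_prior + w_random
    print(rung, round(w_prior / total, 3), round(w_random / total, 3))
# 0 0.5 0.5
# 1 0.75 0.25
# 2 0.9 0.1
# 3 0.964 0.036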
@@ -188,15 +189,14 @@ def prior_to_incumbent_ratio(self) -> float | float: """ if self.inc_style == "constant": return self._prior_to_incumbent_ratio_constant() - elif self.inc_style == "decay": + if self.inc_style == "decay": resources = calc_total_resources_spent(self.observed_configs, self.rung_map) return self._prior_to_incumbent_ratio_decay( resources, self.eta, self.min_budget, self.max_budget ) - elif self.inc_style == "dynamic": + if self.inc_style == "dynamic": return self._prior_to_incumbent_ratio_dynamic(self.max_rung) - else: - raise ValueError(f"Invalid option {self.inc_style}") + raise ValueError(f"Invalid option {self.inc_style}") def _prior_to_incumbent_ratio_decay( self, resources: float, eta: int, min_budget, max_budget @@ -256,17 +256,14 @@ def _prior_to_incumbent_ratio_dynamic(self, rung: int) -> float | float: # normalizing scores to be weighted ratios w_prior = prior_score / sum(weighted_top_config_scores) w_inc = inc_score / sum(weighted_top_config_scores) + elif rung == self.min_rung: + # setting `w_inc = eta * w_prior` as default till score calculation begins + w_prior = self.eta / (1 + self.eta) + w_inc = 1 / (1 + self.eta) else: - # if eta-configurations NOT recorded yet - # check if it is the base rung - if rung == self.min_rung: - # setting `w_inc = eta * w_prior` as default till score calculation begins - w_prior = self.eta / (1 + self.eta) - w_inc = 1 / (1 + self.eta) - else: - # if rung > min.rung then the lower rung could already have enough - # configurations and thus can be recursively queried till the base rung - return self._prior_to_incumbent_ratio_dynamic(rung - 1) + # if rung > min.rung then the lower rung could already have enough + # configurations and thus can be recursively queried till the base rung + return self._prior_to_incumbent_ratio_dynamic(rung - 1) return w_prior, w_inc @@ -296,12 +293,12 @@ def __init__( # arguments for model model_based: bool = False, # crucial argument to set to allow model-search modelling_type: str = "joint", # could also be {"rung"} - initial_design_size: int = None, + initial_design_size: int | None = None, model_policy: typing.Any = ModelPolicy, surrogate_model: str | typing.Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", @@ -340,15 +337,15 @@ def __init__( }, } - bo_args = dict( - surrogate_model=surrogate_model, - domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - surrogate_model_args=surrogate_model_args, - acquisition=acquisition, - log_prior_weighted=log_prior_weighted, - acquisition_sampler=acquisition_sampler, - ) + bo_args = { + "surrogate_model": surrogate_model, + "domain_se_kernel": domain_se_kernel, + "hp_kernels": hp_kernels, + "surrogate_model_args": surrogate_model_args, + "acquisition": acquisition, + "log_prior_weighted": log_prior_weighted, + "acquisition_sampler": acquisition_sampler, + } self.model_based = model_based self.modelling_type = modelling_type self.initial_design_size = initial_design_size diff --git a/neps/optimizers/multi_fidelity_prior/utils.py b/neps/optimizers/multi_fidelity_prior/utils.py index 9f4c1a47..a1c5c6dd 100644 --- a/neps/optimizers/multi_fidelity_prior/utils.py +++ b/neps/optimizers/multi_fidelity_prior/utils.py @@ -1,16 +1,22 @@ +from __future__ 
import annotations + +from typing import TYPE_CHECKING + import numpy as np -import pandas as pd import scipy from neps.search_spaces import ( CategoricalParameter, ConstantParameter, + GraphParameter, NumericalParameter, Parameter, - GraphParameter, SearchSpace, ) +if TYPE_CHECKING: + import pandas as pd + def update_fidelity(config, fidelity): config.fidelity.set_value(fidelity) @@ -32,7 +38,6 @@ def local_mutation( new_config: dict[str, Parameter] = {} for hp_name, hp in config.items(): - if hp.is_fidelity or np.random.uniform() > mutation_rate: new_config[hp_name] = hp.clone() @@ -79,7 +84,6 @@ def custom_crossover( getting config2's value of the corresponding HP. By default, crossover rate is 50%. """ for _ in range(patience): - child_config = config1.clone() for key, hyperparameter in config1.items(): if not hyperparameter.is_fidelity and np.random.random() < crossover_prob: @@ -121,8 +125,7 @@ def compute_config_dist(config1: SearchSpace, config2: SearchSpace) -> float: config1["categorical"] + [0], config2["categorical"] + [0] ) - distance = d_cont + d_cat - return distance + return d_cont + d_cat def compute_scores( @@ -153,8 +156,7 @@ def calc_total_resources_spent(observed_configs: pd.DataFrame, rung_map: dict) - for i in range(len(observed_configs)) if not np.isnan(observed_configs.at[i, "perf"]) ] - total_resources = sum(rung_map[r] for r in rungs_used) - return total_resources + return sum(rung_map[r] for r in rungs_used) # def get_prior_weight_for_decay( @@ -176,7 +178,7 @@ def calc_total_resources_spent(observed_configs: pd.DataFrame, rung_map: dict) - def get_prior_weight_for_decay( resources_used: float, eta: int, min_budget, max_budget ) -> float: - """Creates a step function schedule for the prior weight decay. + r"""Creates a step function schedule for the prior weight decay. The prior weight ratio is decayed every time the total resources used is equivalent to the cost of one successive halving bracket within the HB schedule. 
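(A quick numeric check of the step-decay schedule described in the docstring above; a standalone sketch with assumed values, mirroring the function body shown in the next hunk.)

    eta, max_budget = 3, 9                   # assumed example values
    decay = 2
    unit_resources = eta * max_budget        # 27: per the docstring, roughly one SH bracket of resources
    resources_used = 60
    idx = resources_used // unit_resources   # 2 -> two full brackets' worth of resources spent
    weight = 1 / decay**idx                  # 0.25: the prior weight halves per bracket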
@@ -186,5 +188,4 @@ def get_prior_weight_for_decay( decay = 2 unit_resources = eta * max_budget idx = resources_used // unit_resources - weight = 1 / decay**idx - return weight + return 1 / decay**idx diff --git a/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py b/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py index d14657bf..2263e16d 100644 --- a/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py +++ b/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py @@ -1,11 +1,15 @@ +from __future__ import annotations + import logging -from typing import Any, override +from typing import TYPE_CHECKING, Any, override -from neps.state.optimizer import BudgetInfo, OptimizationState -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace -from neps.utils.data_loading import read_tasks_and_dev_stages_from_disk from neps.optimizers.base_optimizer import BaseOptimizer +from neps.utils.data_loading import read_tasks_and_dev_stages_from_disk + +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig # TODO: Test if anything breaks after the recent changes diff --git a/neps/optimizers/random_search/optimizer.py b/neps/optimizers/random_search/optimizer.py index abe16866..e2da6f40 100644 --- a/neps/optimizers/random_search/optimizer.py +++ b/neps/optimizers/random_search/optimizer.py @@ -1,11 +1,15 @@ -from typing import Any +from __future__ import annotations + +from typing import TYPE_CHECKING, Any from typing_extensions import override -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class RandomSearch(BaseOptimizer): def __init__(self, use_priors=False, ignore_fidelity=True, **optimizer_kwargs): diff --git a/neps/optimizers/regularized_evolution/optimizer.py b/neps/optimizers/regularized_evolution/optimizer.py index d112be31..215e95ce 100644 --- a/neps/optimizers/regularized_evolution/optimizer.py +++ b/neps/optimizers/regularized_evolution/optimizer.py @@ -1,19 +1,23 @@ +from __future__ import annotations + import math import os import random +from collections.abc import Callable from pathlib import Path -from typing import Any, Callable +from typing import TYPE_CHECKING, Any from typing_extensions import override import numpy as np import yaml -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig - -from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class RegularizedEvolution(BaseOptimizer): def __init__( @@ -63,18 +67,19 @@ def load_optimization_state( train_x = [el.config for el in previous_results.values()] train_y = [self.get_loss(el.result) for el in previous_results.values()] self.num_train_x = len(train_x) - self.population = [ - (x, y) - for x, y in zip( - train_x[-self.population_size:], train_y[-self.population_size:] + self.population = 
list( + zip( + train_x[-self.population_size :], + train_y[-self.population_size :], + strict=False, ) - ] - self.pending_evaluations = [el for el in pending_evaluations.values()] + ) + self.pending_evaluations = list(pending_evaluations.values()) def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: if len(self.population) < self.population_size: if self.assisted: - if 0 == len(os.listdir(self.assisted_init_population_dir)): + if len(os.listdir(self.assisted_init_population_dir)) == 0: cur_population_size = self.population_size - len(self.population) configs = [ self.pipeline_space.sample( @@ -83,13 +88,12 @@ def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: for _ in range(cur_population_size * 2) ] if self.assisted_zero_cost_proxy is not None: - zero_cost_proxy_values = self.assisted_zero_cost_proxy( - x=configs) # type: ignore[misc] + zero_cost_proxy_values = self.assisted_zero_cost_proxy(x=configs) # type: ignore[misc] else: raise Exception("Zero cost proxy function is not defined!") indices = np.argsort(zero_cost_proxy_values)[-cur_population_size:][ - ::-1 - ] + ::-1 + ] for idx, config_idx in enumerate(indices): filename = str(idx).zfill( int(math.log10(cur_population_size)) + 1 diff --git a/neps/optimizers/utils.py b/neps/optimizers/utils.py index e9d29222..8a9a030f 100644 --- a/neps/optimizers/utils.py +++ b/neps/optimizers/utils.py @@ -1,19 +1,24 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import pandas as pd -from neps.search_spaces.search_space import SearchSpace - +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + def map_real_hyperparameters_from_tabular_ids( x: pd.Series, pipeline_space: SearchSpace ) -> pd.Series: - """ Maps the tabular IDs to the actual HPs from the pipeline space. - + """Maps the tabular IDs to the actual HPs from the pipeline space. + Args: x (pd.Series): A pandas series with the tabular IDs. TODO: Mention expected format of the series. pipeline_space (SearchSpace): The pipeline space. - Returns: + Returns: pd.Series: A pandas series with the actual HPs. TODO: Mention expected format of the series. """ @@ -21,8 +26,11 @@ def map_real_hyperparameters_from_tabular_ids( return x # copying hyperparameter configs based on IDs _x = pd.Series( - [pipeline_space.custom_grid_table[x.loc[idx]["id"].value] for idx in x.index.values], - index=x.index + [ + pipeline_space.custom_grid_table[x.loc[idx]["id"].value] + for idx in x.index.values + ], + index=x.index, ) # setting the passed fidelities for the corresponding IDs for idx in _x.index.values: diff --git a/neps/plot/tensorboard_eval.py b/neps/plot/tensorboard_eval.py index 2211537d..66512ee9 100644 --- a/neps/plot/tensorboard_eval.py +++ b/neps/plot/tensorboard_eval.py @@ -335,10 +335,7 @@ def _write_image_config( if tblogger.current_epoch >= 0 and tblogger.current_epoch % counter == 0: # Log every multiple of "counter" - if num_images > len(image): - # If the number of images requested by the user - # is more than the ones available. 
- num_images = len(image) + num_images = min(num_images, len(image)) if random_images is False: subset_images = image[:num_images] diff --git a/neps/sampling/distributions.py b/neps/sampling/distributions.py index 6b557e5a..f865d173 100644 --- a/neps/sampling/distributions.py +++ b/neps/sampling/distributions.py @@ -3,9 +3,10 @@ from __future__ import annotations import math +from collections.abc import Mapping from dataclasses import dataclass from numbers import Number -from typing import TYPE_CHECKING, ClassVar, Mapping +from typing import TYPE_CHECKING, ClassVar from typing_extensions import override import torch diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index f2373a68..83c40e68 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -9,8 +9,9 @@ from __future__ import annotations +from collections.abc import Container, Iterable, Mapping, Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Container, Iterable, Mapping, Protocol, Sequence +from typing import TYPE_CHECKING, Any, Protocol from typing_extensions import override import torch @@ -278,7 +279,9 @@ def __post_init__(self): self._meaningful_dists = [] return - self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip(*rest) + self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip( + *rest, strict=False + ) @property @override @@ -311,7 +314,7 @@ def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> torch.Tens # Calculate the log probabilities of the sample domain tensors under their # respective distributions. - itr = iter(zip(self._meaningful_ixs, self._meaningful_dists)) + itr = iter(zip(self._meaningful_ixs, self._meaningful_dists, strict=False)) first_i, first_dist = next(itr) log_probs = first_dist.log_prob(translated_x[..., first_i]) @@ -416,7 +419,7 @@ def ncols(self) -> int: def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: # OPTIM: Avoid an initial allocation by using the output of the first # distribution to store the weighted probabilities - itr = zip(self.probabilities, self.priors) + itr = zip(self.probabilities, self.priors, strict=False) first_prob, first_prior = next(itr) weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index c7456155..43758094 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -6,9 +6,10 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass, field from functools import reduce -from typing import Protocol, Sequence +from typing import Protocol from typing_extensions import override import torch diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 7342a136..c1a10196 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -44,8 +44,9 @@ from __future__ import annotations import math +from collections.abc import Iterable from dataclasses import dataclass, field -from typing import Generic, Iterable, TypeVar +from typing import Generic, TypeVar import torch from torch import Tensor @@ -351,7 +352,7 @@ def translate( ) out = torch.empty_like(x) - for i, (f, t) in enumerate(zip(frm, to)): + for i, (f, t) in enumerate(zip(frm, to, strict=False)): out[..., i] = t.cast(x[..., i], frm=f) return out diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 71555fef..5f68aff9 100644 --- a/neps/search_spaces/encoding.py +++ 
b/neps/search_spaces/encoding.py @@ -1,12 +1,11 @@ from __future__ import annotations +from collections.abc import Mapping, Sequence from dataclasses import dataclass, field from typing import ( TYPE_CHECKING, Any, Generic, - Mapping, - Sequence, TypeAlias, TypeVar, ) @@ -264,14 +263,17 @@ def unpack(self, x: torch.Tensor) -> list[dict[str, Any]]: values[hp_name] = transformer.decode(tensor) keys = list(values.keys()) - return [dict(zip(keys, vals)) for vals in zip(*values.values())] + return [ + dict(zip(keys, vals, strict=False)) + for vals in zip(*values.values(), strict=False) + ] @classmethod def default(cls, parameters: Mapping[str, Parameter]) -> TensorEncoder: sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} for name, hp in sorted_params: - if isinstance(hp, (FloatParameter, IntegerParameter)): + if isinstance(hp, FloatParameter | IntegerParameter): transformers[name] = MinMaxNormalizer(hp.domain) else: assert isinstance(hp, CategoricalParameter) diff --git a/neps/state/__init__.py b/neps/state/__init__.py index 7a85c7d4..e870d656 100644 --- a/neps/state/__init__.py +++ b/neps/state/__init__.py @@ -1,3 +1,4 @@ +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo from neps.state.protocols import ( Locker, ReaderWriter, @@ -5,7 +6,6 @@ VersionedResource, Versioner, ) -from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo from neps.state.seed_snapshot import SeedSnapshot from neps.state.trial import Trial From 54b605929d48ff059626c4fe3bfc9f3f03c4a1fd Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 13:09:45 +0200 Subject: [PATCH 37/63] ci: Update deps on botorch/gpytorch --- pyproject.toml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b670189..3e304c55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ pandas = "^2" networkx = "^2.6.3" nltk = "^3.6.4" scipy = ">=1.13.1" -torch = ">1.7.0,!=2.0.1, !=2.1.0" +torch = ">=2.0.1" matplotlib = "^3" more-itertools = "*" portalocker = "^2" @@ -64,6 +64,8 @@ tensorboard = "^2" typing-extensions = "*" torchvision = ">=0.8.0" ifbo = ">=0.3.10" +botorch = ">=0.12" +gpytorch = "1.13.0" [tool.poetry.group.dev.dependencies] ruff = "*" @@ -80,13 +82,6 @@ mkdocs-literate-nav = "*" mike = "*" black = "*" # This allows mkdocstrings to format signatures in the docs - -[tool.poetry.group.experimental] -optional = true - -[tool.poetry.group.experimental.dependencies] -gpytorch = "1.8.0" - [build-system] requires = ["poetry-core>=1.1.0"] build-backend = "poetry.core.masonry.api" From 6782a8ec50ba3ac1b3479146f9ffd28d618e4142 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 14:46:22 +0200 Subject: [PATCH 38/63] fix: rely on botorch's new defaults https://github.com/pytorch/botorch/discussions/2451 --- .../kernels/get_kernels.py | 40 ----- .../bayesian_optimization/models/gp.py | 163 +++--------------- .../bayesian_optimization/optimizer.py | 11 +- 3 files changed, 27 insertions(+), 187 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/get_kernels.py diff --git a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py deleted file mode 100644 index 36add92e..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py +++ /dev/null @@ -1,40 +0,0 @@ -from __future__ import annotations - -from neps.search_spaces.architecture.core_graph_grammar 
import CoreGraphGrammar -from neps.search_spaces.hyperparameters.categorical import CategoricalParameter -from neps.search_spaces.hyperparameters.float import FloatParameter -from neps.search_spaces.hyperparameters.integer import IntegerParameter -from neps.utils.common import has_instance, instance_from_map - -from . import GraphKernelMapping, StationaryKernelMapping - - -def get_kernels( - pipeline_space, domain_se_kernel, graph_kernels, hp_kernels, optimal_assignment -): - if not graph_kernels: - graph_kernels = [] - if has_instance(pipeline_space.values(), CoreGraphGrammar): - graph_kernels.append("wl") - if not hp_kernels: - hp_kernels = [] - if has_instance(pipeline_space.values(), FloatParameter, IntegerParameter): - hp_kernels.append("m52") - if has_instance(pipeline_space.values(), CategoricalParameter): - hp_kernels.append("hm") - graph_kernels = [ - instance_from_map(GraphKernelMapping, kernel, "kernel", as_class=True)( - oa=optimal_assignment, - se_kernel=instance_from_map( - StationaryKernelMapping, domain_se_kernel, "se kernel" - ), - ) - for kernel in graph_kernels - ] - hp_kernels = [ - instance_from_map(StationaryKernelMapping, kernel, "kernel") - for kernel in hp_kernels - ] - if not graph_kernels and not hp_kernels: - raise ValueError("No kernels are provided!") - return graph_kernels, hp_kernels diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 2ab0b897..0281cd1a 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import math from collections.abc import Mapping from functools import reduce from typing import TYPE_CHECKING, Any, TypeVar @@ -12,14 +11,13 @@ import gpytorch.constraints import torch from botorch.acquisition.analytic import SingleTaskGP -from botorch.models.gp_regression_mixed import ( - CategoricalKernel, - Likelihood, - OutcomeTransform, +from botorch.models.gp_regression import ( + get_covar_module_with_dim_scaled_prior, ) +from botorch.models.gp_regression_mixed import CategoricalKernel, OutcomeTransform from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed -from gpytorch.kernels import MaternKernel, ScaleKernel +from gpytorch.kernels import ScaleKernel from torch._dynamo.utils import product from neps.search_spaces.encoding import ( @@ -37,111 +35,6 @@ T = TypeVar("T") -def likelihood_with_prior_on_log_scale( - mean: float = 1e-2, - std: float = math.sqrt(3), - bounds: tuple[float, float] = (1e-6, 1), -) -> gpytorch.likelihoods.GaussianLikelihood: - """Default Gaussian likelihood with priors for the noise.""" - # The effect of the likelihood of noise is pretty crucial w.r.t. - # whether we are going to overfit every point by overfitting with - # the lengthscale, or whether we smooth through and assume variation - # is due to noise. Setting it's prior is hard. For a non-noisy - # function, we'd want it looooowww, like 1e-8 kind of low. For - # even a 0.01% noise, we need that all the way up to 1e-2. Hence - # - # If we had 10% noise and we allow the noise to easily optimize towards - # 1e-8, then the lengthscales are forced to become very small, essentially - # overfitting. If we have 0% noise and we don't allow it to easily get low - # then we will drastically underfit. 
- # A guiding principle here is that we should allow the noise to be just - # as if not slightly easier to tune than the lengthscales. I.e. we prefer - # smoother functions as it is easier to acquisition over. However once we - # over smooth and underfit, any new observations that inform us otherwise - # could just be attributed to noise. - # - # TOOD: We may want to move the likelihood inside the GP and decay the - # amount the GP can attribute to noise (reduce std and mean) relative - # to samples seen, effectively reducing the smoothness of the GP overtime - _noise_prior = gpytorch.priors.LogNormalPrior(math.log(mean) + std**2, std) - return gpytorch.likelihoods.GaussianLikelihood( - noise_prior=_noise_prior, - # Going below 1e-6 could introduuce a lot of numerical instability in the - # kernels, even if it's a noiseless function - noise_constraint=gpytorch.constraints.Interval( - lower_bound=bounds[0], - upper_bound=bounds[1], - initial_value=mean, - ), - ) - - -def default_signal_variance_prior() -> gpytorch.priors.NormalPrior: - """Default prior for the signal variance.""" - # The outputscale prior is a bit more tricky. Essentially - # it describes how much we expect the function to move - # around the mean (0 as we normalize the `ys`) - # Based on `Vanilla GP work great in High Dimensions` by Carl Hvafner - # where it's fixed to `1.0`, we follow suit but allow some minor deviation - # with a prior. - return gpytorch.priors.NormalPrior(loc=1.0, scale=0.1) - - -def default_lengthscale_prior( - N: int, -) -> tuple[gpytorch.priors.LogNormalPrior, gpytorch.constraints.Interval]: - """Default prior for the lengthscale.""" - # Based on `Vanilla GP work great in High Dimensions` by Carl Hvafner - # TODO: I'm not convinced entirely that the `std` is independant - # of the dimension and number of samples - lengthscale_prior = gpytorch.priors.LogNormalPrior( - loc=math.sqrt(2.0) + math.log(N) / 2, - scale=math.sqrt(3.0) * math.log(N), - ) - # NOTE: It's possible to just specify `GreaterThan`, however - # digging through the code, if this ends up at botorch's optimize, - # it will read this and take the bounds and give it to Scipy's - # L-BFGS-B optimizer. Without an upper bound, it defaults to `inf`, - # which can impact gradient estimates. - # tldr; set a bound if you have one, it always helps - lengthscale_constraint = gpytorch.constraints.Interval( - lower_bound=1e-4, - upper_bound=1e3, - initial_value=math.sqrt(2.0) + math.log(N) / 2, - ) - return lengthscale_prior, lengthscale_constraint - - -def default_mean() -> gpytorch.means.ConstantMean: - """Default mean for the GP.""" - return gpytorch.means.ConstantMean( - constant_prior=gpytorch.priors.NormalPrior(0, 0.2), - constant_constraint=gpytorch.constraints.Interval( - lower_bound=-1e6, - upper_bound=1e6, - initial_value=0.0, - ), - ) - - -def default_matern_kernel( - N: int, - active_dims: tuple[int, ...] | None = None, -) -> ScaleKernel: - """Default Matern kernel for the GP.""" - lengthscale_prior, lengthscale_constraint = default_lengthscale_prior(N) - - return ScaleKernel( - MaternKernel( - nu=2.5, - ard_num_dims=N, - active_dims=active_dims, - lengthscale_prior=lengthscale_prior, - lengthscale_constraint=lengthscale_constraint, - ), - ) - - def default_categorical_kernel( N: int, active_dims: tuple[int, ...] 
| None = None, @@ -161,14 +54,14 @@ def default_single_obj_gp( x: TensorPack, y: torch.Tensor, *, - outcome_transform: OutcomeTransform | None = None, -) -> tuple[SingleTaskGP, Likelihood]: + y_transform: OutcomeTransform | None = None, +) -> SingleTaskGP: """Default GP for single objective optimization.""" if y.ndim == 1: y = y.unsqueeze(-1) - if outcome_transform is None: - outcome_transform = Standardize(m=1) + if y_transform is None: + y_transform = Standardize(m=1) encoder = x.encoder numerics: list[int] = [] @@ -179,38 +72,24 @@ def default_single_obj_gp( else: numerics.append(encoder.index_of[hp_name]) - # TODO: If we have a low cardinality integer, we should consider - # just treating it as a categorical... - likelihood = likelihood_with_prior_on_log_scale() - # Purely vectorial if len(categoricals) == 0: - gp = SingleTaskGP( - train_X=x.tensor, - train_Y=y, - mean_module=default_mean(), - likelihood=likelihood, - # Only matern kernel - covar_module=default_matern_kernel(len(numerics)), - outcome_transform=outcome_transform, - ) - return gp, likelihood + return SingleTaskGP(train_X=x.tensor, train_Y=y, outcome_transform=y_transform) # Purely categorical if len(numerics) == 0: - gp = SingleTaskGP( + return SingleTaskGP( train_X=x.tensor, train_Y=y, - mean_module=default_mean(), - likelihood=likelihood, - # Only categorical kernel covar_module=default_categorical_kernel(len(categoricals)), - outcome_transform=outcome_transform, + outcome_transform=y_transform, ) - return gp, likelihood # Mixed - numeric_kernel = default_matern_kernel(len(numerics), active_dims=tuple(numerics)) + numeric_kernel = get_covar_module_with_dim_scaled_prior( + ard_num_dims=len(numerics), + active_dims=tuple(numerics), + ) cat_kernel = default_categorical_kernel( len(categoricals), active_dims=tuple(categoricals) ) @@ -223,19 +102,17 @@ def default_single_obj_gp( # # In a toy example with a single binary categorical which acted like F * {0, 1}, # the model collapsed to always predicting `0`. Causing all parameters defining F - # to essentially be guess at random. This is a lot more stable while testing... - # TODO: Figure out why... + # to essentially be guess at random. This is a lot more stable but likely not as + # good... + # TODO: Figure out how to improve stability of this. kernel = numeric_kernel + cat_kernel - gp = SingleTaskGP( + return SingleTaskGP( train_X=x.tensor, train_Y=y, - mean_module=default_mean(), - likelihood=likelihood, covar_module=kernel, - outcome_transform=outcome_transform, + outcome_transform=y_transform, ) - return gp, likelihood def optimize_acq( diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 6fe20655..d9bd10e3 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -312,13 +312,15 @@ def ask( y = _missing_y_strategy(y) # Now fit our model - y_model, y_likelihood = default_single_obj_gp( + y_model = default_single_obj_gp( x, y, # TODO: We should consider applying some heurisitc to see if this should # also include a log transform, similar as we do to cost if using `use_cost`. 
- outcome_transform=Standardize(m=1), + y_transform=Standardize(m=1), ) + y_likelihood = y_model.likelihood + fit_gpytorch_mll( ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) ) @@ -368,16 +370,17 @@ def ask( cost = torch.tensor(costs, dtype=torch.float64, device=self.device) cost_z_score = _missing_cost_strategy(cost) - cost_model, cost_likelihood = default_single_obj_gp( + cost_model = default_single_obj_gp( x, cost_z_score, - outcome_transform=ChainedOutcomeTransform( + y_transform=ChainedOutcomeTransform( # TODO: Maybe some way for a user to specify their cost # is on a log scale? log=Log(), standardize=Standardize(m=1), ), ) + cost_likelihood = cost_model.likelihood # Optimize the cost model fit_gpytorch_mll( From 4e8e30824b059a930380c740ca64752c37b6cacb Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 14:58:58 +0200 Subject: [PATCH 39/63] merge --- neps/__init__.py | 2 - neps/search_spaces/__init__.py | 2 - neps/search_spaces/architecture/api.py | 33 +- neps/search_spaces/architecture/cfg.py | 115 ++- .../cfg_variants/cfg_resolution.py | 385 --------- .../cfg_variants/constrained_cfg.py | 78 +- .../architecture/core_graph_grammar.py | 705 +++------------- neps/search_spaces/architecture/crossover.py | 135 +-- neps/search_spaces/architecture/graph.py | 470 +++-------- .../architecture/graph_grammar.py | 793 ++---------------- neps/search_spaces/architecture/mutations.py | 47 +- neps/search_spaces/architecture/primitives.py | 332 ++++---- neps/search_spaces/architecture/topologies.py | 81 +- ...rs_for_architecture_and_hyperparameters.py | 6 +- .../experimental/hierarchical_architecture.py | 8 +- ...erarchical_architecture_hierarchical_GP.py | 6 +- pyproject.toml | 245 +++--- 17 files changed, 788 insertions(+), 2655 deletions(-) delete mode 100644 neps/search_spaces/architecture/cfg_variants/cfg_resolution.py diff --git a/neps/__init__.py b/neps/__init__.py index b2276ca3..c27257ef 100644 --- a/neps/__init__.py +++ b/neps/__init__.py @@ -8,8 +8,6 @@ FloatParameter, FunctionParameter, GraphGrammar, - GraphGrammarCell, - GraphGrammarRepetitive, IntegerParameter, ) from neps.status.status import get_summary_dict, status diff --git a/neps/search_spaces/__init__.py b/neps/search_spaces/__init__.py index 8a289100..0cfbd9bc 100644 --- a/neps/search_spaces/__init__.py +++ b/neps/search_spaces/__init__.py @@ -2,8 +2,6 @@ from neps.search_spaces.architecture.graph_grammar import ( CoreGraphGrammar, GraphGrammar, - GraphGrammarCell, - GraphGrammarRepetitive, GraphParameter, ) from neps.search_spaces.hyperparameters import ( diff --git a/neps/search_spaces/architecture/api.py b/neps/search_spaces/architecture/api.py index a3af1510..de19a9ef 100644 --- a/neps/search_spaces/architecture/api.py +++ b/neps/search_spaces/architecture/api.py @@ -1,18 +1,20 @@ import inspect -from typing import Callable +from typing import TYPE_CHECKING, Callable import networkx as nx -from torch import nn from .cfg import Grammar from .cfg_variants.constrained_cfg import ConstrainedGrammar -from .graph_grammar import GraphGrammar, GraphGrammarMultipleRepetitive +from .graph_grammar import GraphGrammar + +if TYPE_CHECKING: + from torch import nn def _dict_structure_to_str( - structure: dict, primitives: dict, repetitive_mapping: dict = None + structure: dict, primitives: dict, repetitive_mapping: dict | None = None ) -> str: def _save_replace(string: str, __old: str, __new: str): while string.count(__old) > 0: @@ -25,18 +27,18 @@ def _save_replace(string: str, __old: str, __new: str): 
grammar = grammar.replace("(", " ") grammar = grammar.replace(")", "") grammar = grammar.replace(",", "") - for primitive in primitives.keys(): + for primitive in primitives: grammar = _save_replace(grammar, f" {primitive} ", f' "{primitive}" ') grammar = _save_replace(grammar, f" {primitive}\n", f' "{primitive}"\n') if repetitive_mapping is not None: - for placeholder in repetitive_mapping.keys(): + for placeholder in repetitive_mapping: grammar = _save_replace(grammar, f" {placeholder} ", f' "{placeholder}" ') grammar = _save_replace(grammar, f" {placeholder}\n", f' "{placeholder}"\n') return grammar def _build(graph, set_recursive_attribute): - in_node = [n for n in graph.nodes if graph.in_degree(n) == 0][0] + in_node = next(n for n in graph.nodes if graph.in_degree(n) == 0) for n in nx.topological_sort(graph): for pred in graph.predecessors(n): e = (pred, n) @@ -44,20 +46,17 @@ def _build(graph, set_recursive_attribute): if pred == in_node: predecessor_values = None else: - pred_pred = list(graph.predecessors(pred))[0] + pred_pred = next(iter(graph.predecessors(pred))) predecessor_values = graph.edges[(pred_pred, pred)] graph.edges[e].update(set_recursive_attribute(op_name, predecessor_values)) def ArchitectureParameter(**kwargs): - """Factory function""" - + """Factory function.""" if "structure" not in kwargs: raise ValueError("Factory function requires structure") if not isinstance(kwargs["structure"], list) or len(kwargs["structure"]) == 1: base = GraphGrammar - else: - base = GraphGrammarMultipleRepetitive class _FunctionParameter(base): def __init__( @@ -89,9 +88,9 @@ def __init__( _dict_structure_to_str( st, primitives, - repetitive_mapping=kwargs["terminal_to_sublanguage_map"] - if "terminal_to_sublanguage_map" in kwargs - else None, + repetitive_mapping=kwargs.get( + "terminal_to_sublanguage_map", None + ), ) if isinstance(st, dict) else st @@ -144,9 +143,7 @@ def to_pytorch(self) -> nn.Module: self.prune_graph() if self._set_recursive_attribute: - m = _build( - self, self._set_recursive_attribute - ) + m = _build(self, self._set_recursive_attribute) if m is not None: return m diff --git a/neps/search_spaces/architecture/cfg.py b/neps/search_spaces/architecture/cfg.py index 7e4aa453..958d09f3 100644 --- a/neps/search_spaces/architecture/cfg.py +++ b/neps/search_spaces/architecture/cfg.py @@ -1,25 +1,21 @@ +from __future__ import annotations import itertools import math import sys from collections import defaultdict, deque -from functools import partial -from queue import LifoQueue -from typing import Deque, Tuple, Hashable +from typing import Hashable import numpy as np from nltk import CFG, Production from nltk.grammar import Nonterminal -from scipy.integrate._ivp.radau import P -from torch import Value class Grammar(CFG): - """ - Extended context free grammar (CFG) class from the NLTK python package + """Extended context free grammar (CFG) class from the NLTK python package We have provided functionality to sample from the CFG. We have included generation capability within the class (before it was an external function) - Also allow sampling to return whole trees (not just the string of terminals) + Also allow sampling to return whole trees (not just the string of terminals). """ def __init__(self, *args, **kwargs): @@ -96,7 +92,9 @@ def compute_space_size(self) -> int: int: size of space described by grammar. 
""" - def recursive_worker(nonterminal: Nonterminal, memory_bank: dict = None) -> int: + def recursive_worker( + nonterminal: Nonterminal, memory_bank: dict | None = None + ) -> int: if memory_bank is None: memory_bank = {} @@ -110,7 +108,7 @@ def recursive_worker(nonterminal: Nonterminal, memory_bank: dict = None) -> int: ] possibilities_per_edge = [ memory_bank[str(e_nonterminal)] - if str(e_nonterminal) in memory_bank.keys() + if str(e_nonterminal) in memory_bank else recursive_worker(e_nonterminal, memory_bank) for e_nonterminal in edges_nonterminals ] @@ -165,7 +163,7 @@ def sampler_restricted(self, n, max_length=5, cfactor=0.1, min_length=0): def sampler( self, n=1, - start_symbol: str = None, + start_symbol: str | None = None, user_priors: bool = False, ): # sample n sequences from the CFG @@ -178,24 +176,27 @@ def sampler( # less likely it is to terminate. Therefore, we set the default sampler (setting convergent=True) to # downweight frequent productions when traversing the grammar. # see https://eli.thegreenplace.net/2010/01/28/generating-random-sentences-from-a-context-free-236grammar - if start_symbol is None: - start_symbol = self.start() - else: - start_symbol = Nonterminal(start_symbol) + start_symbol = self.start() if start_symbol is None else Nonterminal(start_symbol) if self.convergent: cfactor = 0.1 return [ f"{self._convergent_sampler(symbol=start_symbol, cfactor=cfactor)[0]})" - for i in range(0, n) + for _ in range(n) ] else: return [ f"{self._sampler(symbol=start_symbol, user_priors=user_priors)})" - for i in range(0, n) + for _ in range(n) ] - def _sampler(self, symbol=None, user_priors: bool = False, *, _cache: dict[Hashable, str] | None = None): + def _sampler( + self, + symbol=None, + user_priors: bool = False, + *, + _cache: dict[Hashable, str] | None = None, + ): # simple sampler where each production is sampled uniformly from all possible productions # Tree choses if return tree or list of terminals # recursive implementation @@ -207,7 +208,7 @@ def _sampler(self, symbol=None, user_priors: bool = False, *, _cache: dict[Hasha # collect possible productions from the starting symbol productions = self.productions(lhs=symbol) # sample - if 0 == len(productions): + if len(productions) == 0: raise Exception(f"Nonterminal {symbol} has no productions!") if user_priors and self._prior is not None: production = choice(productions, probs=self._prior[str(symbol)]) @@ -228,7 +229,7 @@ def _sampler(self, symbol=None, user_priors: bool = False, *, _cache: dict[Hasha return tree - def sampler_maxMin_func(self, symbol: str = None, largest: bool = True): + def sampler_maxMin_func(self, symbol: str | None = None, largest: bool = True): tree = "(" + str(symbol) # collect possible productions from the starting symbol productions = self.productions(lhs=symbol) @@ -242,9 +243,7 @@ def sampler_maxMin_func(self, symbol: str = None, largest: bool = True): tree = tree + " " + self.sampler_maxMin_func(sym, largest=largest) + ")" return tree - def _convergent_sampler( - self, cfactor, symbol=None, pcount=defaultdict(int) - ): + def _convergent_sampler(self, cfactor, symbol=None, pcount=None): # sampler that down-weights the probability of selcting the same production many times # ensuring that the sampled trees are not 'too' long (size to be controlled by cfactor) # @@ -252,6 +251,8 @@ def _convergent_sampler( #:pcount: storage for the productions used in the current branch # init the sequence + if pcount is None: + pcount = defaultdict(int) tree = "(" + str(symbol) # init counter of tree 
depth and number of production rules depth, num_prod = 1, 1 @@ -301,8 +302,7 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: symbols = self.nonterminals + self.terminals q_production_rules: list[tuple[list, int]] = [] non_terminal_productions: dict[str, list[Production]] = { - sym: self.productions(lhs=Nonterminal(sym)) - for sym in self.nonterminals + sym: self.productions(lhs=Nonterminal(sym)) for sym in self.nonterminals } _symbols_by_size = sorted(symbols, key=len, reverse=True) @@ -322,11 +322,11 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: continue # special case: "(" is (part of) a terminal - if string_tree[i - 1: i + 2] != " ( ": + if string_tree[i - 1 : i + 2] != " ( ": i += 1 continue - if char == ")" and not string_tree[i - 1] == " ": + if char == ")" and string_tree[i - 1] != " ": # closing symbol of production production = q_production_rules.pop()[0][0] lhs_production = production.lhs() @@ -336,7 +336,7 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: prior_prob += np.log(self.prior[(lhs_production)][idx] + 1e-15) else: prior_prob *= self.prior[str(lhs_production)][idx] - i+=1 + i += 1 continue _s = string_tree[i : i + _longest] @@ -344,7 +344,9 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: if _s.startswith(sym): break else: - raise RuntimeError(f"Terminal or nonterminal at position {i} does not exist") + raise RuntimeError( + f"Terminal or nonterminal at position {i} does not exist" + ) i += len(sym) - 1 @@ -362,8 +364,7 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: new_productions = [ production for production in _productions - if str(production.rhs()[_count]) - == sym + if str(production.rhs()[_count]) == sym ] q_production_rules[-1] = (new_productions, _count + 1) @@ -378,8 +379,7 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: return prior_prob def _generate(self, start=None, depth=None, n=None): - """ - see https://www.nltk.org/_modules/nltk/parse/generate.html + """See https://www.nltk.org/_modules/nltk/parse/generate.html Generates an iterator of all sentences from a CFG. :param grammar: The Grammar used to generate sentences. @@ -461,9 +461,7 @@ def mutate( break _patience -= 1 - child = self._remove_empty_spaces(child) - - return child + return self._remove_empty_spaces(child) def crossover( self, @@ -506,7 +504,7 @@ def crossover( return False, False - def rand_subtree(self, tree: str) -> Tuple[str, int]: + def rand_subtree(self, tree: str) -> tuple[str, int]: """Helper function to choose a random subtree in a given parse tree. Runs a single pass through the tree (stored as string) to look for the location of swappable nonterminal symbols. 
@@ -520,7 +518,7 @@ def rand_subtree(self, tree: str) -> Tuple[str, int]: split_tree = tree.split(" ") swappable_indices = [ i - for i in range(0, len(split_tree)) + for i in range(len(split_tree)) if split_tree[i][1:] in self.swappable_nonterminals ] r = np.random.randint(1, len(swappable_indices)) @@ -530,7 +528,7 @@ def rand_subtree(self, tree: str) -> Tuple[str, int]: @staticmethod def rand_subtree_fixed_head( - tree: str, head_node: str, swappable_indices: list = None + tree: str, head_node: str, swappable_indices: list | None = None ) -> int: # helper function to choose a random subtree from a given tree with a specific head node # if no such subtree then return False, otherwise return the index of the subtree @@ -539,7 +537,7 @@ def rand_subtree_fixed_head( if swappable_indices is None: split_tree = tree.split(" ") swappable_indices = [ - i for i in range(0, len(split_tree)) if split_tree[i][1:] == head_node + i for i in range(len(split_tree)) if split_tree[i][1:] == head_node ] if not isinstance(swappable_indices, list): raise TypeError("Expected list for swappable indices!") @@ -553,15 +551,14 @@ def rand_subtree_fixed_head( if len(swappable_indices) > 1 else 0 ) - chosen_non_terminal_index = swappable_indices[r] - return chosen_non_terminal_index + return swappable_indices[r] @staticmethod - def remove_subtree(tree: str, index: int) -> Tuple[str, str, str]: + def remove_subtree(tree: str, index: int) -> tuple[str, str, str]: """Helper functioon to remove a subtree from a parse tree given its index. E.g. '(S (S (T 2)) (ADD +) (T 1))' - becomes '(S (S (T 2)) ', '(T 1))' after removing (ADD +) + becomes '(S (S (T 2)) ', '(T 1))' after removing (ADD +). Args: tree (str): parse tree @@ -613,7 +610,7 @@ def __init__(self, *args, **kwargs): def set_depth_constraints(self, depth_constraints): self.depth_constraints = depth_constraints - if not all(k in self.nonterminals for k in self.depth_constraints.keys()): + if not all(k in self.nonterminals for k in self.depth_constraints): raise Exception( f"Nonterminal {set(self.depth_constraints.keys())-set(self.nonterminals)} does not exist in grammar" ) @@ -625,27 +622,24 @@ def is_depth_constrained(): def sampler( # type: ignore[override] self, n: int = 1, - start_symbol: str = None, - depth_information: dict = None, + start_symbol: str | None = None, + depth_information: dict | None = None, ): if self.depth_constraints is None: raise ValueError("Depth constraints are not set!") - if start_symbol is None: - start_symbol = self.start() - else: - start_symbol = Nonterminal(start_symbol) + start_symbol = self.start() if start_symbol is None else Nonterminal(start_symbol) if depth_information is None: depth_information = {} return [ f"{self._depth_constrained_sampler(symbol=start_symbol, depth_information=depth_information)})" - for i in range(0, n) + for i in range(n) ] def _compute_depth_information_for_pre(self, tree: str) -> dict: depth_information = {nt: 0 for nt in self.nonterminals} - q_nonterminals: Deque = deque() + q_nonterminals: deque = deque() for split in tree.split(" "): if split == "": continue @@ -666,7 +660,7 @@ def _compute_depth_information(self, tree: str) -> tuple: helper_subtree_depth = [0] * len(split_tree) helper_dict_depth_information = {nt: 0 for nt in self.nonterminals} helper_dict_subtree_depth: dict = {nt: deque() for nt in self.nonterminals} - q_nonterminals: Deque = deque() + q_nonterminals: deque = deque() for i, split in enumerate(split_tree): if split == "": continue @@ -692,7 +686,7 @@ def 
_compute_depth_information(self, tree: str) -> tuple: def _compute_max_depth(self, tree: str, subtree_node: str) -> int: max_depth = 0 depth_information = {nt: 0 for nt in self.nonterminals} - q_nonterminals: Deque = deque() + q_nonterminals: deque = deque() for split in tree.split(" "): if split == "": continue @@ -708,19 +702,21 @@ def _compute_max_depth(self, tree: str, subtree_node: str) -> int: split = split[:-1] return max_depth - def _depth_constrained_sampler(self, symbol=None, depth_information: dict = None): + def _depth_constrained_sampler( + self, symbol=None, depth_information: dict | None = None + ): if depth_information is None: depth_information = {} # init the sequence tree = "(" + str(symbol) # collect possible productions from the starting symbol & filter if constraints are violated lhs = str(symbol) - if lhs in depth_information.keys(): + if lhs in depth_information: depth_information[lhs] += 1 else: depth_information[lhs] = 1 if ( - lhs in self.depth_constraints.keys() + lhs in self.depth_constraints and depth_information[lhs] >= self.depth_constraints[lhs] ): productions = [ @@ -769,8 +765,7 @@ def mutate( if parent != child: # ensure that parent is really mutated break _patience -= 1 - child = self._remove_empty_spaces(child) - return child + return self._remove_empty_spaces(child) def crossover( self, diff --git a/neps/search_spaces/architecture/cfg_variants/cfg_resolution.py b/neps/search_spaces/architecture/cfg_variants/cfg_resolution.py deleted file mode 100644 index 5bc8fb5e..00000000 --- a/neps/search_spaces/architecture/cfg_variants/cfg_resolution.py +++ /dev/null @@ -1,385 +0,0 @@ -from collections import deque -from typing import Deque - -import networkx as nx -import numpy as np -from nltk.grammar import Nonterminal - -from ..cfg import Grammar, choice - - -class ResolutionGrammar(Grammar): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.n_downsamples = None - self.terminal_to_graph_map = None - self.downsampling_lhs = None - self.downsample_terminal = None - self.depth_constraints = None - - def set_resolution_constraints( - self, - n_downsamples: int, - terminal_to_graph: dict, - downsampling_lhs: str, - downsample_terminal: str = "downsample", - depth_constraints: dict = None, - ): - self.n_downsamples = n_downsamples - - terminal_to_graph_map: dict = {} - for k, v in terminal_to_graph.items(): - terminal_to_graph_map[k] = {} - terminal_to_graph_map[k]["edge_list"] = v - - G = nx.DiGraph() - G.add_edges_from(v) - src = [n for n, d in G.in_degree() if d == 0][0] - tgt = [n for n, d in G.out_degree() if d == 0][0] - terminal_to_graph_map[k]["paths"] = { - k: [] for k in range(1, nx.dag_longest_path_length(G) + 1) - } - for path in nx.all_simple_edge_paths(G, source=src, target=tgt): - terminal_to_graph_map[k]["paths"][len(path)].append(path[::-1]) - - self.terminal_to_graph_map = terminal_to_graph_map - - self.downsampling_lhs = downsampling_lhs - self.swappable_nonterminals.remove(self.downsampling_lhs) - - self.downsample_terminal = downsample_terminal - - if depth_constraints is not None: - self.depth_constraints = depth_constraints - if not all(k in self.nonterminals for k in self.depth_constraints.keys()): - raise Exception( - f"Nonterminal {set(self.depth_constraints.keys())-set(self.nonterminals)} does not exist in grammar" - ) - else: - self.depth_constraints = {} - - @staticmethod - def is_resolution_constrained(): - return True - - def sampler( - self, - n=1, - start_symbol: str = None, - n_downsamples: int = 
None, - depth_information: dict = None, - ): - if start_symbol is None: - start_symbol = self.start() - else: - start_symbol = Nonterminal(start_symbol) - - if depth_information is None: - depth_information = {} - if n_downsamples is None: - n_downsamples = self.n_downsamples - return [ - f"{self._resolution_constrained_sampler(symbol=start_symbol, n_downsamples=n_downsamples, depth_information=depth_information)})" - for _ in range(n) - ] - - def _compute_depth_information_for_pre(self, tree: str) -> dict: - depth_information = {nt: 0 for nt in self.nonterminals} - q_nonterminals: Deque = deque() - for split in tree.split(" "): - if split == "": - continue - elif split[0] == "(": - q_nonterminals.append(split[1:]) - depth_information[split[1:]] += 1 - continue - while split[-1] == ")": - nt = q_nonterminals.pop() - depth_information[nt] -= 1 - split = split[:-1] - return depth_information - - def _compute_depth_information(self, tree: str) -> tuple: - split_tree = tree.split(" ") - depth_information = [0] * len(split_tree) - subtree_depth = [0] * len(split_tree) - helper_subtree_depth = [0] * len(split_tree) - helper_dict_depth_information = {nt: 0 for nt in self.nonterminals} - helper_dict_subtree_depth: dict = {nt: deque() for nt in self.nonterminals} - q_nonterminals: Deque = deque() - for i, split in enumerate(split_tree): - if split == "": - continue - elif split[0] == "(": - nt = split[1:] - q_nonterminals.append(nt) - depth_information[i] = helper_dict_depth_information[nt] + 1 - helper_dict_depth_information[nt] += 1 - helper_dict_subtree_depth[nt].append(i) - for j in helper_dict_subtree_depth[nt]: - subtree_depth[j] = max(subtree_depth[j], helper_subtree_depth[j] + 1) - helper_subtree_depth[j] += 1 - continue - while split[-1] == ")": - nt = q_nonterminals.pop() - helper_dict_depth_information[nt] -= 1 - for j in helper_dict_subtree_depth[nt]: - helper_subtree_depth[j] -= 1 - _ = helper_dict_subtree_depth[nt].pop() - split = split[:-1] - return depth_information, subtree_depth - - def _compute_max_depth(self, tree: str, subtree_node: str) -> int: - max_depth = 0 - depth_information = {nt: 0 for nt in self.nonterminals} - q_nonterminals: Deque = deque() - for split in tree.split(" "): - if split == "": - continue - elif split[0] == "(": - q_nonterminals.append(split[1:]) - depth_information[split[1:]] += 1 - if split[1:] == subtree_node and depth_information[split[1:]] > max_depth: - max_depth = depth_information[split[1:]] - continue - while split[-1] == ")": - nt = q_nonterminals.pop() - depth_information[nt] -= 1 - split = split[:-1] - return max_depth - - @staticmethod - def assign_downsamples(edge_list, paths, n_downsamples): - if n_downsamples == 0: - return [0] * len(edge_list) - edge_list_to_downsamples = {e: 0 for e in edge_list} - - if max(paths.keys()) >= n_downsamples: - for path in paths[n_downsamples]: - for e in path: - edge_list_to_downsamples[e] = 1 - - for k in reversed(sorted(paths.keys())): - k_paths = paths[k] - if len(k_paths) == 0 or k == n_downsamples: - continue - tmp_indices = list(range(len(k_paths))) - np.random.shuffle(tmp_indices) - for idx in tmp_indices: - path = k_paths[idx] - already_set_n_downsamples = sum(edge_list_to_downsamples[e] for e in path) - if already_set_n_downsamples == n_downsamples: - continue - _path = [e for e in path if edge_list_to_downsamples[e] == 0] - - _n_downsamples = n_downsamples - already_set_n_downsamples - if len(_path) == 1: - edge_list_to_downsamples[path[0]] = _n_downsamples - elif len(_path) < _n_downsamples: - 
indices = np.random.choice( - list(range(len(_path))), - size=n_downsamples // len(_path), - replace=False, - ) - for i, e in enumerate(_path): - edge_list_to_downsamples[e] = ( - n_downsamples // len(_path) + 1 - if i in indices - else n_downsamples // len(_path) - ) - else: - indices = np.random.choice( - list(range(len(_path))), - size=_n_downsamples, - replace=False, - ) - for i in indices: - edge_list_to_downsamples[_path[i]] = 1 - - return [edge_list_to_downsamples[e] for e in edge_list] - - def _resolution_constrained_sampler( - self, symbol=None, n_downsamples: int = 0, depth_information: dict = None - ): - if depth_information is None: - depth_information = {} - - # init the sequence - tree = "(" + str(symbol) - - lhs = str(symbol) - if lhs in depth_information.keys(): - depth_information[lhs] += 1 - else: - depth_information[lhs] = 1 - - # collect possible productions from the starting symbol & filter if constraints are violated - if lhs == self.downsampling_lhs: - productions = [ - production - for production in self.productions(lhs=symbol) - if sum(str(x) == self.downsample_terminal for x in production.rhs()) - == n_downsamples - ] - elif ( - lhs in self.depth_constraints.keys() - and depth_information[lhs] < self.depth_constraints[lhs]["min"]["number"] - ): - productions = [ - production - for production in self.productions(lhs=symbol) - if not ( - len(production.rhs()) == 1 - and str(production.rhs()[0]) - in self.depth_constraints[lhs]["min"]["exclude_rhs"] - ) - ] - elif ( - lhs in self.depth_constraints.keys() - and depth_information[lhs] >= self.depth_constraints[lhs]["max"]["number"] - ): - productions = [ - production - for production in self.productions(lhs=symbol) - if lhs - not in [str(sym) for sym in production.rhs() if not isinstance(sym, str)] - ] - else: - productions = self.productions(lhs=symbol) - - if len(productions) == 0: - raise Exception( - "There can be no word sampled! This is due to the grammar and/or constraints." 
- ) - - # sample - production = choice(productions) - n_downsamples_per_edge = [] - counter = 0 - for sym in production.rhs(): - if isinstance(sym, str): - tree = tree + " " + sym - if sym in self.terminal_to_graph_map.keys(): - n_downsamples_per_edge = self.assign_downsamples( - self.terminal_to_graph_map[sym]["edge_list"], - self.terminal_to_graph_map[sym]["paths"], - n_downsamples, - ) - else: - if counter < len(n_downsamples_per_edge): - _n_downsamples = n_downsamples_per_edge[counter] - elif ( - len(production.rhs()) == 1 - and str(production.rhs()[0]) == self.downsampling_lhs - ): - _n_downsamples = n_downsamples - else: - _n_downsamples = 0 - tree = ( - tree - + " " - + self._resolution_constrained_sampler( - sym, - n_downsamples=_n_downsamples, - depth_information=depth_information, - ) - + ")" - ) - counter += 1 - - depth_information[lhs] -= 1 - return tree - - def mutate( - self, parent: str, subtree_index: int, subtree_node: str, patience: int = 50 - ) -> str: - # chop out subtree - pre, _, post = self.remove_subtree(parent, subtree_index) - _patience = patience - while _patience > 0: - # only sample subtree -> avoids full sampling of large parse trees - depth_information = self._compute_depth_information_for_pre(pre) - new_subtree = self.sampler( - 1, start_symbol=subtree_node, depth_information=depth_information - )[0] - child = pre + new_subtree + post - if parent != child: # ensure that parent is really mutated - break - _patience -= 1 - child = self._remove_empty_spaces(child) - return child - - def crossover( - self, - parent1: str, - parent2: str, - patience: int = 50, - return_crossover_subtrees: bool = False, - ): - # randomly swap subtrees in two trees - # if no suitiable subtree exists then return False - subtree_node, subtree_index = self.rand_subtree(parent1) - # chop out subtree - pre, sub, post = self.remove_subtree(parent1, subtree_index) - head_node_depth = self._compute_depth_information_for_pre(pre)[subtree_node] + 1 - sub_depth = self._compute_max_depth(sub, subtree_node) - _patience = patience - while _patience > 0: - # sample subtree from donor - donor_subtree_index = self._rand_subtree_fixed_head( - parent2, subtree_node, head_node_depth, sub_depth=sub_depth - ) - # if no subtrees with right head node return False - if not donor_subtree_index: - _patience -= 1 - else: - donor_pre, donor_sub, donor_post = self.remove_subtree( - parent2, donor_subtree_index - ) - # return the two new tree - child1 = pre + donor_sub + post - child2 = donor_pre + sub + donor_post - - child1 = self._remove_empty_spaces(child1) - child2 = self._remove_empty_spaces(child2) - - if return_crossover_subtrees: - return ( - child1, - child2, - (pre, sub, post), - (donor_pre, donor_sub, donor_post), - ) - - return child1, child2 - - return False, False - - def _rand_subtree_fixed_head( - self, - tree: str, - head_node: str, - head_node_depth: int = 0, - sub_depth: int = 0, - ) -> int: - # helper function to choose a random subtree from a given tree with a specific head node - # if no such subtree then return False, otherwise return the index of the subtree - - # single pass through tree (stored as string) to look for the location of swappable_non_terminmals - if head_node in self.depth_constraints: - depth_information, subtree_depth = self._compute_depth_information(tree) - split_tree = tree.split(" ") - swappable_indices = [ - i - for i in range(len(split_tree)) - if split_tree[i][1:] == head_node - and head_node_depth - 1 + subtree_depth[i] - <= self.depth_constraints[head_node] - 
and depth_information[i] - 1 + sub_depth - <= self.depth_constraints[head_node] - ] - else: - swappable_indices = None - return super().rand_subtree_fixed_head( - tree=tree, head_node=head_node, swappable_indices=swappable_indices - ) diff --git a/neps/search_spaces/architecture/cfg_variants/constrained_cfg.py b/neps/search_spaces/architecture/cfg_variants/constrained_cfg.py index a79ce212..00564835 100644 --- a/neps/search_spaces/architecture/cfg_variants/constrained_cfg.py +++ b/neps/search_spaces/architecture/cfg_variants/constrained_cfg.py @@ -6,16 +6,15 @@ from copy import deepcopy from functools import partial from queue import LifoQueue -from typing import Deque import numpy as np from nltk.grammar import Nonterminal -from ..cfg import Grammar, choice +from neps.search_spaces.architecture.cfg import Grammar, choice class Constraint: - def __init__(self, current_derivation: str = None) -> None: + def __init__(self, current_derivation: str | None = None) -> None: self.current_derivation = current_derivation @staticmethod @@ -46,7 +45,7 @@ def __init__(self, *args, **kwargs): self._prior: dict = None - def set_constraints(self, constraints: dict, none_operation: str = None): + def set_constraints(self, constraints: dict, none_operation: str | None = None): self.constraints = constraints self.none_operation = none_operation self.constraint_is_class = isinstance(self.constraints, Constraint) @@ -85,14 +84,11 @@ def _check_prior(value: dict): def sampler( # type: ignore[override] self, n=1, - start_symbol: str = None, + start_symbol: str | None = None, not_allowed_productions=None, user_priors: bool = False, ): - if start_symbol is None: - start_symbol = self.start() - else: - start_symbol = Nonterminal(start_symbol) + start_symbol = self.start() if start_symbol is None else Nonterminal(start_symbol) return [ self._constrained_sampler( @@ -154,7 +150,7 @@ def _constrained_sampler( probs = [p for i, p in enumerate(probs) if i not in not_allowed_indices] # rescale s.t. probs sum up to one cur_prob_sum = sum(probs) - probs = list(map(lambda x: x / cur_prob_sum, probs)) + probs = [x / cur_prob_sum for x in probs] assert len(probs) == len(productions) production = choice(productions, probs=probs) @@ -216,9 +212,7 @@ def skip_char(char: str) -> bool: and string_tree[i + 1] == " " ): return False - if char == "(": - return True - return False + return char == "(" def find_longest_match( i: int, string_tree: str, symbols: list, max_match: int @@ -253,7 +247,7 @@ def find_longest_match( char = string_tree[i] if skip_char(char): pass - elif char == ")" and not string_tree[i - 1] == " ": + elif char == ")" and string_tree[i - 1] != " ": # closing symbol of production production = q_production_rules.get(block=False)[0][0] idx = self.productions(production.lhs()).index(production) @@ -264,9 +258,9 @@ def find_longest_match( ): outer_production = q_production_rules.queue[-1][0][0] if len(q_production_rules.queue) not in current_derivations: - current_derivations[ - len(q_production_rules.queue) - ] = self.constraints(outer_production.rhs()[0]) + current_derivations[len(q_production_rules.queue)] = ( + self.constraints(outer_production.rhs()[0]) + ) context_information = self.constraints( outer_production.rhs()[0], current_derivations[len(q_production_rules.queue)], @@ -303,7 +297,7 @@ def find_longest_match( ] # rescale s.t. 
prior sum up to one cur_prob_sum = sum(prior) - prior = list(map(lambda x: x / cur_prob_sum, prior)) + prior = [x / cur_prob_sum for x in prior] idx -= sum(idx > i for i in not_allowed_indices) prior = prior[idx] @@ -343,7 +337,7 @@ def find_longest_match( return prior_prob def _compute_current_context(self, pre_subtree: str, post_subtree: str): - q_nonterminals: Deque = deque() + q_nonterminals: deque = deque() for sym in pre_subtree.split(" "): if sym == "": continue @@ -371,7 +365,7 @@ def _compute_current_context(self, pre_subtree: str, post_subtree: str): if len(productions) == 0: raise Exception("Cannot find corresponding production!") - q_context: Deque = deque() + q_context: deque = deque() current_derivation = [] rhs_counter = 0 tmp_str = "" @@ -476,9 +470,9 @@ def mutate( not_allowed_productions = self._get_not_allowed_productions( self.productions(lhs=Nonterminal(subtree_node)), context_information[ - [i for i, cd in enumerate(current_derivation) if cd is None][ - 0 - ] + next( + i for i, cd in enumerate(current_derivation) if cd is None + ) ], ) elif isinstance(context_information, bool): @@ -508,8 +502,7 @@ def mutate( ): break _patience -= 1 - child = self._remove_empty_spaces(child) - return child + return self._remove_empty_spaces(child) def crossover( self, @@ -534,7 +527,7 @@ def crossover( parent1_not_allowed_productions = self._get_not_allowed_productions( self.productions(lhs=Nonterminal(subtree_node)), context_information[ - [i for i, cd in enumerate(current_derivation) if cd is None][0] + next(i for i, cd in enumerate(current_derivation) if cd is None) ], ) elif isinstance(context_information, bool): @@ -570,11 +563,11 @@ def crossover( self._get_not_allowed_productions( self.productions(lhs=Nonterminal(subtree_node)), context_information[ - [ + next( i for i, cd in enumerate(current_derivation) if cd is None - ][0] + ) ], ) ) @@ -637,7 +630,9 @@ def compute_space_size(self) -> int: int: size of space described by grammar. 
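# Illustrative sketch (not from the patch) of two rewrite patterns used in the
# hunks above: indexing a full list comprehension with [0] becomes next() over
# a generator, which stops at the first match, and map/lambda rescaling becomes
# a list comprehension.  The values below are made up for the example.
current_derivation = ["(op1)", None, "(op2)", None]

first_none_old = [i for i, cd in enumerate(current_derivation) if cd is None][0]
first_none_new = next(i for i, cd in enumerate(current_derivation) if cd is None)
assert first_none_old == first_none_new == 1

probs = [0.2, 0.3, 0.1]
cur_prob_sum = sum(probs)
probs = [x / cur_prob_sum for x in probs]
assert abs(sum(probs) - 1.0) < 1e-12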
""" - def recursive_worker(nonterminal: Nonterminal, memory_bank: dict = None) -> int: + def recursive_worker( + nonterminal: Nonterminal, memory_bank: dict | None = None + ) -> int: def _get_all_variants(production): variants = [production] nonterminals = [ @@ -715,18 +710,17 @@ def _get_all_variants(production): potential_production ) ) - else: - if any( - production.rhs()[0] == self.none_operation - for nonterminal in nonterminals - for production in self.productions(nonterminal) - ): - potential_productions += _get_all_variants(potential_production) - elif not ( - len(potential_production.rhs()) == 1 - and potential_production.rhs()[0] == self.none_operation - ): - potential_productions.append(potential_production) + elif any( + production.rhs()[0] == self.none_operation + for nonterminal in nonterminals + for production in self.productions(nonterminal) + ): + potential_productions += _get_all_variants(potential_production) + elif not ( + len(potential_production.rhs()) == 1 + and potential_production.rhs()[0] == self.none_operation + ): + potential_productions.append(potential_production) _possibilites = 0 for potential_production in potential_productions: nonterminals = [ @@ -736,7 +730,7 @@ def _get_all_variants(production): ] possibilities_per_edge = [ memory_bank[str(e_nonterminal)] - if str(e_nonterminal) in memory_bank.keys() + if str(e_nonterminal) in memory_bank else recursive_worker(e_nonterminal, memory_bank) for e_nonterminal in nonterminals ] diff --git a/neps/search_spaces/architecture/core_graph_grammar.py b/neps/search_spaces/architecture/core_graph_grammar.py index 17323b48..f0862882 100644 --- a/neps/search_spaces/architecture/core_graph_grammar.py +++ b/neps/search_spaces/architecture/core_graph_grammar.py @@ -6,7 +6,6 @@ from abc import abstractmethod from copy import deepcopy from functools import partial -from typing import Deque import networkx as nx import numpy as np @@ -29,11 +28,14 @@ def get_edge_lists_of_topologies(terminal_map: dict) -> dict: if is_topology: if isinstance(v, partial): if hasattr(v.func, "get_edge_list"): - func_args = inspect.getfullargspec(v.func.get_edge_list).args # type: ignore[attr-defined] + func_args = inspect.getfullargspec( + v.func.get_edge_list).args # type: ignore[attr-defined] kwargs = {k: v for k, v in v.keywords.items() if k in func_args} - topology_edge_lists[k] = v.func.get_edge_list(**kwargs) # type: ignore[attr-defined] + topology_edge_lists[k] = v.func.get_edge_list( + **kwargs) # type: ignore[attr-defined] elif hasattr(v.func, "edge_list"): - topology_edge_lists[k] = v.func.edge_list # type: ignore[attr-defined] + topology_edge_lists[ + k] = v.func.edge_list # type: ignore[attr-defined] else: raise Exception( f"Please implement a get_edge_list static method for {v.func.__name__} or set edge_list!" 
@@ -48,13 +50,13 @@ def __init__( self, grammars: list[Grammar] | Grammar, terminal_to_op_names: dict, - terminal_to_graph_edges: dict = None, + terminal_to_graph_edges: dict | None = None, edge_attr: bool = True, edge_label: str = "op_name", - zero_op: list = None, - identity_op: list = None, - name: str = None, - scope: str = None, + zero_op: list | None = None, + identity_op: list | None = None, + name: str | None = None, + scope: str | None = None, return_all_subgraphs: bool = False, return_graph_per_hierarchy: bool = False, ): @@ -95,7 +97,7 @@ def get_grammars(self) -> list[Grammar]: def clear_graph(self): while len(self.nodes()) != 0: - self.remove_node(list(self.nodes())[0]) + self.remove_node(next(iter(self.nodes()))) @abstractmethod def id_to_string_tree(self, identifier: str): @@ -124,7 +126,7 @@ def prune_tree( terminal_to_torch_map_keys: collections.abc.KeysView, node_label: str = "op_name", ) -> nx.DiGraph: - """Prunes unnecessary parts of parse tree, i.e., only one child + """Prunes unnecessary parts of parse tree, i.e., only one child. Args: tree (nx.DiGraph): Parse tree @@ -154,11 +156,11 @@ def dfs(visited: set, tree: nx.DiGraph, node: int) -> nx.DiGraph: if len(predecessor) > 0: tree.add_edge(predecessor[0], tree.nodes[node]["children"][0]) old_children = tree.nodes[predecessor[0]]["children"] - idx = [i for i, c in enumerate(old_children) if c == node][0] + idx = next(i for i, c in enumerate(old_children) if c == node) tree.nodes[predecessor[0]]["children"] = ( old_children[: idx + 1] + [tree.nodes[node]["children"][0]] - + old_children[idx + 1 :] + + old_children[idx + 1:] ) tree.nodes[predecessor[0]]["children"].remove(node) @@ -167,474 +169,16 @@ def dfs(visited: set, tree: nx.DiGraph, node: int) -> nx.DiGraph: tree.nodes[node]["terminal"] and tree.nodes[node][node_label] not in terminal_to_torch_map_keys ): - predecessor = list(tree.pred[node])[0] + predecessor = next(iter(tree.pred[node])) tree.nodes[predecessor]["children"].remove(node) tree.remove_node(node) return tree return dfs(set(), tree, self._find_root(tree)) - @staticmethod - def _dfs_preorder_nodes(G: nx.DiGraph, source: str = None) -> list[int]: - """Generates nodes in DFS pre-ordering starting at source. - Note that after pruning we cannot reconstruct the associated string tree! - - Args: - G (nx.DiGraph): NetworkX DAG - source (str, optional): Starting node for DFS. Defaults to None. - - Returns: - generator: List of nodes in a DFS pre-ordering. - """ - edges = nx.dfs_labeled_edges(G, source=source) - return list(v for _, v, d in edges if d == "forward") - - @staticmethod - def _find_leafnodes(G): - leafnode = [] - for i in G.nodes: - head = [] - if nx.descendants(G, i) == set(): # find all leaf nodes - for a in nx.ancestors(G, i): # get all ancestors for leaf node - if ( - nx.ancestors(G, a) == set() - ): # Determine if ancestor is a head node - head.append(a) - if len(head) == 1: # if this leaf had only one head then append to leafnode - leafnode.append(i) - return leafnode - - @staticmethod - def _get_neighbors_from_parse_tree(tree: nx.DiGraph, node: int) -> list[int]: - return tree.nodes[node]["children"] - @staticmethod def _find_root(G): - return [n for n, d in G.in_degree() if d == 0][0] - - @staticmethod - def _relabel_nodes(G: nx.DiGraph, mapping: dict) -> nx.DiGraph: - """Relabels the nodes and adjusts children list accordingly. 
- - Args: - G (nx.DiGraph): graph to relabel - mapping (dict): node mapping - - Returns: - nx.DiGraph: relabeled graph (copied) - """ - # recreation of graph is faster - tree_relabeled = nx.DiGraph() - tree_relabeled.add_nodes_from( - [ - ( - mapping[n[0]], - { - k: v if k != "children" else [mapping[_n] for _n in v] - for k, v in n[1].items() - }, - ) - for n in G.nodes(data=True) - ] - ) - tree_relabeled.add_edges_from([(mapping[e[0]], mapping[e[1]]) for e in G.edges()]) - return tree_relabeled - - def assemble_trees( - self, - base_tree: str | nx.DiGraph, - motif_trees: list[str] | list[nx.DiGraph], - terminal_to_sublanguage_map: dict = None, - node_label: str = "op_name", - ) -> str | nx.DiGraph: - """Assembles the base parse tree with the motif parse trees - - Args: - base_tree (nx.DiGraph): Base parse tree - motif_trees (List[nx.DiGraph]): List of motif parse trees - node_label (str, optional): node label key. Defaults to "op_name". - - Returns: - nx.DiGraph: Assembled parse tree - """ - if not all([isinstance(base_tree, type(tree)) for tree in motif_trees]): - raise ValueError("All trees must be of the same type!") - if isinstance(base_tree, str): - ensembled_tree_string = base_tree - if terminal_to_sublanguage_map is None: - raise NotImplementedError - - for motif, replacement in zip( - terminal_to_sublanguage_map.keys(), motif_trees - ): - if motif in ensembled_tree_string: - ensembled_tree_string = ensembled_tree_string.replace( - motif, replacement - ) - return ensembled_tree_string - elif isinstance(base_tree, nx.DiGraph): - raise NotImplementedError - leafnodes = self._find_leafnodes(base_tree) - root_nodes = [self._find_root(G) for G in motif_trees] - root_op_names = np.array( - [ - motif_tree.nodes[root_node][node_label] - for motif_tree, root_node in zip(motif_trees, root_nodes) - ] - ) - largest_node_number = max(base_tree.nodes()) - # ensembled_tree = base_tree.copy() - # recreation is slightly faster - ensembled_tree: nx.DiGraph = nx.DiGraph() - ensembled_tree.add_nodes_from(base_tree.nodes(data=True)) - ensembled_tree.add_edges_from(base_tree.edges()) - for leafnode in leafnodes: - idx = np.where(base_tree.nodes[leafnode][node_label] == root_op_names)[0] - if len(idx) == 0: - continue - if len(idx) > 1: - raise ValueError( - "More than two similar terminal/start symbols are not supported!" 
- ) - - tree = motif_trees[idx[0]] - # generate mapping - mapping = { - n: n_new - for n, n_new in zip( - tree.nodes(), - range( - largest_node_number + 1, - largest_node_number + 1 + len(tree), - ), - ) - } - largest_node_number = largest_node_number + 1 + len(tree) - tree_relabeled = self._relabel_nodes(G=tree, mapping=mapping) - - # compose trees - predecessor_in_base_tree = list(ensembled_tree.pred[leafnode])[0] - motif_tree_root_node = self._find_root(tree_relabeled) - successors_in_motif_tree = tree_relabeled.nodes[motif_tree_root_node][ - "children" - ] - - # delete unnecessary edges - ensembled_tree.remove_node(leafnode) - tree_relabeled.remove_node(motif_tree_root_node) - # add new edges - tree_relabeled.add_node(predecessor_in_base_tree) - for n in successors_in_motif_tree: - tree_relabeled.add_edge(predecessor_in_base_tree, n) - - ensembled_tree.update( - edges=tree_relabeled.edges(data=True), - nodes=tree_relabeled.nodes(data=True), - ) - - idx = np.where( - np.array(ensembled_tree.nodes[predecessor_in_base_tree]["children"]) - == leafnode - )[0][0] - old_children = ensembled_tree.nodes[predecessor_in_base_tree]["children"] - ensembled_tree.nodes[predecessor_in_base_tree]["children"] = ( - old_children[: idx + 1] - + successors_in_motif_tree - + old_children[idx + 1 :] - ) - ensembled_tree.nodes[predecessor_in_base_tree]["children"].remove( - leafnode - ) - return ensembled_tree - else: - raise NotImplementedError( - f"Assembling of trees of type {type(base_tree)} is not supported!" - ) - - def build_graph_from_tree( - self, - tree: nx.DiGraph, - terminal_to_torch_map: dict, - node_label: str = "op_name", - flatten_graph: bool = True, - return_cell: bool = False, - ) -> None | Graph: - """Builds the computational graph from a parse tree. - - Args: - tree (nx.DiGraph): parse tree. - terminal_to_torch_map (dict): Mapping from terminal symbols to primitives or topologies. - node_label (str, optional): Key to access terminal symbol. Defaults to "op_name". - return_cell (bool, optional): Whether to return a cell. Is only needed if cell is repeated multiple times. - Defaults to False. - - Returns: - Tuple[Union[None, Graph]]: computational graph (self) or cell. - """ - - def _build_graph_from_tree( - visited: set, - tree: nx.DiGraph, - node: int, - terminal_to_torch_map: dict, - node_label: str, - is_primitive: bool = False, - ): - """Recursive DFS-esque function to build computational graph from parse tree - - Args: - visited (set): set of visited nodes. - tree (nx.DiGraph): parse tree. - node (int): node index. - terminal_to_torch_map (dict): mapping from terminal symbols to primitives or topologies. - node_label (str): key to access operation name - - Raises: - Exception: primitive or topology is unknown, i.e., it is probably missing in the terminal to - torch mapping - Exception: leftmost children can only be primitive, topology or have one child - - Returns: - [type]: computational graph. 
- """ - if node not in visited: - subgraphs = [] - primitive_hps = [] - if len(tree.out_edges(node)) == 0: - if is_primitive: - return tree.nodes[node][node_label] - else: - if ( - tree.nodes[node][node_label] - not in terminal_to_torch_map.keys() - ): - raise Exception( - f"Unknown primitive or topology: {tree.nodes[node][node_label]}" - ) - return deepcopy( - terminal_to_torch_map[tree.nodes[node][node_label]] - ) - if len(tree.out_edges(node)) == 1: - return _build_graph_from_tree( - visited, - tree, - list(tree.neighbors(node))[0], - terminal_to_torch_map, - node_label, - is_primitive, - ) - # for idx, neighbor in enumerate(tree.neighbors(node)): - for idx, neighbor in enumerate( - self._get_neighbors_from_parse_tree(tree, node) - ): - if idx == 0: # topology or primitive - n = neighbor - while not tree.nodes[n]["terminal"]: - if len(tree.out_edges(n)) != 1: - raise Exception( - "Leftmost Child can only be primitive, topology or recursively have one child!" - ) - n = next(tree.neighbors(n)) - if is_primitive: - primitive_hp_key = tree.nodes[n][node_label] - primitive_hp_dict = {primitive_hp_key: None} - is_primitive_op = True - else: - if ( - tree.nodes[n][node_label] - not in terminal_to_torch_map.keys() - ): - raise Exception( - f"Unknown primitive or topology: {tree.nodes[n][node_label]}" - ) - graph_el = terminal_to_torch_map[tree.nodes[n][node_label]] - is_primitive_op = issubclass( - graph_el.func - if isinstance(graph_el, partial) - else graph_el, - AbstractPrimitive, - ) - elif not tree.nodes[neighbor][ - "terminal" - ]: # exclude '[' ']' ... symbols - if is_primitive: - primitive_hp_dict[primitive_hp_key] = _build_graph_from_tree( - visited, - tree, - neighbor, - terminal_to_torch_map, - node_label, - is_primitive_op, - ) - elif is_primitive_op: - primitive_hps.append( - _build_graph_from_tree( - visited, - tree, - neighbor, - terminal_to_torch_map, - node_label, - is_primitive_op, - ) - ) - else: - subgraphs.append( - _build_graph_from_tree( - visited, - tree, - neighbor, - terminal_to_torch_map, - node_label, - is_primitive_op, - ) - ) - elif ( - tree.nodes[neighbor][node_label] in terminal_to_torch_map.keys() - ): # exclude '[' ']' ... symbols - # TODO check if there is a potential bug here? 
- subgraphs.append( - deepcopy( - terminal_to_torch_map[tree.nodes[neighbor][node_label]] - ) - ) - - if is_primitive: - return primitive_hp_dict - elif is_primitive_op: - return dict( - collections.ChainMap(*([{"op": graph_el}] + primitive_hps)) - ) - else: - return graph_el(*subgraphs) - - def _flatten_graph( - graph, - flattened_graph, - start_node: int = None, - end_node: int = None, - ): - nodes: dict = {} - for u, v, data in graph.edges(data=True): - if u in nodes.keys(): - _u = nodes[u] - else: - _u = ( - 1 - if len(flattened_graph.nodes.keys()) == 0 - else max(flattened_graph.nodes.keys()) + 1 - ) - _u = ( - start_node - if graph.in_degree(u) == 0 and start_node is not None - else _u - ) - nodes[u] = _u - if _u not in flattened_graph.nodes.keys(): - flattened_graph.add_node(_u) - - if v in nodes.keys(): - _v = nodes[v] - else: - _v = max(flattened_graph.nodes.keys()) + 1 - _v = ( - end_node - if graph.out_degree(v) == 0 and end_node is not None - else _v - ) - nodes[v] = _v - if _v not in flattened_graph.nodes.keys(): - flattened_graph.add_node(_v) - - if isinstance(data["op"], Graph): - flattened_graph = _flatten_graph( - data["op"], flattened_graph, start_node=_u, end_node=_v - ) - else: - flattened_graph.add_edge(_u, _v) - flattened_graph.edges[_u, _v].update(data) - - return flattened_graph - - root_node = self._find_root(tree) - graph = _build_graph_from_tree( - set(), tree, root_node, terminal_to_torch_map, node_label - ) - self._check_graph(graph) - if return_cell: - cell = ( - _flatten_graph(graph, flattened_graph=Graph()) if flatten_graph else graph - ) - return cell - else: - if flatten_graph: - _flatten_graph(graph, flattened_graph=self) - else: - self.add_edge(0, 1) - self.edges[0, 1].set("op", graph) - return None - - def to_graph_repr(self, graph: Graph, edge_attr: bool) -> nx.DiGraph: - """Transforms NASLib-esque graph to NetworkX graph. - - Args: - graph (Graph): NASLib-esque graph. - edge_attr (bool): Transform to edge attribution or node attribution. - - Returns: - nx.DiGraph: edge- or node-attributed representation of computational graph. 
- """ - if edge_attr: - g = nx.DiGraph() - g.add_nodes_from(graph.nodes()) - for u, v in graph.edges(): - if isinstance(graph.edges[u, v]["op"], Graph): - g.add_edge(u, v, op_name=graph.edges[u, v]["op"].name) - else: - g.add_edge( - u, v, **{self.edge_label: graph.edges[u, v][self.edge_label]} - ) - g.graph_type = "edge_attr" - else: - g = nx.DiGraph() - src = [n for n in graph.nodes() if graph.in_degree(n) == 0][0] - tgt = [n for n in graph.nodes() if graph.out_degree(n) == 0][0] - nof_edges = graph.size() - g.add_nodes_from( - [ - (0, {self.edge_label: "input"}), - (nof_edges + 1, {self.edge_label: "output"}), - ] - ) - node_counter = 1 - open_edge: dict = {} - for node in nx.topological_sort(graph): - for edge in graph.out_edges(node): - g.add_node( - node_counter, - **{self.edge_label: graph.edges[edge][self.edge_label]}, - ) - - u, v = edge - if u == src: # special case for input node - g.add_edge(0, node_counter) - if v == tgt: # special case of output node - g.add_edge(node_counter, nof_edges + 1) - if ( - u in open_edge.keys() - ): # add edge between already seen nodes and new node - for node_count in open_edge[u]: - g.add_edge(node_count, node_counter) - - if v in open_edge.keys(): - open_edge[v].append(node_counter) - else: - open_edge[v] = [node_counter] - node_counter += 1 - g.graph_type = "node_attr" - - self._check_graph(g) - - return g + return next(n for n, d in G.in_degree() if d == 0) @staticmethod def from_stringTree_to_nxTree( @@ -662,9 +206,7 @@ def skip_char(char: str) -> bool: and string_tree[i + 1] == " " ): return False - if char == "(": - return True - return False + return char == "(" def find_longest_match( i: int, string_tree: str, symbols: list[str], max_match: int @@ -710,7 +252,7 @@ def find_longest_match( char = string_tree[i] if skip_char(char): pass - elif char == ")" and not string_tree[i - 1] == " ": + elif char == ")" and string_tree[i - 1] != " ": # closing symbol of production _node_number = q.get(block=False) _node_children = q_children.get(block=False) @@ -740,34 +282,6 @@ def find_longest_match( raise Exception("Invalid string_tree") return G - def from_nxTree_to_stringTree( - self, nxTree: nx.DiGraph, node_label: str = "op_name" - ) -> str: - """Transforms parse tree represented as NetworkX DAG to string representation. - - Args: - nxTree (nx.DiGraph): parse tree. - node_label (str, optional): key to access operation names. Defaults to "op_name". - - Returns: - str: parse tree represented as string. - """ - - def dfs(visited, graph, node): - if node not in visited: - visited.add(node) - if graph.nodes[node]["terminal"]: - return f"{graph.nodes[node][node_label]}" - tmp_str = f"{f'({graph.nodes[node][node_label]}'}" + " " - # for neighbor in graph.neighbors(node): - for neighbor in self._get_neighbors_from_parse_tree(graph, node): - tmp_str += dfs(visited, graph, neighbor) + " " - tmp_str = tmp_str[:-1] + ")" - return tmp_str - return "" - - return dfs(set(), nxTree, node=self._find_root(nxTree)) - def update_op_names(self): # update op names for u, v in self.edges(): @@ -785,8 +299,8 @@ def from_stringTree_to_graph_repr( sym_name: str = "op_name", prune: bool = True, add_subtree_map: bool = False, - return_all_subgraphs: bool = None, - return_graph_per_hierarchy: bool = None, + return_all_subgraphs: bool | None = None, + return_graph_per_hierarchy: bool | None = None, ) -> nx.DiGraph | tuple[nx.DiGraph, collections.OrderedDict]: """Generates graph from parse tree in string representation. Note that we ignore primitive HPs! 
@@ -821,17 +335,17 @@ def get_node_labels(graph: nx.DiGraph): def get_hierarchicy_dict( string_tree: str, subgraphs: dict, - hierarchy_dict: dict = None, + hierarchy_dict: dict | None = None, hierarchy_level_counter: int = 0, ): if hierarchy_dict is None: hierarchy_dict = {} - if hierarchy_level_counter not in hierarchy_dict.keys(): + if hierarchy_level_counter not in hierarchy_dict: hierarchy_dict[hierarchy_level_counter] = [] hierarchy_dict[hierarchy_level_counter].append(string_tree) node_labels = get_node_labels(subgraphs[string_tree]) for _, node_label in node_labels: - if node_label in subgraphs.keys(): + if node_label in subgraphs: hierarchy_dict = get_hierarchicy_dict( node_label, subgraphs, hierarchy_dict, hierarchy_level_counter + 1 ) @@ -916,15 +430,13 @@ def to_node_attributed_edge_list( if v == tgt: node_list.append((ni, 1)) - for e_ in filter( - lambda e: (e[1] == u), edge_list - ): + for e_ in filter(lambda e: (e[1] == u), edge_list): node_list.append((edge_to_node_map[e_], ni)) return node_list, edge_to_node_map def skip_char(char: str) -> bool: - return True if char in [" ", "\t", "\n", "[", "]"] else False + return char in [" ", "\t", "\n", "[", "]"] if prune: add_subtree_map = False @@ -937,14 +449,14 @@ def skip_char(char: str) -> bool: G = nx.DiGraph() if add_subtree_map: - q_nonterminals: Deque = collections.deque() + q_nonterminals: collections.deque = collections.deque() if compute_subgraphs: - q_subtrees: Deque = collections.deque() - q_subgraphs: Deque = collections.deque() + q_subtrees: collections.deque = collections.deque() + q_subgraphs: collections.deque = collections.deque() subgraphs_dict = collections.OrderedDict() if edge_attr: node_offset = 0 - q_el: Deque = collections.deque() # edge-attr + q_el: collections.deque = collections.deque() # edge-attr terminal_to_graph = self.terminal_to_graph_edges else: # node-attributed G.add_node(0, **{sym_name: "input"}) @@ -1137,11 +649,11 @@ def skip_char(char: str) -> bool: [ (n_in, n_out) for n_in in q_subgraphs[-1]["graph"].predecessors( - n - ) + n + ) for n_out in q_subgraphs[-1]["graph"].successors( - n - ) + n + ) ] ) q_subgraphs[-1]["graph"].remove_node(n) @@ -1261,16 +773,18 @@ def get_graph_representation( ) -> nx.DiGraph: """This functions takes an identifier and constructs the (multi-variate) composition of the functions it describes. + Args: identifier (str): identifier grammar (Grammar): grammar flatten_graph (bool, optional): Whether to flatten the graph. Defaults to True. + Returns: - nx.DiGraph: (multi-variate) composition of functions + nx.DiGraph: (multi-variate) composition of functions. 
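# Illustrative sketch (not from the patch) of the boolean-return simplification
# used above: helpers of the form "return True if <cond> else False" now return
# the condition directly.  skip_char mirrors the helper in the hunk, with no
# behaviour change.
def skip_char(char: str) -> bool:
    return char in [" ", "\t", "\n", "[", "]"]


assert skip_char(" ") and skip_char("[") and skip_char("\n")
assert not skip_char("(") and not skip_char("a")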
""" def _skip_char(char: str) -> bool: - return True if char in [" ", "\t", "\n", "[", "]"] else False + return char in [" ", "\t", "\n", "[", "]"] def _get_sym_from_split(split: str) -> str: start_idx, end_idx = 0, len(split) @@ -1298,9 +812,7 @@ def to_node_attributed_edge_list( if v in tgt: node_list.append((ni, v)) - for e_ in filter( - lambda e: (e[1] == u), edge_list - ): + for e_ in filter(lambda e: (e[1] == u), edge_list): node_list.append((edge_to_node_map[e_], ni)) return node_list, edge_to_node_map @@ -1329,12 +841,11 @@ def to_node_attributed_edge_list( if sym in grammar.terminals: is_topology = False - if inspect.isclass(self.terminal_to_op_names[sym]) and issubclass( - self.terminal_to_op_names[sym], AbstractTopology - ): - is_topology = True - elif isinstance(self.terminal_to_op_names[sym], partial) and issubclass( - self.terminal_to_op_names[sym].func, AbstractTopology + if ( + inspect.isclass(self.terminal_to_op_names[sym]) + and issubclass(self.terminal_to_op_names[sym], AbstractTopology) + or isinstance(self.terminal_to_op_names[sym], partial) + and issubclass(self.terminal_to_op_names[sym].func, AbstractTopology) ): is_topology = True @@ -1354,16 +865,13 @@ def to_node_attributed_edge_list( if q_nonterminals.qsize() == q_topologies.qsize(): topology, number_of_primitives = q_topologies.get(block=False) primitives = [ - q_primitives.get(block=False) - for _ in range(number_of_primitives) - ][::-1] + q_primitives.get(block=False) + for _ in range(number_of_primitives) + ][::-1] if ( topology in terminal_to_graph and terminal_to_graph[topology] is not None - ): - raise NotImplementedError - # edges = terminal_to_graph[topology] - elif isinstance(topology, partial): + ) or isinstance(topology, partial): raise NotImplementedError else: composed_function = topology(*primitives) @@ -1435,27 +943,27 @@ def prune_graph(self, graph: nx.DiGraph | Graph = None, edge_attr: bool = True): graph.remove_edges_from(remove_edge_list) else: for n in list(nx.topological_sort(graph)): - if n in graph.nodes(): - if ( - graph.nodes[n]["op_name"] in self.zero_op - or graph.nodes[n]["op_name"] in self.identity_op - ): - if graph.nodes[n]["op_name"] in self.identity_op: - # reconnect edges for removed nodes with 'skip_connect' - graph.add_edges_from( - [ - (e_i[0], e_o[1]) - for e_i in graph.in_edges(n) - for e_o in graph.out_edges(n) - ] - ) - # remove nodes with 'skip_connect' or 'none' label - graph.remove_node(n) + if n in graph.nodes() and ( + graph.nodes[n]["op_name"] in self.zero_op + or graph.nodes[n]["op_name"] in self.identity_op + ): + if graph.nodes[n]["op_name"] in self.identity_op: + # reconnect edges for removed nodes with 'skip_connect' + graph.add_edges_from( + [ + (e_i[0], e_o[1]) + for e_i in graph.in_edges(n) + for e_o in graph.out_edges(n) + ] + ) + # remove nodes with 'skip_connect' or 'none' label + graph.remove_node(n) graph = self.prune_unconnected_parts(graph, src_node, tgt_node) if not use_self: return graph + return None @staticmethod def prune_unconnected_parts(graph, src_node, tgt_node): @@ -1486,35 +994,18 @@ def _backtrack_remove(graph, node: int): graph = _backtrack_remove(graph, n) return graph - def _sampler_maxMin(self, largest: bool = True) -> str | list[str]: - """Samples new parse tree(s) based on grammars. - Assumes that the first rule of each production leads to - smallest DAG and last to largest DAG! - - Args: - largest (bool, optional): To find largest DAG, set to True. For smallest DAG set to False. Defaults to True. 
- - Returns: - Union[str, List[str]]: Parse tree or list of parse trees - """ - trees = [ - grammar.sampler_maxMin_func(grammar.start(), largest) + ")" - for grammar in self.grammars - ] - return trees if len(trees) > 1 else trees[0] - @staticmethod def flatten_graph( graph: nx.DiGraph, flattened_graph: Graph = None, - start_node: int = None, - end_node: int = None, + start_node: int | None = None, + end_node: int | None = None, ): if flattened_graph is None: flattened_graph = Graph() nodes: dict = {} for u, v, data in graph.edges(data=True): - if u in nodes.keys(): + if u in nodes: _u = nodes[u] else: _u = ( @@ -1528,17 +1019,18 @@ def flatten_graph( else _u ) nodes[u] = _u - if _u not in flattened_graph.nodes.keys(): # type: ignore[union-attr] + if _u not in flattened_graph.nodes: # type: ignore[union-attr] flattened_graph.add_node(_u) # type: ignore[union-attr] - flattened_graph.nodes[_u].update(graph.nodes[u]) # type: ignore[union-attr] + flattened_graph.nodes[_u].update( + graph.nodes[u]) # type: ignore[union-attr] - if v in nodes.keys(): + if v in nodes: _v = nodes[v] else: _v = max(flattened_graph.nodes.keys()) + 1 # type: ignore[union-attr] _v = end_node if graph.out_degree(v) == 0 and end_node is not None else _v nodes[v] = _v - if _v not in flattened_graph.nodes.keys(): # type: ignore[union-attr] + if _v not in flattened_graph.nodes: # type: ignore[union-attr] flattened_graph.add_node(_v) # type: ignore[union-attr] flattened_graph.nodes[_v].update( # type: ignore[union-attr] graph.nodes[v] @@ -1587,13 +1079,14 @@ def _compose_functions( char = descriptor[i] if skip_char(char, descriptor, i): pass - elif char == ")" and not descriptor[i - 1] == " ": + elif char == ")" and descriptor[i - 1] != " ": # closing symbol of production if q_nonterminals.qsize() == q_topologies.qsize(): topology, number_of_primitives = q_topologies.get(block=False) primitives = [ - q_primitives.get(block=False) for _ in range(number_of_primitives) - ][::-1] + q_primitives.get(block=False) for _ in + range(number_of_primitives) + ][::-1] composed_function = topology(*primitives) if not q_topologies.empty(): q_primitives.put(composed_function) @@ -1606,14 +1099,13 @@ def _compose_functions( if sym in grammar.terminals and descriptor[i - 1] != "(": is_topology = False - if inspect.isclass(self.terminal_to_op_names[sym]) and issubclass( - self.terminal_to_op_names[sym], AbstractTopology - ): - is_topology = True - elif isinstance( - self.terminal_to_op_names[sym], partial - ) and issubclass( + if ( + inspect.isclass(self.terminal_to_op_names[sym]) + and issubclass(self.terminal_to_op_names[sym], AbstractTopology) + or isinstance(self.terminal_to_op_names[sym], partial) + and issubclass( self.terminal_to_op_names[sym].func, AbstractTopology + ) ): is_topology = True @@ -1640,7 +1132,7 @@ def _compose_functions( return composed_function def graph_to_self(self, graph: nx.DiGraph, clear_self: bool = True) -> None: - """Copies graph to self + """Copies graph to self. 
Args: graph (nx.DiGraph): graph @@ -1654,7 +1146,10 @@ def graph_to_self(self, graph: nx.DiGraph, clear_self: bool = True) -> None: self.nodes[n].update(**data) def _unparse_tree( - self, identifier: str, grammar: Grammar, as_composition: bool = True, + self, + identifier: str, + grammar: Grammar, + as_composition: bool = True, ): descriptor = self.id_to_string_tree(identifier) @@ -1675,13 +1170,14 @@ def _unparse_tree( char = descriptor[i] if skip_char(char, descriptor, i): pass - elif char == ")" and not descriptor[i - 1] == " ": + elif char == ")" and descriptor[i - 1] != " ": # closing symbol of production if q_nonterminals.qsize() == q_topologies.qsize(): topology, number_of_primitives = q_topologies.get(block=False) primitives = [ - q_primitives.get(block=False) for _ in range(number_of_primitives) - ][::-1] + q_primitives.get(block=False) for _ in + range(number_of_primitives) + ][::-1] if as_composition: if topology == "Linear1": composed_function = primitives[0] @@ -1691,7 +1187,7 @@ def _unparse_tree( ) # composed_function = topology + "(" + ", ".join(primitives) + ")" else: - composed_function = " ".join([topology] + primitives) + composed_function = " ".join([topology, *primitives]) if not q_topologies.empty(): q_primitives.put(composed_function) q_topologies.queue[-1][1] += 1 @@ -1703,14 +1199,13 @@ def _unparse_tree( if sym in grammar.terminals: is_topology = False - if inspect.isclass(self.terminal_to_op_names[sym]) and issubclass( - self.terminal_to_op_names[sym], AbstractTopology - ): - is_topology = True - elif isinstance( - self.terminal_to_op_names[sym], partial - ) and issubclass( + if ( + inspect.isclass(self.terminal_to_op_names[sym]) + and issubclass(self.terminal_to_op_names[sym], AbstractTopology) + or isinstance(self.terminal_to_op_names[sym], partial) + and issubclass( self.terminal_to_op_names[sym].func, AbstractTopology + ) ): is_topology = True @@ -1738,9 +1233,7 @@ def skip_char(char: str, descriptor: str, i: int) -> bool: # special case: "(" is (part of) a terminal if i != 0 and char == "(" and descriptor[i - 1] == " " and descriptor[i + 1] == " ": return False - if char == "(": - return True - return False + return char == "(" def find_longest_match( diff --git a/neps/search_spaces/architecture/crossover.py b/neps/search_spaces/architecture/crossover.py index 83e104a1..a630e528 100644 --- a/neps/search_spaces/architecture/crossover.py +++ b/neps/search_spaces/architecture/crossover.py @@ -1,9 +1,12 @@ +from __future__ import annotations + import random -from typing import Callable, List, Tuple +from typing import TYPE_CHECKING, Callable import numpy as np -from .cfg import Grammar +if TYPE_CHECKING: + from .cfg import Grammar def simple_crossover( @@ -12,7 +15,7 @@ def simple_crossover( grammar: Grammar, patience: int = 50, return_crossover_subtrees: bool = False, -) -> Tuple[str, str]: +) -> tuple[str, str]: if return_crossover_subtrees: return grammar.crossover( parent1=parent1, @@ -28,10 +31,10 @@ def simple_crossover( def repetitive_search_space_crossover( - base_parent: Tuple[str, str], - motif_parents: Tuple[List[str], List[str]], + base_parent: tuple[str, str], + motif_parents: tuple[list[str], list[str]], base_grammar: Grammar, - motif_grammars: List[Grammar], + motif_grammars: list[Grammar], terminal_to_sublanguage_map: dict, number_of_repetitive_motifs_per_grammar: list, inner_crossover_strategy: Callable, @@ -54,7 +57,7 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): base_parent[1], terminal_to_sublanguage_map ) - 
random_draw = random.randint( + random_draw = random.randint( # noqa: S311 1 if fixed_macro_parent else 0, min( len(parent1_potential_motif_candidates), @@ -62,12 +65,6 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): ), ) if random_draw == 0: # crossover high level grammar, but keep repetitive motifs fixed - # parent1_motifs = _motifs_in_base_tree( - # child1_string_trees[0], terminal_to_sublanguage_map - # ) - # parent2_motifs = _motifs_in_base_tree( - # child2_string_trees[0], terminal_to_sublanguage_map - # ) ( _, _, @@ -81,96 +78,44 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): ) subtrees_child1 = list(subtrees_child1) subtrees_child2 = list(subtrees_child2) - # new_child1_motifs = _motifs_in_base_tree( - # subtrees_child2[1], terminal_to_sublanguage_map - # ) - # new_child2_motifs = _motifs_in_base_tree( - # subtrees_child1[1], terminal_to_sublanguage_map - # ) - - # old_child1_string_trees = deepcopy(child1_string_trees) - # tmp = number_of_repetitive_motifs_per_grammar[1] - # free_motifs = list(set(range(1, tmp + 1)) - set(parent1_motifs)) - # if len(free_motifs) > 0: - # substitute_terminals = list(terminal_to_sublanguage_map.keys()) - # if len(new_child1_motifs) > len(free_motifs): # too many new child motifs - # new_child1_motifs = random.sample( - # new_child1_motifs, - # k=len(free_motifs), - # ) - # elif len(new_child1_motifs) < len( - # free_motifs - # ): # more free spots than necessary - # free_motifs = random.sample( - # free_motifs, - # k=len(new_child1_motifs), - # ) - # for fm, nm in zip(free_motifs, new_child1_motifs): - # child1_string_trees[fm] = child2_string_trees[nm].replace( - # substitute_terminals[nm], substitute_terminals[fm] - # ) - # subtrees_child2[1] = subtrees_child2[1].replace( - # substitute_terminals[nm], substitute_terminals[fm] - # ) child1_string_trees[0] = ( subtrees_child1[0] + subtrees_child2[1] + subtrees_child1[2] ) - # free_motifs = list(set(range(1, tmp + 1)) - set(parent2_motifs)) - # if len(free_motifs) > 0: - # substitute_terminals = list(terminal_to_sublanguage_map.keys()) - # if len(new_child2_motifs) > len(free_motifs): - # new_child2_motifs = random.sample( - # new_child2_motifs, - # k=len(free_motifs), - # ) - # elif len(new_child2_motifs) < len(free_motifs): - # free_motifs = random.sample( - # free_motifs, - # k=len(new_child2_motifs), - # ) - # for fm, nm in zip(free_motifs, new_child2_motifs): - # child2_string_trees[fm] = old_child1_string_trees[nm].replace( - # substitute_terminals[nm], substitute_terminals[fm] - # ) - # subtrees_child1[1] = subtrees_child1[1].replace( - # substitute_terminals[nm], substitute_terminals[fm] - # ) child2_string_trees[0] = ( subtrees_child2[0] + subtrees_child1[1] + subtrees_child2[2] ) + elif multiple_repetitive: + # TODO more general procedure + coin_toss = random.randint(1, len(child1_string_trees) - 1) + motif_grammar_idx = next( + i + for i, x in enumerate(np.cumsum(number_of_repetitive_motifs_per_grammar)) + if x >= coin_toss + ) + ( + child1_string_trees[coin_toss], + child2_string_trees[coin_toss], + ) = inner_crossover_strategy( + child1_string_trees[coin_toss], + child2_string_trees[coin_toss], + motif_grammars[motif_grammar_idx], + ) else: - if multiple_repetitive: - # TODO more general procedure - coin_toss = random.randint(1, len(child1_string_trees) - 1) - motif_grammar_idx = next( - i - for i, x in enumerate(np.cumsum(number_of_repetitive_motifs_per_grammar)) - if x >= coin_toss - ) - ( - child1_string_trees[coin_toss], - 
child2_string_trees[coin_toss], - ) = inner_crossover_strategy( - child1_string_trees[coin_toss], - child2_string_trees[coin_toss], - motif_grammars[motif_grammar_idx], - ) - else: - parent1_random_draw = random.randint( - 0, len(parent1_potential_motif_candidates) - 1 - ) - parent2_random_draw = random.randint( - 0, len(parent2_potential_motif_candidates) - 1 - ) - ( - child1_string_trees[parent1_random_draw + 1], - child2_string_trees[parent2_random_draw + 1], - ) = inner_crossover_strategy( - child1_string_trees[parent1_random_draw + 1], - child2_string_trees[parent2_random_draw + 1], - motif_grammars[0], - ) + parent1_random_draw = random.randint( + 0, len(parent1_potential_motif_candidates) - 1 + ) + parent2_random_draw = random.randint( + 0, len(parent2_potential_motif_candidates) - 1 + ) + ( + child1_string_trees[parent1_random_draw + 1], + child2_string_trees[parent2_random_draw + 1], + ) = inner_crossover_strategy( + child1_string_trees[parent1_random_draw + 1], + child2_string_trees[parent2_random_draw + 1], + motif_grammars[0], + ) if any(not st for st in child1_string_trees) or any( not st for st in child2_string_trees diff --git a/neps/search_spaces/architecture/graph.py b/neps/search_spaces/architecture/graph.py index f776b231..6412083e 100644 --- a/neps/search_spaces/architecture/graph.py +++ b/neps/search_spaces/architecture/graph.py @@ -1,20 +1,26 @@ +from __future__ import annotations + import copy import inspect import logging import os import random import sys -from collections import Counter -from typing import Callable -from typing import Counter as CounterType import types +from collections import Counter +from pathlib import Path +from typing import ( + Callable, + Counter as CounterType, +) + import networkx as nx import torch from networkx.algorithms.dag import lexicographical_topological_sort -from pathlib import Path from torch import nn from neps.utils.types import AttrDict + from .primitives import AbstractPrimitive, Identity @@ -28,10 +34,9 @@ def log_formats(x): def _find_caller(): - """ - Returns: - str: module name of the caller - tuple: a hashable key to be used to identify different callers + """Returns: + str: module name of the caller + tuple: a hashable key to be used to identify different callers. """ frame = sys._getframe(2) while frame: @@ -42,6 +47,7 @@ def _find_caller(): mod_name = "detectron2" return mod_name, (code.co_filename, frame.f_lineno, code.co_name) frame = frame.f_back + return None _LOG_COUNTER: CounterType = Counter() @@ -49,8 +55,8 @@ def _find_caller(): def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): - """ - Log only for the first n times. + """Log only for the first n times. + Args: lvl (int): the logging level msg (str): @@ -75,7 +81,7 @@ def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): if "caller" in key: hash_key = hash_key + caller_key if "message" in key: - hash_key = hash_key + (msg,) + hash_key = (*hash_key, msg) _LOG_COUNTER[hash_key] += 1 if _LOG_COUNTER[hash_key] <= n: @@ -83,9 +89,7 @@ def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): def iter_flatten(iterable): - """ - Flatten a potentially deeply nested python list - """ + """Flatten a potentially deeply nested python list.""" # taken from https://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html it = iter(iterable) for e in it: @@ -99,8 +103,7 @@ def iter_flatten(iterable): class Graph(torch.nn.Module, nx.DiGraph): - """ - Base class for defining a search space. 
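# Illustrative sketch (not from the patch) of the TYPE_CHECKING import pattern
# introduced in the crossover.py hunk above: Grammar is imported only for type
# checkers, and "from __future__ import annotations" keeps the annotation as a
# string, so no runtime (potentially circular) import is needed.  Decimal is a
# made-up stand-in for Grammar.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from decimal import Decimal


def describe(value: Decimal) -> str:
    # At runtime the annotation is never evaluated, so this works even though
    # Decimal was not imported here.
    return f"value={value}"


assert describe(3) == "value=3"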
Add nodes and edges + """Base class for defining a search space. Add nodes and edges as for a directed acyclic graph in `networkx`. Nodes can contain graphs as children, also edges can contain graphs as operations. @@ -163,19 +166,18 @@ class Graph(torch.nn.Module, nx.DiGraph): """ QUERYABLE = False - def __init__(self, name: str = None, scope: str = None): - """ - Initialise a graph. The edges are automatically filled with an EdgeData object + def __init__(self, name: str | None = None, scope: str | None = None): + """Initialise a graph. The edges are automatically filled with an EdgeData object which defines the default operation as Identity. The default combination operation is set as sum. Note: - When inheriting form `Graph` note that `__init__()` cannot take any parameters. - This is due to the way how networkx is implemented, i.e. graphs are reconstructed - internally and no parameters for init are considered. + When inheriting form `Graph` note that `__init__()` cannot take any + parameters. This is due to the way how networkx is implemented, i.e. graphs + are reconstructed internally and no parameters for init are considered. - Our recommended solution is to create static attributes before initialization and - then load them dynamically in `__init__()`. + Our recommended solution is to create static attributes before initialization + and then load them dynamically in `__init__()`. >>> def __init__(self): >>> num_classes = self.NUM_CLASSES @@ -207,7 +209,7 @@ def __init__(self, name: str = None, scope: str = None): # `input` is required for storing the results of incoming edges. # self._nxgraph.node_attr_dict_factory = lambda: dict({'input': {}, 'comb_op': sum}) - self.node_attr_dict_factory = lambda: dict({"input": {}, "comb_op": sum}) + self.node_attr_dict_factory = lambda: {"input": {}, "comb_op": sum} # remember to add all members also in `unparse()` self.name = name @@ -220,8 +222,7 @@ def __eq__(self, other): return self.name == other.name and self.scope == other.scope def __hash__(self): - """ - As it is very complicated to compare graphs (i.e. check all edge + """As it is very complicated to compare graphs (i.e. check all edge attributes, do the have shared attributes, ...) use just the name for comparison. @@ -234,27 +235,20 @@ def __hash__(self): return h def __repr__(self): - return "Graph {}-{:.07f}, scope {}, {} nodes".format( - self.name, self._id, self.scope, self.number_of_nodes() - ) + return f"Graph {self.name}-{self._id:.07f}, scope {self.scope}, {self.number_of_nodes()} nodes" def modules_str(self): - """ - Once the graph has been parsed, prints the modules as they appear in pytorch. - """ + """Once the graph has been parsed, prints the modules as they appear in pytorch.""" if self.is_parsed: result = "" - for g in self._get_child_graphs(single_instances=True) + [self]: - result += "Graph {}:\n {}\n==========\n".format( - g.name, torch.nn.Module.__repr__(g) - ) + for g in [*self._get_child_graphs(single_instances=True), self]: + result += f"Graph {g.name}:\n {torch.nn.Module.__repr__(g)}\n==========\n" return result else: return self.__repr__() def set_scope(self, scope: str, recursively=True): - """ - Sets the scope of this instance of the graph. + """Sets the scope of this instance of the graph. The function should be used in a builder-like pattern `'subgraph'=Graph().set_scope("scope")`. @@ -274,8 +268,7 @@ def set_scope(self, scope: str, recursively=True): return self def add_node(self, node_index, **attr): - """ - Adds a node to the graph. 
+ """Adds a node to the graph. Note that adding a node using an index that has been used already will override its attributes. @@ -288,8 +281,7 @@ def add_node(self, node_index, **attr): nx.DiGraph.add_node(self, node_index, **attr) def copy(self): - """ - Copy as defined in networkx, i.e. a shallow copy. + """Copy as defined in networkx, i.e. a shallow copy. Just handling recursively nested graphs seperately. """ @@ -301,7 +293,7 @@ def copy_dict(d): copied_dict[k] = v.copy() elif isinstance(v, list): copied_dict[k] = [i.copy() if isinstance(i, Graph) else i for i in v] - elif isinstance(v, torch.nn.Module) or isinstance(v, AbstractPrimitive): + elif isinstance(v, (AbstractPrimitive, torch.nn.Module)): copied_dict[k] = copy.deepcopy(v) return copied_dict @@ -317,9 +309,8 @@ def copy_dict(d): G.name = self.name return G - def set_input(self, node_idxs: list): - """ - Route the input from specific parent edges to the input nodes of + def set_input(self, node_idxs: list): # noqa: D417 + """Route the input from specific parent edges to the input nodes of this subgraph. Inputs are assigned in lexicographical order. Example: @@ -345,17 +336,15 @@ def set_input(self, node_idxs: list): """ num_innodes = sum(self.in_degree(n) == 0 for n in self.nodes) - assert num_innodes == len( - node_idxs - ), "Expecting node index for every input node. Excpected {}, got {}".format( - num_innodes, len(node_idxs) + assert num_innodes == len(node_idxs), ( + f"Expecting node index for every input node. Excpected {num_innodes}, " + f"got {len(node_idxs)}" ) self.input_node_idxs = node_idxs # type: ignore[assignment] return self def num_input_nodes(self) -> int: - """ - The number of input nodes, i.e. the nodes without an + """The number of input nodes, i.e. the nodes without an incoming edge. Returns: @@ -363,135 +352,6 @@ def num_input_nodes(self) -> int: """ return sum(self.in_degree(n) == 0 for n in self.nodes) - def _assign_x_to_nodes(self, x): - """ - Assign x to the input nodes of self. Depending whether on - edge or nodes. - - Performs also several sanity checks of the input. - - Args: - x (Tensor or dict): Input to be assigned. - """ - # We need dict in case of cell and int in case of motif - assert isinstance(x, dict) or isinstance(x, torch.Tensor) - - if self.input_node_idxs is None: - assert ( - self.num_input_nodes() == 1 - ), "There are more than one input nodes but input indeces are not defined." - input_node = [n for n in self.nodes if self.in_degree(n) == 0][0] - assert ( - len(list(self.predecessors(input_node))) == 0 - ), "Expecting node 1 to be the parent." - assert ( - "subgraph" not in self.nodes[input_node].keys() - ), "Expecting node 1 not to have a subgraph as it serves as input node." - assert isinstance(x, torch.Tensor) - self.nodes[input_node]["input"] = {0: x} - else: - # assign the input to the corresponding nodes - assert all( - [i in x.keys() for i in self.input_node_idxs] - ), "got x from an unexpected input edge" - if self.num_input_nodes() > len(x): - # here is the case where the same input is assigned to more than one node - # this can happen when there are cells with two inputs but at the very first - # layer of the network, there is just one output (i.e. the data inputed to the - # makro input node). Handle it and log a Info. 
This should happen only rarly - logger.debug( - f"We are using the same x for two inputs in graph {self.name}" - ) - input_node_iterator = iter(self.input_node_idxs) - for node_idx in lexicographical_topological_sort(self): - if self.in_degree(node_idx) == 0: - self.nodes[node_idx]["input"] = {0: x[next(input_node_iterator)]} - - def forward(self, x, *args): - """ - Forward some data through the graph. This is done recursively - in case there are graphs defined on nodes or as 'op' on edges. - - Args: - x (Tensor or dict): The input. If the graph sits on a node the - input can be a dict with {source_idx: Tensor} to be routed - to the defined input nodes. If the graph sits on an edge, - x is the feature tensor. - args: This is only required to handle cases where the graph sits - on an edge and receives an EdgeData object which will be ignored - """ - logger.debug(f"Graph {self.name} called. Input {log_formats(x)}.") - - # Assign x to the corresponding input nodes - self._assign_x_to_nodes(x) - - for node_idx in lexicographical_topological_sort(self): - node = self.nodes[node_idx] - logger.debug( - "Node {}-{}, current data {}, start processing...".format( - self.name, node_idx, log_formats(node) - ) - ) - - # node internal: process input if necessary - if ("subgraph" in node and "comb_op" not in node) or ( - "comb_op" in node and "subgraph" not in node - ): - log_first_n( - logging.WARN, "Comb_op is ignored if subgraph is defined!", n=1 - ) - # TODO: merge 'subgraph' and 'comb_op'. It is basicallly the same thing. Also in parse() - if "subgraph" in node: - x = node["subgraph"].forward(node["input"]) - else: - if len(node["input"].values()) == 1: - x = list(node["input"].values())[0] - else: - x = node["comb_op"]( - [node["input"][k] for k in sorted(node["input"].keys())] - ) - node["input"] = {} # clear the input as we have processed it - - if ( - len(list(self.neighbors(node_idx))) == 0 - and node_idx < list(lexicographical_topological_sort(self))[-1] - ): - # We have more than one output node. This is e.g. the case for - # auxillary losses. Attach them to the graph, handling must done - # by the user. - logger.debug( - "Graph {} has more then one output node. Storing output of non-maximum index node {} at graph dict".format( - self, node_idx - ) - ) - self.graph[f"out_from_{node_idx}"] = x - else: - # outgoing edges: process all outgoing edges - for neigbor_idx in self.neighbors(node_idx): - edge_data = self.get_edge_data(node_idx, neigbor_idx) - # inject edge data only for AbstractPrimitive, not Graphs - if isinstance(edge_data.op, Graph): - edge_output = edge_data.op.forward(x) - elif isinstance(edge_data.op, AbstractPrimitive): - logger.debug( - "Processing op {} at edge {}-{}".format( - edge_data.op, node_idx, neigbor_idx - ) - ) - edge_output = edge_data.op.forward(x) - else: - raise ValueError( - "Unknown class as op: {}. Expected either Graph or AbstactPrimitive".format( - edge_data.op - ) - ) - self.nodes[neigbor_idx]["input"].update({node_idx: edge_output}) - - logger.debug(f"Node {self.name}-{node_idx}, processing done.") - - logger.debug(f"Graph {self.name} exiting. 
Output {log_formats(x)}.") - return x - def to_pytorch(self, **kwargs) -> nn.Module: return self._to_pytorch(**kwargs) @@ -504,7 +364,7 @@ def _import_code(code: str, name: str): if not self.is_parsed: self.parse() - input_node = [n for n in self.nodes if self.in_degree(n) == 0][0] + input_node = next(n for n in self.nodes if self.in_degree(n) == 0) input_name = "x0" self.nodes[input_node]["input"] = {0: input_name} @@ -522,7 +382,7 @@ def _import_code(code: str, name: str): input_name = f"x{max_xidx + 1}" used_input_names.append(max_xidx + 1) forward_f.append(_forward_f) - x = f"x{max_xidx+1}" + x = f"x{max_xidx + 1}" else: if len(node["input"].values()) == 1: x = next(iter(node["input"].values())) @@ -532,7 +392,7 @@ def _import_code(code: str, name: str): "__name__" in dir(node["comb_op"]) and node["comb_op"].__name__ == "sum" ): - _forward_f = f"x{max_xidx+1}=sum([" + _forward_f = f"x{max_xidx + 1}=sum([" elif isinstance(node["comb_op"], torch.nn.Module): submodule_list.append(node["comb_op"]) _forward_f = f"x{max_xidx + 1}=self.module_list[{len(submodule_list) - 1}]([" @@ -543,7 +403,7 @@ def _import_code(code: str, name: str): _forward_f += inp + "," _forward_f = _forward_f[:-1] + "])" forward_f.append(_forward_f) - x = f"x{max_xidx+1}" + x = f"x{max_xidx + 1}" if int(x[1:]) not in used_input_names: used_input_names.append(int(x[1:])) node["input"] = {} # clear the input as we have processed it @@ -579,9 +439,7 @@ def _import_code(code: str, name: str): forward_f.append(_forward_f) else: raise ValueError( - "Unknown class as op: {}. Expected either Graph or AbstactPrimitive".format( - edge_data.op - ) + f"Unknown class as op: {edge_data.op}. Expected either Graph or AbstactPrimitive" ) self.nodes[neigbor_idx]["input"].update({node_idx: input_name}) @@ -615,8 +473,7 @@ def _import_code(code: str, name: str): return model def parse(self): - """ - Convert the graph into a neural network which can then + """Convert the graph into a neural network which can then be optimized by pytorch. """ for node_idx in lexicographical_topological_sort(self): @@ -626,12 +483,11 @@ def parse(self): f"{self.name}-subgraph_at({node_idx})", self.nodes[node_idx]["subgraph"], ) - else: - if isinstance(self.nodes[node_idx]["comb_op"], torch.nn.Module): - self.add_module( - f"{self.name}-comb_op_at({node_idx})", - self.nodes[node_idx]["comb_op"], - ) + elif isinstance(self.nodes[node_idx]["comb_op"], torch.nn.Module): + self.add_module( + f"{self.name}-comb_op_at({node_idx})", + self.nodes[node_idx]["comb_op"], + ) for neigbor_idx in self.neighbors(node_idx): edge_data = self.get_edge_data(node_idx, neigbor_idx) @@ -649,8 +505,7 @@ def parse(self): self.is_parsed = True def unparse(self): - """ - Undo the pytorch parsing by reconstructing the graph uusing the + """Undo the pytorch parsing by reconstructing the graph uusing the networkx data structures. This is done recursively also for child graphs. @@ -689,8 +544,7 @@ def unparse(self): return g def _get_child_graphs(self, single_instances: bool = False) -> list: - """ - Get all child graphs of the current graph. + """Get all child graphs of the current graph. 
Args: single_instances (bool): Whether to return multiple instances @@ -730,9 +584,7 @@ def _get_child_graphs(self, single_instances: bool = False) -> list: graphs.append(child_op._get_child_graphs()) else: logger.debug( - "Got embedded op, but is neither a graph nor a list: {}".format( - embedded_ops - ) + f"Got embedded op, but is neither a graph nor a list: {embedded_ops}" ) elif inspect.isclass(edge_data.op): assert not issubclass( @@ -744,7 +596,7 @@ def _get_child_graphs(self, single_instances: bool = False) -> list: else: raise ValueError(f"Unknown format of op: {edge_data.op}") - graphs = [g for g in iter_flatten(graphs)] + graphs = list(iter_flatten(graphs)) if single_instances: single: list = [] @@ -755,50 +607,9 @@ def _get_child_graphs(self, single_instances: bool = False) -> list: else: return sorted(graphs, key=lambda g: g.name) - def get_all_edge_data( - self, key: str, scope="all", private_edge_data: bool = False - ) -> list: - """ - Get edge attributes of this graph and all child graphs in one go. - - Args: - key (str): The key of the attribute - scope (str): The scope to be applied - private_edge_data (bool): Whether to return data from graph copies as well. - - Returns: - list: All data in a list. - """ - assert scope is not None - result = [] - for graph in self._get_child_graphs(single_instances=not private_edge_data) + [ - self - ]: - if ( - scope == "all" - or graph.scope == scope - or (isinstance(scope, list) and graph.scope in scope) - ): - for _, _, edge_data in graph.edges.data(): - if edge_data.has(key): - result.append(edge_data[key]) - return result - - def set_at_edges(self, key, value, shared=False): - """ - Sets the attribute for all edges in this and any child graph - """ - for graph in self._get_child_graphs(single_instances=shared) + [self]: - logger.debug(f"Updating edges of graph {graph.name}") - for _, _, edge_data in graph.edges.data(): - if not edge_data.is_final(): - edge_data.set(key, value, shared) - def compile(self): - """ - Instanciates the ops at the edges using the arguments specified at the edges - """ - for graph in self._get_child_graphs(single_instances=False) + [self]: + """Instanciates the ops at the edges using the arguments specified at the edges.""" + for graph in [*self._get_child_graphs(single_instances=False), self]: logger.debug(f"Compiling graph {graph.name}") for _, v, edge_data in graph.edges.data(): if not edge_data.is_final(): @@ -832,8 +643,7 @@ def compile(self): @staticmethod def _verify_update_function(update_func: Callable, private_edge_data: bool): - """ - Verify that the update function actually modifies only + """Verify that the update function actually modifies only shared/private edge data attributes based on setting of `private_edge_data`. @@ -844,7 +654,6 @@ def _verify_update_function(update_func: Callable, private_edge_data: bool): to all graph instances including copies or just to one instance per graph """ - test = EdgeData() test.set("shared", True, shared=True) test.set("op", [True]) @@ -881,8 +690,7 @@ def _verify_update_function(update_func: Callable, private_edge_data: bool): def update_edges( self, update_func: Callable, scope="all", private_edge_data: bool = False ): - """ - This updates the edge data of this graph and all child graphs. + """This updates the edge data of this graph and all child graphs. This is the preferred way to manipulate the edges after the definition of the graph, e.g. by optimizers who want to insert their own op. `update_func(current_edge_data)`. 
This way optimizers @@ -905,13 +713,14 @@ def update_edges( """ Graph._verify_update_function(update_func, private_edge_data) assert scope is not None - for graph in self._get_child_graphs(single_instances=not private_edge_data) + [ - self + for graph in [ + *self._get_child_graphs(single_instances=not private_edge_data), + self, ]: if ( - scope == "all" - or scope == graph.scope - or (isinstance(scope, list) and graph.scope in scope) + scope in ("all", graph.scope) + or isinstance(scope, list) + and graph.scope in scope ): logger.debug(f"Updating edges of graph {graph.name}") for u, v, edge_data in graph.edges.data(): @@ -923,8 +732,7 @@ def update_edges( def update_nodes( self, update_func: Callable, scope="all", single_instances: bool = True ): - """ - Update the nodes of the graph and its incoming and outgoing edges by iterating over the + """Update the nodes of the graph and its incoming and outgoing edges by iterating over the graph and applying `update_func` to each of it. This is the preferred way to change the search space once it has been defined. @@ -952,11 +760,11 @@ def update_nodes( with MixedOp or SampleOp) """ assert scope is not None - for graph in self._get_child_graphs(single_instances) + [self]: + for graph in [*self._get_child_graphs(single_instances), self]: if ( - scope == "all" - or graph.scope == scope - or (isinstance(scope, list) and graph.scope in scope) + scope in ("all", graph.scope) + or isinstance(scope, list) + and graph.scope in scope ): logger.debug(f"Updating nodes of graph {graph.name}") for node_idx in lexicographical_topological_sort(graph): @@ -973,11 +781,10 @@ def update_nodes( self._delete_flagged_edges() def _delete_flagged_edges(self): - """ - Delete edges which associated EdgeData is flagged as deleted. - """ - for graph in self._get_child_graphs(single_instances=False) + [ - self + """Delete edges which associated EdgeData is flagged as deleted.""" + for graph in [ + *self._get_child_graphs(single_instances=False), + self, ]: # we operate on shallow copies to_remove = [] for u, v, edge_data in graph.edges.data(): @@ -988,79 +795,16 @@ def _delete_flagged_edges(self): graph.remove_edges_from(to_remove) def clone(self): - """ - Deep copy of the current graph. + """Deep copy of the current graph. Returns: Graph: Deep copy of the graph. """ return copy.deepcopy(self) - def reset_weights(self, inplace: bool = False): - """ - Resets the weights for the 'op' at all edges. - - Args: - inplace (bool): Do the operation in place or - return a modified copy. - Returns: - Graph: Returns the modified version of the graph. - """ - - def weight_reset(m): - if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Linear): - m.reset_parameters() - - if inplace: - graph = self - else: - graph = self.clone() - - graph.apply(weight_reset) - - return graph - - def prepare_discretization(self): - """ - In some cases the search space is manipulated before the final - discretization is happening, e.g. DARTS. In such chases this should - be defined in the search space, so all optimizers can call it. - """ - - def prepare_evaluation(self): - """ - In some cases the evaluation architecture does not match the searched - one. An example is where the makro_model is extended to increase the - parameters. This is done here. - """ - - def get_dense_edges(self): - """ - Returns the edge indices (i, j) that would make a fully connected - DAG without circles such that i < j and i != j. Assumes nodes are - already created. - - Returns: - list: list of edge indices. 
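A quick standalone check of the dense edge enumeration removed here; the same pairs are produced by `_DenseNNodeDAG.get_edge_list` in `topologies.py` further down in this patch.

    nodes = [1, 2, 3, 4]
    dense_edges = [(i, j) for i in nodes for j in nodes if i < j]
    assert dense_edges == [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]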
- """ - edges = [] - nodes = sorted(list(self.nodes())) - for i in nodes: - for j in nodes: - if i != j and j > i: - edges.append((i, j)) - return edges - - def add_edges_densly(self): - """ - Adds edges to get a fully connected DAG without cycles - """ - self.add_edges_from(self.get_dense_edges()) - class EdgeData: - """ - Class that holds data for each edge. + """Class that holds data for each edge. Data can be shared between instances of the graph where the edges lives in. @@ -1071,10 +815,9 @@ class EdgeData: in a dict-like fashion with `[key]`. To set a new item use `.set()`. """ - def __init__(self, data: dict = None): - """ - Initializes a new EdgeData object. - 'op' is set as Identity() and private by default + def __init__(self, data: dict | None = None): + """Initializes a new EdgeData object. + 'op' is set as Identity() and private by default. Args: data (dict): Inject some initial data. Will be always private. @@ -1094,8 +837,7 @@ def __init__(self, data: dict = None): self.set(k, v, shared=False) def has(self, key: str): - """ - Checks whether `key` exists. + """Checks whether `key` exists. Args: key (str): The key to check. @@ -1105,7 +847,7 @@ def has(self, key: str): """ assert not key.startswith("_"), "Access to private keys not allowed!" - return key in self._private.keys() or key in self._shared.keys() + return key in self._private or key in self._shared def __getitem__(self, key: str): assert not str(key).startswith("_"), "Access to private keys not allowed!" @@ -1119,7 +861,7 @@ def get(self, key: str, default): def __getattr__(self, key: str): if key.startswith("__"): # Required for deepcopy, not sure why - raise AttributeError(key) # + raise AttributeError(key) assert not key.startswith("_"), "Access to private keys not allowed!" if key in self._private: return self._private[key] @@ -1135,14 +877,13 @@ def __setattr__(self, name: str, val): raise ValueError("not allowed. use set().") def __str__(self): - return f"private: <{str(self._private)}>, shared: <{str(self._shared)}>" + return f"private: <{self._private!s}>, shared: <{self._shared!s}>" def __repr__(self): return self.__str__() def update(self, data): - """ - Update the data in here. If the data is added as dict, + """Update the data in here. If the data is added as dict, then all variables will be handled as private. Args: @@ -1159,8 +900,7 @@ def update(self, data): raise ValueError(f"Unsupported type {data}") def remove(self, key: str): - """ - Removes an item from the EdgeData + """Removes an item from the EdgeData. Args: key (str): The key for the item to be removed. @@ -1173,8 +913,7 @@ def remove(self, key: str): raise KeyError(f"Tried to delete unkown key {key}") def copy(self): - """ - When a graph is copied to get multiple instances (e.g. when + """When a graph is copied to get multiple instances (e.g. when reusing subgraphs at more than one location) then this function will be called for all edges. @@ -1204,8 +943,7 @@ def copy(self): return new_self def set(self, key: str, value, shared=False): - """ - Used to assign a new item to the EdgeData object. + """Used to assign a new item to the EdgeData object. Args: key (str): The key. @@ -1214,9 +952,7 @@ def set(self, key: str, value, shared=False): be a shallow copy between different instances of EdgeData (and consequently between different instances of Graph). 
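A small usage sketch of the `EdgeData` API shown above, assuming the module path used throughout this patch; it exercises only behaviour visible in this diff.

    from neps.search_spaces.architecture.graph import EdgeData

    ed = EdgeData()
    ed.set("alpha", 0.5, shared=True)        # shallow-copied between graph instances
    ed.set("note", "private to this edge")   # private by default
    assert ed.has("alpha") and ed.alpha == 0.5

    ed.finalize()
    assert ed.is_final()
    # ed.set("alpha", 1.0)  # would now fail: "Trying to change finalized edge!"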
""" - assert isinstance(key, str), "Accepting only string keys, got {}".format( - type(key) - ) + assert isinstance(key, str), f"Accepting only string keys, got {type(key)}" assert not key.startswith("_"), "Access to private keys not allowed!" assert not self.is_final(), "Trying to change finalized edge!" if shared: @@ -1224,15 +960,13 @@ def set(self, key: str, value, shared=False): raise ValueError("Key {} alredy defined as non-shared") else: self._shared[key] = value + elif key in self._shared: + raise ValueError(f"Key {key} alredy defined as shared") else: - if key in self._shared: - raise ValueError(f"Key {key} alredy defined as shared") - else: - self._private[key] = value + self._private[key] = value def clone(self): - """ - Return a true deep copy of EdgeData. Even shared + """Return a true deep copy of EdgeData. Even shared items are not shared anymore. Returns: @@ -1241,20 +975,15 @@ def clone(self): return copy.deepcopy(self) def delete(self): - """ - Flag to delete the edge where this instance is attached to. - """ + """Flag to delete the edge where this instance is attached to.""" self._shared["_deleted"] = True def is_deleted(self): - """ - Returns true if the edge is flagged to be deleted - """ + """Returns true if the edge is flagged to be deleted.""" return self._shared["_deleted"] def finalize(self): - """ - Sets this edge as final. This means it cannot be changed + """Sets this edge as final. This means it cannot be changed anymore and will also not appear in the update functions of the graph. """ @@ -1262,9 +991,8 @@ def finalize(self): return self def is_final(self): - """ - Returns: - bool: True if the edge was finalized, False else + """Returns: + bool: True if the edge was finalized, False else. """ return self._private["_final"] diff --git a/neps/search_spaces/architecture/graph_grammar.py b/neps/search_spaces/architecture/graph_grammar.py index 1c9fa159..932de768 100644 --- a/neps/search_spaces/architecture/graph_grammar.py +++ b/neps/search_spaces/architecture/graph_grammar.py @@ -7,18 +7,23 @@ from typing import Any, ClassVar, Mapping from typing_extensions import override, Self from neps.utils.types import NotSet +from typing import TYPE_CHECKING, Any, ClassVar, Mapping +from typing_extensions import Self, override import networkx as nx import numpy as np -from nltk import Nonterminal -from ..parameter import ParameterWithPrior, MutatableParameter -from .cfg import Grammar -from .cfg_variants.constrained_cfg import ConstrainedGrammar +from neps.search_spaces.parameter import MutatableParameter, ParameterWithPrior +from neps.utils.types import NotSet + from .core_graph_grammar import CoreGraphGrammar from .crossover import repetitive_search_space_crossover, simple_crossover from .mutations import bananas_mutate, repetitive_search_space_mutation, simple_mutate +if TYPE_CHECKING: + from .cfg import Grammar + from .cfg_variants.constrained_cfg import ConstrainedGrammar + # TODO(eddiebergman): This is a halfway solution, but essentially a lot # of things `Parameter` does, does not fit nicely with a Graph based @@ -28,7 +33,9 @@ # The problem here is that the `Parameter` expects the `load_from` # and the `.value` to be the same type, which is not the case for # graph based parameters. 
-class GraphParameter(ParameterWithPrior[nx.DiGraph, str], MutatableParameter): +class GraphParameter( # noqa: D101 + ParameterWithPrior[nx.DiGraph, str], MutatableParameter +): # NOTE(eddiebergman): What I've managed to learn so far is that # these hyperparameters work mostly with strings externally, # i.e. setting the value through `load_from` or `set_value` should be a string. @@ -38,7 +45,8 @@ class GraphParameter(ParameterWithPrior[nx.DiGraph, str], MutatableParameter): # At serialization time, it doesn't actually serialize the .value but instead # relies on the string it was passed initially, I'm not actually sure if there's # a way to go from the graph object to the string in this code... - # Essentially on the outside, we need to ensure we don't pass ih the graph object itself + # Essentially on the outside, we need to ensure we don't pass ih the graph object + # itself DEFAULT_CONFIDENCE_SCORES: ClassVar[Mapping[str, float]] = {"not_in_use": 1.0} default_confidence_choice = "not_in_use" has_prior: bool @@ -46,26 +54,22 @@ class GraphParameter(ParameterWithPrior[nx.DiGraph, str], MutatableParameter): @property @abstractmethod - def id(self) -> str: - ... + def id(self) -> str: ... # NOTE(eddiebergman): Unlike traditional parameters, it seems @property @abstractmethod - def value(self) -> nx.DiGraph: - ... + def value(self) -> nx.DiGraph: ... # NOTE(eddiebergman): This is a function common to the three graph # parameters that is used for `load_from` @abstractmethod - def create_from_id(self, value: str) -> None: - ... + def create_from_id(self, value: str) -> None: ... # NOTE(eddiebergman): Function shared between graph parameters. # Used to `set_value()` @abstractmethod - def reset(self) -> None: - ... + def reset(self) -> None: ... @override def __eq__(self, other: Any) -> bool: @@ -75,8 +79,7 @@ def __eq__(self, other: Any) -> bool: return self.id == other.id @abstractmethod - def compute_prior(self, normalized_value: float) -> float: - ... + def compute_prior(self, normalized_value: float) -> float: ... @override def set_value(self, value: str | None) -> None: @@ -85,8 +88,8 @@ def set_value(self, value: str | None) -> None: # `self.value = None` if not isinstance(value, str): raise ValueError( - f"Expected a string for setting value a `GraphParameter`", - f" got {type(value)}" + "Expected a string for setting value a `GraphParameter`", + f" got {type(value)}", ) self.reset() self.normalized_value = value @@ -142,21 +145,22 @@ def load_from(self, value: str | Self) -> None: self.create_from_id(value) @abstractmethod - def mutate(self, parent: Self | None = None, *, - mutation_strategy: str = "bananas") -> Self: - ... + def mutate( # noqa: D102 + self, parent: Self | None = None, *, mutation_strategy: str = "bananas" + ) -> Self: ... @abstractmethod - def crossover(self, parent1: Self, parent2: Self | None = None) -> tuple[Self, Self]: - ... + def crossover( # noqa: D102 + self, parent1: Self, parent2: Self | None = None + ) -> tuple[Self, Self]: ... 
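Hedged usage sketch of the string-identifier contract described in the notes above: values are handled as identifier strings externally, while `.value` yields the parsed graph. `param` stands in for a concrete, already constructed `GraphParameter` subclass (e.g. a `GraphGrammar`); this snippet only exercises the interface declared in this diff.

    import networkx as nx

    sampled = param.sample()                   # fresh clone with a valid string tree
    assert isinstance(sampled.value, nx.DiGraph)

    restored = param.clone()
    restored.load_from(sampled.id)             # load_from accepts the id string (or Self)
    child = sampled.mutate(mutation_strategy="bananas")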
def _get_non_unique_neighbors(self, num_neighbours: int) -> list[Self]: raise NotImplementedError - def value_to_normalized(self, value: nx.DiGraph) -> float: + def value_to_normalized(self, value: nx.DiGraph) -> float: # noqa: D102 raise NotImplementedError - def normalized_to_value(self, normalized_value: float) -> nx.DiGraph: + def normalized_to_value(self, normalized_value: float) -> nx.DiGraph: # noqa: D102 raise NotImplementedError @override @@ -189,21 +193,25 @@ def clone(self) -> Self: class GraphGrammar(GraphParameter, CoreGraphGrammar): hp_name = "graph_grammar" - def __init__( + def __init__( # noqa: D107, PLR0913 self, grammar: Grammar, terminal_to_op_names: dict, - prior: dict = None, - terminal_to_graph_edges: dict = None, - edge_attr: bool = True, + prior: dict | None = None, + terminal_to_graph_edges: dict | None = None, + edge_attr: bool = True, # noqa: FBT001, FBT002 edge_label: str = "op_name", - zero_op: list = ["Zero", "zero"], - identity_op: list = ["Identity", "id"], - new_graph_repr_func: bool = False, - name: str = None, - scope: str = None, + zero_op: list | None = None, + identity_op: list | None = None, + new_graph_repr_func: bool = False, # noqa: FBT001, FBT002 + name: str | None = None, + scope: str | None = None, **kwargs, ): + if identity_op is None: + identity_op = ["Identity", "id"] + if zero_op is None: + zero_op = ["Zero", "zero"] if isinstance(grammar, list) and len(grammar) != 1: raise NotImplementedError("Does not support multiple grammars") @@ -236,7 +244,8 @@ def sample(self, *, user_priors: bool = False) -> Self: copy_self = self.clone() copy_self.reset() copy_self.string_tree = copy_self.grammars[0].sampler(1, user_priors=user_priors)[ - 0] + 0 + ] _ = copy_self.value # required for checking if graph is valid! return copy_self @@ -316,7 +325,7 @@ def compute_prior(self, *, log: bool = True) -> float: return self.grammars[0].compute_prior(self.string_tree, log=log) @property - def id(self) -> str: + def id(self) -> str: # noqa: D102 if self._function_id is None or self._function_id == "": if self.string_tree == "": raise ValueError("Cannot infer identifier!") @@ -327,7 +336,7 @@ def id(self) -> str: def id(self, value: str) -> None: self._function_id = value - def create_from_id(self, identifier: str) -> None: + def create_from_id(self, identifier: str) -> None: # noqa: D102 self.reset() self._function_id = identifier self.id = identifier @@ -335,724 +344,46 @@ def create_from_id(self, identifier: str) -> None: _ = self.value # required for checking if graph is valid! 
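Why the `GraphGrammar.__init__` signature above trades mutable defaults (`zero_op=["Zero", "zero"]`) for `None` plus an in-body fallback: a default list is created once and shared by every call. A minimal standalone illustration of the pitfall the refactor avoids.

    def bad(ops=[]):                 # the old pattern: one shared list for all calls
        ops.append("Zero")
        return ops

    def good(ops=None):              # the refactored pattern used above
        ops = ["Zero", "zero"] if ops is None else ops
        return ops

    bad()
    assert bad() == ["Zero", "Zero"]              # state leaked between calls
    assert good() == good() == ["Zero", "zero"]   # fresh list on every call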
@staticmethod - def id_to_string_tree(identifier: str) -> str: + def id_to_string_tree(identifier: str) -> str: # noqa: D102 return identifier @staticmethod - def string_tree_to_id(string_tree: str) -> str: + def string_tree_to_id(string_tree: str) -> str: # noqa: D102 return string_tree @property - def search_space_size(self) -> int: + def search_space_size(self) -> int: # noqa: D102 return self.grammars[0].compute_space_size @abstractmethod - def create_new_instance_from_id(self, identifier: str): + def create_new_instance_from_id(self, identifier: str): # noqa: D102 raise NotImplementedError - def reset(self) -> None: + def reset(self) -> None: # noqa: D102 self.clear_graph() self.string_tree = "" self.nxTree = None self._value = None self._function_id = "" - def compose_functions(self, flatten_graph: bool = True) -> nx.DiGraph: - return self._compose_functions(self.id, self.grammars[0], flatten_graph) - - def unparse_tree(self, identifier: str, as_composition: bool = True): - return self._unparse_tree(identifier, self.grammars[0], as_composition) - - def get_dictionary(self) -> dict[str, str]: - return {"graph_grammar": self.id} - - def create_nx_tree(self, string_tree: str) -> nx.DiGraph: - nxTree = self.from_stringTree_to_nxTree(string_tree, self.grammars[0]) - return self.prune_tree( - nxTree, terminal_to_torch_map_keys=self.terminal_to_op_names.keys() - ) - - -class GraphGrammarCell(GraphGrammar): - hp_name = "graph_grammar_cell" - - def __init__( + def compose_functions( # noqa: D102 self, - grammar: Grammar, - terminal_to_op_names: dict, - terminal_to_graph_edges: dict = None, - edge_attr: bool = True, - edge_label: str = "op_name", - zero_op: list = ["Zero", "zero"], - identity_op: list = ["Identity", "id"], - name: str = None, - scope: str = None, - **kwargs, - ): - super().__init__( - grammar, - terminal_to_op_names, - terminal_to_graph_edges, - edge_attr=edge_attr, - edge_label=edge_label, - zero_op=zero_op, - identity_op=identity_op, - name=name, - scope=scope, - **kwargs, - ) - - self.cell = None - - def reset(self) -> None: - super().reset() - self.cell = None - - @abstractmethod - def create_graph_from_string(self, child: str): - raise NotImplementedError - - -class GraphGrammarRepetitive(GraphParameter, CoreGraphGrammar): - hp_name = "graph_grammar_repetitive" - - def __init__( - self, - grammars: list[Grammar], - terminal_to_op_names: dict, - terminal_to_sublanguage_map: dict, - number_of_repetitive_motifs: int, - terminal_to_graph_edges: dict = None, - edge_attr: bool = True, - edge_label: str = "op_name", - zero_op: list = ["Zero", "zero"], - identity_op: list = ["Identity", "id"], - name: str = None, - scope: str = None, - ): - CoreGraphGrammar.__init__( - self, - grammars=grammars, - terminal_to_op_names=terminal_to_op_names, - terminal_to_graph_edges=terminal_to_graph_edges, - edge_attr=edge_attr, - edge_label=edge_label, - zero_op=zero_op, - identity_op=identity_op, - name=name, - scope=scope, - ) - GraphParameter.__init__(self, value=None, default=None, is_fidelity=False) - - self.id: str = "" - self.string_tree: str = "" - self.string_tree_list: list[str] = [] - self.nxTree: nx.DiGraph | None = None - self._value: nx.DiGraph | None = None - - self.full_grammar = self.get_full_grammar(self.grammars) - self.terminal_to_sublanguage_map = terminal_to_sublanguage_map - self.number_of_repetitive_motifs = number_of_repetitive_motifs - - @override - def mutate( - self, - parent: Self | None = None, - mutation_rate: float = 1.0, - mutation_strategy: str = "bananas", - ) 
-> Self: - raise NotImplementedError - if parent is None: - parent = self - - # bananas mutate - if mutation_strategy == "bananas": - inner_mutation_strategy = partial(bananas_mutate, mutation_rate=mutation_rate) - child_string_tree_list, is_same = repetitive_search_space_mutation( - base_parent=parent.string_tree_list[0], - motif_parents=parent.string_tree_list[1:], - base_grammar=self.grammars[0], - motif_grammars=self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - inner_mutation_strategy=inner_mutation_strategy, - ) - else: - child_string_tree_list, is_same = repetitive_search_space_mutation( - base_parent=parent.string_tree_list[0], - motif_parents=parent.string_tree_list[1:], - base_grammar=self.grammars[0], - motif_grammars=self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - inner_mutation_strategy=super().mutate, - ) - - if all(is_same): - raise ValueError("Parent is the same as child!") - - return self.create_graph_from_string(child_string_tree_list) - - @override - def crossover( - self, - parent1: Self, - parent2: Self | None = None, - ) -> tuple[Self, Self]: - raise NotImplementedError - if parent2 is None: - parent2 = self - children = repetitive_search_space_crossover( - base_parent=(parent1.string_tree_list[0], parent2.string_tree_list[0]), - motif_parents=(parent1.string_tree_list[1:], parent2.string_tree_list[1:]), - base_grammar=self.grammars[0], - motif_grammars=self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - inner_crossover_strategy=simple_crossover, - ) - if all(not c for c in children): - raise Exception("Cannot create crossover") - return [parent2.create_graph_from_string(child) for child in children] - - @override - def sample(self, *, user_priors: bool = False) -> Self: - copy_self = self.clone() - copy_self.reset() - copy_self.string_tree_list = [grammar.sampler(1)[0] for grammar in - copy_self.grammars] - copy_self.string_tree = copy_self.assemble_trees( - copy_self.string_tree_list[0], - copy_self.string_tree_list[1:], - terminal_to_sublanguage_map=copy_self.terminal_to_sublanguage_map, - ) - copy_self.id = "\n".join(copy_self.string_tree_list) - _ = copy_self.value # required for checking if graph is valid! 
- return copy_self - - @property - @override - def value(self) -> nx.DiGraph: - if self._value is None: - _val = self.from_stringTree_to_graph_repr( - self.string_tree, - self.full_grammar, - valid_terminals=self.terminal_to_op_names.keys(), - edge_attr=self.edge_attr, - ) - assert isinstance(_val, nx.DiGraph) - self._value = _val - return self._value - - @override - def compute_prior(self, *, log: bool = True) -> float: - prior_probs = [ - g.compute_prior(st, log=log) - for g, st in zip(self.grammars, self.string_tree_list) - ] - if log: - return sum(prior_probs) - else: - return np.prod(prior_probs) - - def __eq__(self, other: Any) -> bool: - if not isinstance(other, GraphGrammarRepetitive): - return NotImplemented - - return self.id == other.id - - def reset(self) -> None: - self.clear_graph() - self.string_tree_list = [] - self.string_tree = "" - self.nxTree = None - self._value = None - self.id = "" - - @staticmethod - def get_full_grammar(grammars): - full_grammar = deepcopy(grammars[0]) - rules = full_grammar.productions() - nonterminals = full_grammar.nonterminals - terminals = full_grammar.terminals - for g in grammars[1:]: - rules.extend(g.productions()) - nonterminals.extend(g.nonterminals) - terminals.extend(g.terminals) - return full_grammar - - @abstractmethod - def create_graph_from_string(self, child: list[str]): - raise NotImplementedError - - def get_dictionary(self) -> dict[str, str]: - return {"graph_grammar": "\n".join(self.string_tree_list)} - - def create_nx_tree(self, string_tree: str) -> nx.DiGraph: - nxTree = self.from_stringTree_to_nxTree(string_tree, self.full_grammar) - return self.prune_tree( - nxTree, terminal_to_torch_map_keys=self.terminal_to_op_names.keys() - ) - - def create_from_id(self, identifier: str | list[str]) -> None: - self.reset() - self.string_tree_list = ( - identifier.split("\n") if isinstance(identifier, str) else identifier - ) - self.string_tree = self.assemble_trees( - self.string_tree_list[0], - self.string_tree_list[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - ) - self.id = "\n".join(self.string_tree_list) - _ = self.value # required for checking if graph is valid! 
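For reference, the identifier convention used by the repetitive grammar classes removed in this hunk: one string tree per grammar, joined by newlines (`id = "\n".join(...)`, recovered by `split("\n")` in `create_from_id`). The trees below are made-up placeholders.

    string_tree_list = ["(MACRO (CELL) (CELL))", "(CELL (OP conv3x3))"]
    identifier = "\n".join(string_tree_list)
    assert identifier.split("\n") == string_tree_list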
- - @property - def search_space_size(self) -> int: - def recursive_worker( - nonterminal: Nonterminal, grammar, lower_level_motifs: int = 0 - ) -> int: - primitive_nonterminal = "OPS" - if str(nonterminal) == primitive_nonterminal: - return ( - lower_level_motifs * self.number_of_repetitive_motifs - + len(grammar.productions(lhs=Nonterminal(primitive_nonterminal))) - - self.number_of_repetitive_motifs - ) - potential_productions = grammar.productions(lhs=nonterminal) - _possibilites = 0 - for potential_production in potential_productions: - edges_nonterminals = [ - rhs_sym - for rhs_sym in potential_production.rhs() - if str(rhs_sym) in grammar.nonterminals - ] - possibilities_per_edge = [ - recursive_worker(e_nonterminal, grammar, lower_level_motifs) - for e_nonterminal in edges_nonterminals - ] - product = 1 - for p in possibilities_per_edge: - product *= p - _possibilites += product - return _possibilites - - lower_level_motifs = recursive_worker(self.grammars[1].start(), self.grammars[1]) - return recursive_worker( - self.grammars[0].start(), - self.grammars[0], - lower_level_motifs=lower_level_motifs, - ) - - -class GraphGrammarMultipleRepetitive(GraphParameter, CoreGraphGrammar): - hp_name = "graph_grammar_multiple_repetitive" + flatten_graph: bool = True, # noqa: FBT001, FBT002 + ) -> nx.DiGraph: + return self._compose_functions(self.id, self.grammars[0], flatten_graph) - def __init__( + def unparse_tree( # noqa: D102 self, - grammars: list[Grammar] | list[ConstrainedGrammar], - terminal_to_op_names: dict, - terminal_to_sublanguage_map: dict, - prior: list[dict] = None, - terminal_to_graph_edges: dict = None, - fixed_macro_grammar: bool = False, - edge_attr: bool = True, - edge_label: str = "op_name", - zero_op: list = ["Zero", "zero"], - identity_op: list = ["Identity", "id"], - name: str = None, - scope: str = None, - **kwargs, + identifier: str, + as_composition: bool = True, # noqa: FBT001, FBT002 ): - def _check_mapping(macro_grammar, motif_grammars, terminal_to_sublanguage_map): - for terminal, start_symbol in terminal_to_sublanguage_map.items(): - if terminal not in macro_grammar.terminals: - raise Exception(f"Terminal {terminal} not defined in macro grammar") - if not any( - start_symbol == str(grammar.start()) for grammar in motif_grammars - ): - raise Exception( - f"Start symbol {start_symbol} not defined in motif grammar" - ) - - def _identify_macro_grammar(grammar, terminal_to_sublanguage_map): - grammars = deepcopy(grammar) - motif_grammars = [] - for start_symbol in terminal_to_sublanguage_map.values(): - motif_grammars += [ - grammar - for grammar in grammars - if start_symbol == str(grammar.start()) - ] - grammars = [ - grammar - for grammar in grammars - if start_symbol != str(grammar.start()) - ] - if len(grammars) != 1: - raise Exception("Cannot identify macro grammar") - return grammars[0], motif_grammars - - if prior is not None: - assert len(grammars) == len( - prior - ), "At least one of the grammars has no prior defined!" 
- for g, p in zip(grammars, prior): - g.prior = p - self.has_prior = prior is not None - - self.macro_grammar, grammars = _identify_macro_grammar( - grammars, terminal_to_sublanguage_map - ) - _check_mapping(self.macro_grammar, grammars, terminal_to_sublanguage_map) - - self.fixed_macro_grammar = fixed_macro_grammar - if not self.fixed_macro_grammar: - grammars.insert(0, self.macro_grammar) - - self.terminal_to_sublanguage_map = OrderedDict(terminal_to_sublanguage_map) - if any( - k in terminal_to_op_names for k in self.terminal_to_sublanguage_map.keys() - ): - raise Exception( - f"Terminals {[k for k in self.terminal_to_sublanguage_map.keys()]} already defined in primitives mapping and cannot be used for repetitive substitutions" - ) - self.number_of_repetitive_motifs_per_grammar = [ - sum( - map( - (str(grammar.start())).__eq__, - self.terminal_to_sublanguage_map.values(), - ) - ) - if str(grammar.start()) in self.terminal_to_sublanguage_map.values() - else 1 - for grammar in grammars - ] - - CoreGraphGrammar.__init__( - self, - grammars=grammars, - terminal_to_op_names={ - **terminal_to_op_names, - **self.terminal_to_sublanguage_map, - }, - terminal_to_graph_edges=terminal_to_graph_edges, - edge_attr=edge_attr, - edge_label=edge_label, - zero_op=zero_op, - identity_op=identity_op, - name=name, - scope=scope, - **kwargs, - ) - GraphParameter.__init__(self, value=None, default=None, is_fidelity=False) - - self._function_id: str = "" - self.string_tree: str = "" - self.string_tree_list: list[str] = [] - self.nxTree: nx.DiGraph | None = None - self._value: nx.DiGraph | None = None - - if self.fixed_macro_grammar: - self.fixed_macro_string_tree = self.macro_grammar.sampler(1)[0] - - if self.fixed_macro_grammar: - self.full_grammar = self.get_full_grammar( - [self.macro_grammar] + self.grammars - ) - else: - self.full_grammar = self.get_full_grammar(self.grammars) - - @override - def sample(self, *, user_priors: bool = False) -> Self: - copy_self = self.clone() - copy_self.reset() - copy_self.string_tree_list = [ - grammar.sampler(1, user_priors=user_priors)[0] - for grammar, number_of_motifs in zip( - copy_self.grammars, copy_self.number_of_repetitive_motifs_per_grammar - ) - for _ in range(number_of_motifs) - ] - copy_self.string_tree = copy_self.assemble_string_tree(copy_self.string_tree_list) - _ = copy_self.value # required for checking if graph is valid! 
- return copy_self - - @property - @override - def value(self) -> nx.DiGraph: - if self._value is None: - if self.fixed_macro_grammar: - self._value = [] - string_list_idx = 0 - for grammar, number_of_motifs in zip( - self.grammars, self.number_of_repetitive_motifs_per_grammar - ): - for _ in range(number_of_motifs): - self._value.append( - self.from_stringTree_to_graph_repr( - self.string_tree_list[string_list_idx], - grammar, - valid_terminals=self.terminal_to_op_names.keys(), - edge_attr=self.edge_attr, - ) - ) - string_list_idx += 1 - self._value = self._value[0] # TODO trick - else: - self._value = self.from_stringTree_to_graph_repr( - self.string_tree, - self.full_grammar, - valid_terminals=self.terminal_to_op_names.keys(), - edge_attr=self.edge_attr, - ) - motif_trees = self.string_tree_list[1:] - repetitive_mapping = { - replacement: motif - for motif, replacement in zip( - self.terminal_to_sublanguage_map.keys(), motif_trees - ) - } - for subgraph in self._value[1].values(): - old_node_attributes = nx.get_node_attributes(subgraph, "op_name") - new_node_labels = { - k: (repetitive_mapping[v] if v in motif_trees else v) - for k, v in old_node_attributes.items() - } - nx.set_node_attributes(subgraph, new_node_labels, name="op_name") - return self._value - - @override - def mutate( - self, - parent: Self | None = None, - mutation_rate: float = 1.0, - mutation_strategy: str = "bananas", - ) -> Self: - if parent is None: - parent = self - - bananas_inner_mutation = partial(bananas_mutate, mutation_rate=mutation_rate) - child_string_tree_list, is_same = repetitive_search_space_mutation( - base_parent=self.fixed_macro_string_tree - if self.fixed_macro_grammar - else parent.string_tree_list[0], - motif_parents=parent.string_tree_list - if self.fixed_macro_grammar - else parent.string_tree_list[1:], - base_grammar=self.macro_grammar, - motif_grammars=self.grammars - if self.fixed_macro_grammar - else self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - number_of_repetitive_motifs_per_grammar=self.number_of_repetitive_motifs_per_grammar, - inner_mutation_strategy=bananas_inner_mutation - if mutation_strategy == "bananas" - else super().mutate, - fixed_macro_parent=self.fixed_macro_grammar, - ) - - if all(is_same): - raise ValueError("Parent is the same as child!") - - if self.fixed_macro_grammar: - child_string_tree_list = child_string_tree_list[1:] - - return self.create_new_instance_from_id( - self.string_tree_list_to_id(child_string_tree_list) - ) - - @override - def crossover( - self, - parent1: Self, - parent2: Self | None = None, - ) -> tuple[Self, Self]: - if parent2 is None: - parent2 = self - children = repetitive_search_space_crossover( - base_parent=(parent1.fixed_macro_string_tree, parent2.fixed_macro_string_tree) - if self.fixed_macro_grammar - else (parent1.string_tree_list[0], parent2.string_tree_list[0]), - motif_parents=(parent1.string_tree_list, parent2.string_tree_list) - if self.fixed_macro_grammar - else (parent1.string_tree_list[1:], parent2.string_tree_list[1:]), - base_grammar=self.macro_grammar, - motif_grammars=self.grammars - if self.fixed_macro_grammar - else self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - number_of_repetitive_motifs_per_grammar=self.number_of_repetitive_motifs_per_grammar, - inner_crossover_strategy=simple_crossover, - fixed_macro_parent=self.fixed_macro_grammar, - multiple_repetitive=True, - ) - if all(not c for c in children): - raise Exception("Cannot create crossover") - - 
return tuple( - parent2.create_new_instance_from_id( - self.string_tree_list_to_id( - child[1:] if self.fixed_macro_grammar else child - ) - ) - for child in children - ) - - @override - def compute_prior(self, *, log: bool = True) -> float: - prior_probs = [ - g.compute_prior(st, log=log) - for g, st in zip(self.grammars, self.string_tree_list) - ] - if log: - return sum(prior_probs) - else: - return np.prod(prior_probs) - - @property - def id(self) -> str: - if self._function_id is None or self._function_id == "": - if len(self.string_tree_list) == 0: - raise ValueError("Cannot infer identifier") - self._function_id = self.string_tree_list_to_id(self.string_tree_list) - return self._function_id - - @id.setter - def id(self, value: str) -> None: - self._function_id = value - - @staticmethod - def id_to_string_tree_list(identifier: str) -> list[str]: - return identifier.split("\n") - - def id_to_string_tree(self, identifier: str) -> str: - string_tree_list = self.id_to_string_tree_list(identifier) - return self.assemble_string_tree(string_tree_list) - - @staticmethod - def string_tree_list_to_id(string_tree_list: list[str]) -> str: - return "\n".join(string_tree_list) - - def string_tree_to_id(self, string_tree: str) -> str: - raise NotImplementedError - - def assemble_string_tree(self, string_tree_list: list[str]) -> str: - if self.fixed_macro_grammar: - string_tree = self.assemble_trees( - self.fixed_macro_string_tree, - string_tree_list, - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - ) - else: - string_tree = self.assemble_trees( - string_tree_list[0], - string_tree_list[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - ) - return string_tree - - def __eq__(self, other: Any) -> bool: - if not isinstance(other, GraphGrammarMultipleRepetitive): - return NotImplemented - return self.id == other.id - - def reset(self) -> None: - self.clear_graph() - self.string_tree_list = [] - self.string_tree = "" - self.nxTree = None - self._value = None - self._function_id = "" - - def compose_functions(self, flatten_graph: bool = True): - return self._compose_functions(self.id, self.full_grammar, flatten_graph) - - def unparse_tree(self, identifier: str, as_composition: bool = True): - return self._unparse_tree(identifier, self.full_grammar, as_composition) - - @staticmethod - def get_full_grammar(grammars): - full_grammar = deepcopy(grammars[0]) - rules = full_grammar.productions() - nonterminals = full_grammar.nonterminals - terminals = full_grammar.terminals - for g in grammars[1:]: - rules.extend(g.productions()) - nonterminals.extend(g.nonterminals) - terminals.extend(g.terminals) - return full_grammar - - @abstractmethod - def create_new_instance_from_id(self, child: str): - raise NotImplementedError + return self._unparse_tree(identifier, self.grammars[0], as_composition) - def get_dictionary(self) -> dict[str, str]: + def get_dictionary(self) -> dict[str, str]: # noqa: D102 return {"graph_grammar": self.id} - def create_nx_tree(self, string_tree: str) -> nx.DiGraph: - nxTree = self.from_stringTree_to_nxTree(string_tree, self.full_grammar) + def create_nx_tree(self, string_tree: str) -> nx.DiGraph: # noqa: D102 + nxTree = self.from_stringTree_to_nxTree(string_tree, self.grammars[0]) return self.prune_tree( nxTree, terminal_to_torch_map_keys=self.terminal_to_op_names.keys() ) - - def create_from_id(self, identifier: str) -> None: - self.reset() - self.id = identifier - self.string_tree_list = self.id_to_string_tree_list(self.id) - self.string_tree = 
self.id_to_string_tree(self.id) - _ = self.value # required for checking if graph is valid! - - @property - def search_space_size(self) -> int: - def recursive_worker( - nonterminal: Nonterminal, grammar, lower_level_motifs: dict = None - ) -> int: - if lower_level_motifs is None: - lower_level_motifs = {} - potential_productions = grammar.productions(lhs=nonterminal) - _possibilites = 0 - for potential_production in potential_productions: - edges_nonterminals = [ - rhs_sym - for rhs_sym in potential_production.rhs() - if str(rhs_sym) in grammar.nonterminals - ] - possibilities_per_edge = [ - recursive_worker(e_nonterminal, grammar, lower_level_motifs) - for e_nonterminal in edges_nonterminals - ] - possibilities_per_edge += [ - lower_level_motifs[str(rhs_sym)] - for rhs_sym in potential_production.rhs() - if str(rhs_sym) in lower_level_motifs.keys() - ] - product = 1 - for p in possibilities_per_edge: - product *= p - _possibilites += product - return _possibilites - - if self.fixed_macro_grammar: - if len(self.grammars) > 1: - raise Exception( - "Compute space size for fixed macro only works for one repetitive level" - ) - return np.prod( - [ - grammar.compute_space_size - for grammar, n_grammar in zip( - self.grammars, self.number_of_repetitive_motifs_per_grammar - ) - for _ in range(n_grammar) - ] - ) - else: - if len(self.grammars) > 2: - raise Exception( - "Compute space size for no fixed macro only works for one repetitive level" - ) - macro_space_size = self.grammars[0].compute_space_size - motif_space_size = self.grammars[1].compute_space_size - return ( - macro_space_size - // self.number_of_repetitive_motifs_per_grammar[1] - * motif_space_size - ) diff --git a/neps/search_spaces/architecture/mutations.py b/neps/search_spaces/architecture/mutations.py index f07e3347..c588836a 100644 --- a/neps/search_spaces/architecture/mutations.py +++ b/neps/search_spaces/architecture/mutations.py @@ -1,10 +1,13 @@ +from __future__ import annotations # noqa: D100 + import random -from typing import Callable, List, Tuple +from typing import TYPE_CHECKING, Callable -from .cfg import Grammar +if TYPE_CHECKING: + from .cfg import Grammar -def simple_mutate(parent_string_tree: str, grammar: Grammar) -> Tuple[str, bool]: +def simple_mutate(parent_string_tree: str, grammar: Grammar) -> tuple[str, bool]: # noqa: D103 # works if there is only one grammar # randomly choose a subtree from the parent and replace # with a new randomly generated subtree @@ -19,17 +22,17 @@ def simple_mutate(parent_string_tree: str, grammar: Grammar) -> Tuple[str, bool] return child_string_tree, parent_string_tree == child_string_tree -def bananas_mutate( +def bananas_mutate( # noqa: D103 parent_string_tree: str, grammar: Grammar, mutation_rate: float = 1.0, - mutation_prob: float = None, + mutation_prob: float | None = None, patience: int = 50, -) -> Tuple[str, bool]: +) -> tuple[str, bool]: split_tree = parent_string_tree.split(" ") swappable_indices = [ i - for i in range(0, len(split_tree)) + for i in range(len(split_tree)) if split_tree[i][1:] in grammar.swappable_nonterminals ] _mutation_prob = ( @@ -40,7 +43,7 @@ def bananas_mutate( idx = 0 while idx < len(swappable_indices): swap_idx = swappable_indices[idx] - if random.random() < _mutation_prob: + if random.random() < _mutation_prob: # noqa: S311 subtree_node = split_tree[swap_idx][1:] subtree_idx = swap_idx child_string_tree = grammar.mutate( @@ -54,7 +57,7 @@ def bananas_mutate( split_tree = child_string_tree.split(" ") swappable_indices = [ i - for i in range(0, 
len(split_tree)) + for i in range(len(split_tree)) if split_tree[i][1:] in grammar.swappable_nonterminals ] _mutation_prob = ( @@ -67,18 +70,18 @@ def bananas_mutate( return child_string_tree, child_string_tree == parent_string_tree -def repetitive_search_space_mutation( +def repetitive_search_space_mutation( # noqa: D103 base_parent: str, - motif_parents: List[str], + motif_parents: list[str], base_grammar: Grammar, - motif_grammars: List[Grammar], + motif_grammars: list[Grammar], terminal_to_sublanguage_map: dict, number_of_repetitive_motifs_per_grammar: list, inner_mutation_strategy: Callable, mutation_rate: float = 1.0, - mutation_prob: float = None, - fixed_macro_parent: bool = False, -) -> Tuple[List[str], List[bool]]: + mutation_prob: float | None = None, + fixed_macro_parent: bool = False, # noqa: FBT001, FBT002 +) -> tuple[list[str], list[bool]]: def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): return [ i @@ -97,7 +100,7 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): ) child_string_trees = [] - if not fixed_macro_parent and random.random() < mutation_prob: + if not fixed_macro_parent and random.random() < mutation_prob: # noqa: S311 child_string_trees.append(inner_mutation_strategy(base_parent, base_grammar)) indices = _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map) mutation_prob = ( @@ -116,22 +119,12 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): motif_grammars, _number_of_repetitive_motifs_per_grammar ): for _ in range(number_of_motifs): - if parent_string_idx in indices and random.random() < mutation_prob: + if parent_string_idx in indices and random.random() < mutation_prob: # noqa: S311 child_string_trees.append( inner_mutation_strategy(motif_parents[parent_string_idx], grammar) ) else: child_string_trees.append((motif_parents[parent_string_idx], True)) parent_string_idx += 1 - # child_string_trees.extend( - # [ - # inner_mutation_strategy(parent_string_tree, grammar) - # if i in indices and random.random() < mutation_prob - # else (parent_string_tree, True) - # for i, (parent_string_tree, grammar) in enumerate( - # zip(motif_parents, motif_grammars) - # ) - # ] - # ) return [c[0] for c in child_string_trees], [c[1] for c in child_string_trees] diff --git a/neps/search_spaces/architecture/primitives.py b/neps/search_spaces/architecture/primitives.py index eebb828d..916c0fc7 100644 --- a/neps/search_spaces/architecture/primitives.py +++ b/neps/search_spaces/architecture/primitives.py @@ -1,3 +1,5 @@ +from __future__ import annotations # noqa: D100 + from abc import ABCMeta, abstractmethod import torch @@ -5,8 +7,7 @@ class _AbstractPrimitive(nn.Module, metaclass=ABCMeta): - """ - Use this class when creating new operations for edges. + """Use this class when creating new operations for edges. This is required because we are agnostic to operations at the edges. As a consequence, they can contain subgraphs @@ -24,15 +25,12 @@ def __init__(self, kwargs): @abstractmethod def forward(self, x): - """ - The forward processing of the operation. - """ + """The forward processing of the operation.""" raise NotImplementedError @abstractmethod def get_embedded_ops(self): - """ - Return any embedded ops so that they can be + """Return any embedded ops so that they can be analysed whether they contain a child graph, e.g. a 'motif' in the hierachical search space. 
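A standalone sketch of the per-nonterminal mutation probability used in `bananas_mutate` above: every swappable nonterminal token of the string tree is considered independently with probability `mutation_rate / #swappable` (unless `mutation_prob` is given). The grammar and tree below are made up.

    import random

    split_tree = "(DAG (OP conv3x3) (OP id))".split(" ")
    swappable_nonterminals = {"OP"}
    swappable = [i for i, tok in enumerate(split_tree) if tok[1:] in swappable_nonterminals]
    assert swappable == [1, 3]

    mutation_rate = 1.0
    mutation_prob = mutation_rate / len(swappable)   # 0.5 per swappable subtree
    to_mutate = [i for i in swappable if random.random() < mutation_prob]
    # each selected index would be handed to grammar.mutate(...) in the real code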
@@ -46,86 +44,88 @@ def get_op_name(self): return type(self).__name__ -class AbstractPrimitive(_AbstractPrimitive): - def forward(self, x): +class AbstractPrimitive(_AbstractPrimitive): # noqa: D101 + def forward(self, x): # noqa: D102 raise NotImplementedError - def get_embedded_ops(self): + def get_embedded_ops(self): # noqa: D102 return None class Identity(AbstractPrimitive): - """ - An implementation of the Identity operation. - """ + """An implementation of the Identity operation.""" - def __init__(self, **kwargs): + def __init__(self, **kwargs): # noqa: D107 super().__init__(locals()) - def forward(self, x): + def forward(self, x: object) -> object: # noqa: D102 return x class Zero(AbstractPrimitive): - """ - Implementation of the zero operation. It removes + """Implementation of the zero operation. It removes the connection by multiplying its input with zero. """ def __init__(self, stride, **kwargs): - """ - When setting stride > 1 then it is assumed that the + """When setting stride > 1 then it is assumed that the channels must be doubled. """ super().__init__(locals()) self.stride = int(stride) - def forward(self, x): + def forward(self, x): # noqa: D102 if self.stride == 1: return x.mul(0.0) - else: - return x[:, :, :: self.stride, :: self.stride].mul(0.0) + + return x[:, :, :: self.stride, :: self.stride].mul(0.0) def __repr__(self): return f"" class Zero1x1(AbstractPrimitive): - """ - Implementation of the zero operation. It removes + """Implementation of the zero operation. It removes the connection by multiplying its input with zero. """ def __init__(self, stride, **kwargs): - """ - When setting stride > 1 then it is assumed that the + """When setting stride > 1 then it is assumed that the channels must be doubled. """ super().__init__(locals()) self.stride = int(stride) - def forward(self, x): + def forward(self, x): # noqa: D102 if self.stride == 1: return x.mul(0.0) - else: - x = x[:, :, :: self.stride, :: self.stride].mul(0.0) - return torch.cat([x, x], dim=1) # double the channels TODO: ugly as hell + + x = x[:, :, :: self.stride, :: self.stride].mul(0.0) + return torch.cat([x, x], dim=1) # double the channels TODO: ugly as hell def __repr__(self): return f"" class SepConv(AbstractPrimitive): - """ - Implementation of Separable convolution operation as + """Implementation of Separable convolution operation as in the DARTS paper, i.e. 2 sepconv directly after another. 
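Quick behavioural check of the `Zero` primitive shown above, assuming the module path used throughout this patch: stride 1 only zeroes the input, stride 2 additionally subsamples it spatially.

    import torch
    from neps.search_spaces.architecture.primitives import Zero

    x = torch.randn(2, 8, 16, 16)
    assert Zero(stride=1)(x).abs().sum() == 0
    assert Zero(stride=2)(x).shape == (2, 8, 8, 8)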
""" - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True, **kwargs): + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride: int, + padding: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) - C_in = int(C_in) - C_out = int(C_out) + c_in = int(c_in) + c_out = int(c_out) kernel_size = int(kernel_size) stride = int(stride) padding = int(padding) @@ -135,53 +135,60 @@ def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True, **kwa self.op = nn.Sequential( nn.ReLU(inplace=False), nn.Conv2d( - C_in, - C_in, + c_in, + c_in, kernel_size=kernel_size, stride=stride, padding=padding, - groups=C_in, + groups=c_in, bias=False, ), - nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False), - nn.BatchNorm2d(C_in, affine=affine), + nn.Conv2d(c_in, c_in, kernel_size=1, padding=0, bias=False), + nn.BatchNorm2d(c_in, affine=affine), nn.ReLU(inplace=False), nn.Conv2d( - C_in, - C_in, + c_in, + c_in, kernel_size=kernel_size, stride=1, padding=padding, - groups=C_in, + groups=c_in, bias=False, ), - nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size=1, padding=0, bias=False), + nn.BatchNorm2d(c_out, affine=affine), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class DilConv(AbstractPrimitive): - """ - Implementation of a dilated separable convolution as + """Implementation of a dilated separable convolution as used in the DARTS paper. """ - def __init__( - self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True, **kwargs + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride: int, + padding: int, + dilation: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, ): super().__init__(locals()) - C_in = int(C_in) - C_out = int(C_out) + c_in = int(c_in) + c_out = int(c_out) kernel_size = int(kernel_size) stride = int(stride) padding = int(padding) @@ -192,68 +199,66 @@ def __init__( self.op = nn.Sequential( nn.ReLU(inplace=False), nn.Conv2d( - C_in, - C_in, + c_in, + c_in, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, - groups=C_in, + groups=c_in, bias=False, ), - nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size=1, padding=0, bias=False), + nn.BatchNorm2d(c_out, affine=affine), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class Stem(AbstractPrimitive): - """ - This is used as an initial layer directly after the + """This is used as an initial layer directly after the image input. 
""" - def __init__(self, C_out, C_in=3, **kwargs): + def __init__(self, c_out: int, c_in: int = 3, **kwargs): # noqa: D107 super().__init__(locals()) - C_out = int(C_out) + c_out = int(c_out) self.seq = nn.Sequential( - nn.Conv2d(C_in, C_out, 3, padding=1, bias=False), nn.BatchNorm2d(C_out) + nn.Conv2d(c_in, c_out, 3, padding=1, bias=False), nn.BatchNorm2d(c_out) ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.seq(x) class Sequential(AbstractPrimitive): - """ - Implementation of `torch.nn.Sequential` to be used + """Implementation of `torch.nn.Sequential` to be used as op on edges. """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): # noqa: D107 super().__init__(locals()) self.primitives = args self.op = nn.Sequential(*args) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) - def get_embedded_ops(self): + def get_embedded_ops(self): # noqa: D102 return list(self.primitives) -class MaxPool(AbstractPrimitive): - def __init__(self, kernel_size, stride, **kwargs): +class MaxPool(AbstractPrimitive): # noqa: D101 + def __init__(self, kernel_size: int, stride: int, **kwargs): # noqa: D107 super().__init__(locals()) kernel_size = int(kernel_size) @@ -261,35 +266,42 @@ def __init__(self, kernel_size, stride, **kwargs): self.maxpool = nn.MaxPool2d(kernel_size, stride=stride, padding=1) - def forward(self, x): - x = self.maxpool(x) - return x + def forward(self, x): # noqa: D102 + return self.maxpool(x) class MaxPool1x1(AbstractPrimitive): - """ - Implementation of MaxPool with an optional 1x1 convolution + """Implementation of MaxPool with an optional 1x1 convolution in case stride > 1. The 1x1 convolution is required to increase the number of channels. """ - def __init__(self, kernel_size, stride, C_in, C_out, affine=True, **kwargs): + def __init__( # noqa: D107 + self, + kernel_size: int, + stride: int, + c_in: int, + c_out: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) kernel_size = int(kernel_size) stride = int(stride) - C_in = int(C_in) - C_out = int(C_out) + c_in = int(c_in) + c_out = int(c_out) affine = bool(affine) self.stride = stride self.maxpool = nn.MaxPool2d(kernel_size, stride=stride, padding=1) if stride > 1: - assert C_in is not None and C_out is not None - self.conv = nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) + assert c_in is not None + assert c_out is not None + self.conv = nn.Conv2d(c_in, c_out, 1, stride=1, padding=0, bias=False) + self.bn = nn.BatchNorm2d(c_out, affine=affine) - def forward(self, x): + def forward(self, x): # noqa: D102 x = self.maxpool(x) if self.stride > 1: x = self.conv(x) @@ -298,34 +310,32 @@ def forward(self, x): class AvgPool(AbstractPrimitive): - """ - Implementation of Avergae Pooling. 
- """ + """Implementation of Avergae Pooling.""" - def __init__(self, kernel_size, stride, **kwargs): + def __init__(self, kernel_size: int, stride: int, **kwargs): # noqa: D107 stride = int(stride) super().__init__(locals()) - self.avgpool = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False) + self.avgpool = nn.AvgPool2d( + kernel_size=3, stride=stride, padding=1, count_include_pad=False + ) - def forward(self, x): - x = self.avgpool(x) - return x + def forward(self, x): # noqa: D102 + return self.avgpool(x) class AvgPool1x1(AbstractPrimitive): - """ - Implementation of Avergae Pooling with an optional + """Implementation of Avergae Pooling with an optional 1x1 convolution afterwards. The convolution is required to increase the number of channels if stride > 1. """ - def __init__( + def __init__( # noqa: D107 self, - kernel_size, - stride, - C_in, - C_out, - affine=True, + kernel_size: int, + stride: int, + c_in: int, + c_out: int, + affine: bool = True, # noqa: FBT001, FBT002 **kwargs, ): super().__init__(locals()) @@ -333,11 +343,12 @@ def __init__( self.stride = int(stride) self.avgpool = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False) if stride > 1: - assert C_in is not None and C_out is not None - self.conv = nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) + assert c_in is not None + assert c_out is not None + self.conv = nn.Conv2d(c_in, c_out, 1, stride=1, padding=0, bias=False) + self.bn = nn.BatchNorm2d(c_out, affine=affine) - def forward(self, x): + def forward(self, x): # noqa: D102 x = self.avgpool(x) if self.stride > 1: x = self.conv(x) @@ -345,8 +356,16 @@ def forward(self, x): return x -class ReLUConvBN(AbstractPrimitive): - def __init__(self, C_in, C_out, kernel_size, stride=1, affine=True, **kwargs): +class ReLUConvBN(AbstractPrimitive): # noqa: D101 + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride: int = 1, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) kernel_size = int(kernel_size) stride = int(stride) @@ -355,113 +374,134 @@ def __init__(self, C_in, C_out, kernel_size, stride=1, affine=True, **kwargs): pad = 0 if int(stride) == 1 and kernel_size == 1 else 1 self.op = nn.Sequential( nn.ReLU(inplace=False), - nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=pad, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size, stride=stride, padding=pad, bias=False), + nn.BatchNorm2d(c_out, affine=affine), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class ConvBnReLU(AbstractPrimitive): - """ - Implementation of 2d convolution, followed by 2d batch normalization and ReLU activation. + """Implementation of 2d convolution, followed by 2d batch normalization and + ReLU activation. 
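Shape sanity check for a few of the renamed primitives (`C_in`/`C_out` -> `c_in`/`c_out`), assuming the constructors behave as defined in this patch; the sizes below are arbitrary.

    import torch
    from neps.search_spaces.architecture.primitives import ReLUConvBN, SepConv, Stem

    x = torch.randn(2, 3, 32, 32)
    h = Stem(c_out=16)(x)                                     # 3 -> 16 channels
    h = ReLUConvBN(c_in=16, c_out=16, kernel_size=3)(h)       # stride 1, padded
    y = SepConv(c_in=16, c_out=32, kernel_size=3, stride=2, padding=1)(h)
    assert h.shape == (2, 16, 32, 32)
    assert y.shape == (2, 32, 16, 16)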
""" - def __init__(self, C_in, C_out, kernel_size, stride=1, affine=True, **kwargs): + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride: int = 1, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) self.kernel_size = kernel_size pad = 0 if stride == 1 and kernel_size == 1 else 1 self.op = nn.Sequential( - nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=pad, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size, stride=stride, padding=pad, bias=False), + nn.BatchNorm2d(c_out, affine=affine), nn.ReLU(inplace=False), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class ConvBn(AbstractPrimitive): - """ - Implementation of 2d convolution, followed by 2d batch normalization and ReLU activation. + """Implementation of 2d convolution, followed by 2d batch normalization and ReLU + activation. """ - def __init__(self, C_in, C_out, kernel_size, stride=1, affine=True, **kwargs): + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride=1, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) self.kernel_size = kernel_size pad = 0 if stride == 1 and kernel_size == 1 else 1 self.op = nn.Sequential( - nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=pad, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size, stride=stride, padding=pad, bias=False), + nn.BatchNorm2d(c_out, affine=affine), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class Concat1x1(AbstractPrimitive): - """ - Implementation of the channel-wise concatination followed by a 1x1 convolution + """Implementation of the channel-wise concatination followed by a 1x1 convolution to retain the channel dimension. """ - def __init__( - self, num_in_edges, C_out, affine=True, **kwargs + def __init__( # noqa: D107 + self, + num_in_edges: int, + c_out: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, ): super().__init__(locals()) self.conv = nn.Conv2d( - num_in_edges * C_out, C_out, kernel_size=1, stride=1, padding=0, bias=False + num_in_edges * c_out, c_out, kernel_size=1, stride=1, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(C_out, affine=affine) + self.bn = nn.BatchNorm2d(c_out, affine=affine) def forward(self, x): - """ - Expecting a list of input tensors. Stacking them channel-wise - and applying 1x1 conv + """Expecting a list of input tensors. Stacking them channel-wise + and applying 1x1 conv. 
""" x = torch.cat(x, dim=1) x = self.conv(x) - x = self.bn(x) - return x + return self.bn(x) -class ResNetBasicblock(AbstractPrimitive): - def __init__( - self, C_in, C_out, stride, affine=True, **kwargs +class ResNetBasicblock(AbstractPrimitive): # noqa: D101 + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + stride: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, ): super().__init__(locals()) - assert stride == 1 or stride == 2, f"invalid stride {stride}" - self.conv_a = ReLUConvBN(C_in, C_out, 3, stride) - self.conv_b = ReLUConvBN(C_out, C_out, 3) + assert stride in (1, 2), f"invalid stride {stride}" + self.conv_a = ReLUConvBN(c_in, c_out, 3, stride) + self.conv_b = ReLUConvBN(c_out, c_out, 3) if stride == 2: self.downsample = nn.Sequential( - # nn.AvgPool2d(kernel_size=2, stride=2, padding=0), - nn.Conv2d(C_in, C_out, kernel_size=1, stride=2, padding=0, bias=False), - nn.BatchNorm2d(C_out), + nn.Conv2d(c_in, c_out, kernel_size=1, stride=2, padding=0, bias=False), + nn.BatchNorm2d(c_out), ) else: self.downsample = None - def forward(self, x): + def forward(self, x): # noqa: D102 basicblock = self.conv_a(x) basicblock = self.conv_b(basicblock) residual = self.downsample(x) if self.downsample is not None else x diff --git a/neps/search_spaces/architecture/topologies.py b/neps/search_spaces/architecture/topologies.py index b45db832..5bb040ed 100644 --- a/neps/search_spaces/architecture/topologies.py +++ b/neps/search_spaces/architecture/topologies.py @@ -1,3 +1,5 @@ +from __future__ import annotations # noqa: D100 + import inspect import queue from abc import ABCMeta, abstractmethod @@ -7,21 +9,23 @@ from .graph import Graph -class AbstractTopology(Graph, metaclass=ABCMeta): - edge_list: list = [] +class AbstractTopology(Graph, metaclass=ABCMeta): # noqa: D101 + edge_list: list = [] # noqa: RUF012 - def __init__(self, name: str = None, scope: str = None, merge_fn: Callable = sum): + def __init__( # noqa: D107 + self, name: str | None = None, scope: str | None = None, merge_fn: Callable = sum + ): super().__init__(name=name, scope=scope) self.merge_fn = merge_fn - def mutate(self): + def mutate(self): # noqa: D102 pass - def sample(self): + def sample(self): # noqa: D102 pass - def create_graph(self, vals: dict): + def create_graph(self, vals: dict): # noqa: C901, D102 def get_args_and_defaults(func): signature = inspect.signature(func) return list(signature.parameters.keys()), { @@ -36,18 +40,18 @@ def get_op_name_from_dict(val: dict): args: dict = {} arg_names, default_args = get_args_and_defaults(op) for arg_name in arg_names: - if arg_name == "self" or arg_name == "kwargs" or arg_name in args.keys(): + if arg_name in ("self", "kwargs") or arg_name in args: continue - if arg_name in val.keys(): + if arg_name in val: args[arg_name] = val[arg_name] - elif arg_name in default_args.keys(): + elif arg_name in default_args: args[arg_name] = default_args[arg_name] else: args[arg_name] = 42 if "groups" in args and args["groups"] != 1: - args["C_in"] = args["groups"] - args["C_out"] = args["groups"] + args["c_in"] = args["groups"] + args["c_out"] = args["groups"] return op(**args).get_op_name @@ -57,24 +61,23 @@ def get_op_name_from_dict(val: dict): if isinstance(val, dict): _val = val _val["op_name"] = get_op_name_from_dict(val) + elif isinstance(val, int): # for synthetic benchmarks + _val = {"op": val, "op_name": val} + elif hasattr(val, "get_op_name"): + _val = {"op": val, "op_name": val.get_op_name} + elif callable(val): + _val = {"op": val, "op_name": 
val.__name__} else: - if isinstance(val, int): # for synthetic benchmarks - _val = {"op": val, "op_name": val} - elif hasattr(val, "get_op_name"): - _val = {"op": val, "op_name": val.get_op_name} - elif callable(val): - _val = {"op": val, "op_name": val.__name__} - else: - raise Exception(f"Cannot extract op name from {val}") + raise Exception(f"Cannot extract op name from {val}") self.edges[u, v].update(_val) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 return type(self).__name__ - def __call__(self, x): - cur_node_idx = [node for node in self.nodes if self.in_degree(node) == 0][0] + def __call__(self, x): # noqa: D102 + cur_node_idx = next(node for node in self.nodes if self.in_degree(node) == 0) predecessor_inputs = {cur_node_idx: [x]} next_successors = queue.Queue() next_successors.put(cur_node_idx) @@ -103,18 +106,20 @@ def __call__(self, x): return inputs -class AbstractVariableTopology(AbstractTopology): - def __init__(self, name: str = None, scope: str = None, **kwargs): +class AbstractVariableTopology(AbstractTopology): # noqa: D101 + def __init__( # noqa: D107 + self, name: str | None = None, scope: str | None = None, **kwargs + ): super().__init__(name, scope, **kwargs) @staticmethod @abstractmethod - def get_edge_list(**kwargs): + def get_edge_list(**kwargs): # noqa: D102 raise NotImplementedError class _SequentialNEdge(AbstractTopology): - edge_list: list = [] + edge_list: list = [] # noqa: RUF012 def __init__(self, *edge_vals, number_of_edges: int, **kwargs): super().__init__(**kwargs) @@ -132,18 +137,18 @@ def get_edge_list(number_of_edges: int): LinearNEdge = _SequentialNEdge -def get_sequential_n_edge(number_of_edges: int): +def get_sequential_n_edge(number_of_edges: int): # noqa: D103 return partial(_SequentialNEdge, number_of_edges=number_of_edges) -class Residual(AbstractTopology): - edge_list = [ +class Residual(AbstractTopology): # noqa: D101 + edge_list = [ # noqa: RUF012 (1, 2), (1, 3), (2, 3), ] - def __init__(self, *edge_vals, **kwargs): + def __init__(self, *edge_vals, **kwargs): # noqa: D107 super().__init__(**kwargs) self.name = "residual" @@ -151,10 +156,10 @@ def __init__(self, *edge_vals, **kwargs): self.set_scope(self.name) -class Diamond(AbstractTopology): - edge_list = [(1, 2), (1, 3), (2, 4), (3, 4)] +class Diamond(AbstractTopology): # noqa: D101 + edge_list = [(1, 2), (1, 3), (2, 4), (3, 4)] # noqa: RUF012 - def __init__(self, *edge_vals, **kwargs): + def __init__(self, *edge_vals, **kwargs): # noqa: D107 super().__init__(**kwargs) self.name = "diamond" @@ -162,10 +167,10 @@ def __init__(self, *edge_vals, **kwargs): self.set_scope(self.name) -class DiamondMid(AbstractTopology): - edge_list = [(1, 2), (1, 3), (2, 3), (2, 4), (3, 4)] +class DiamondMid(AbstractTopology): # noqa: D101 + edge_list = [(1, 2), (1, 3), (2, 3), (2, 4), (3, 4)] # noqa: RUF012 - def __init__(self, *edge_vals, **kwargs): + def __init__(self, *edge_vals, **kwargs): # noqa: D107 super().__init__(**kwargs) self.name = "diamond_mid" @@ -174,7 +179,7 @@ def __init__(self, *edge_vals, **kwargs): class _DenseNNodeDAG(AbstractTopology): - edge_list: list = [] + edge_list: list = [] # noqa: RUF012 def __init__(self, *edge_vals, number_of_nodes: int, **kwargs): super().__init__(**kwargs) @@ -190,5 +195,5 @@ def get_edge_list(number_of_nodes: int): return [(i + 1, j + 1) for j in range(number_of_nodes) for i in range(j)] -def get_dense_n_node_dag(number_of_nodes: int): +def get_dense_n_node_dag(number_of_nodes: int): # noqa: D103 return partial(_DenseNNodeDAG, 
number_of_nodes=number_of_nodes) diff --git a/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py b/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py index fd01f7c1..77ce9e9f 100644 --- a/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py +++ b/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py @@ -64,9 +64,9 @@ def set_recursive_attribute(op_name, predecessor_values): - in_channels = 64 if predecessor_values is None else predecessor_values["C_out"] + in_channels = 64 if predecessor_values is None else predecessor_values["c_out"] out_channels = in_channels * 2 if op_name == "ResNetBasicblock" else in_channels - return dict(C_in=in_channels, C_out=out_channels) + return dict(c_in=in_channels, c_out=out_channels) def run_pipeline(some_architecture, some_float, some_integer, some_cat): @@ -79,7 +79,7 @@ def run_pipeline(some_architecture, some_float, some_integer, some_cat): model = some_architecture.to_pytorch() model = nn.Sequential( - ops.Stem(base_channels, C_in=in_channels), + ops.Stem(base_channels, c_in=in_channels), model, nn.AdaptiveAvgPool2d(1), nn.Flatten(), diff --git a/neps_examples/experimental/hierarchical_architecture.py b/neps_examples/experimental/hierarchical_architecture.py index 6751cc7a..55ed9144 100644 --- a/neps_examples/experimental/hierarchical_architecture.py +++ b/neps_examples/experimental/hierarchical_architecture.py @@ -54,12 +54,12 @@ def set_recursive_attribute(op_name, predecessor_values): - in_channels = 64 if predecessor_values is None else predecessor_values["C_out"] + in_channels = 64 if predecessor_values is None else predecessor_values["c_out"] out_channels = in_channels * 2 if op_name == "ResNetBasicblock" else in_channels - return dict(C_in=in_channels, C_out=out_channels) + return dict(c_in=in_channels, c_out=out_channels) -def run_pipeline(architecture): +def run_pipeline(architecture: neps.FunctionParameter): in_channels = 3 n_classes = 20 base_channels = 64 @@ -67,7 +67,7 @@ def run_pipeline(architecture): model = architecture.to_pytorch() model = nn.Sequential( - ops.Stem(base_channels, C_in=in_channels), + ops.Stem(base_channels, c_in=in_channels), model, nn.AdaptiveAvgPool2d(1), nn.Flatten(), diff --git a/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py b/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py index c79a7a01..0d2acfb0 100644 --- a/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py +++ b/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py @@ -59,9 +59,9 @@ def set_recursive_attribute(op_name, predecessor_values): - in_channels = 64 if predecessor_values is None else predecessor_values["C_out"] + in_channels = 64 if predecessor_values is None else predecessor_values["c_out"] out_channels = in_channels * 2 if op_name == "ResNetBasicblock" else in_channels - return dict(C_in=in_channels, C_out=out_channels) + return dict(c_in=in_channels, c_out=out_channels) def run_pipeline(architecture): @@ -74,7 +74,7 @@ def run_pipeline(architecture): model = architecture.to_pytorch() model = nn.Sequential( - ops.Stem(base_channels, C_in=in_channels), + ops.Stem(base_channels, c_in=in_channels), model, nn.AdaptiveAvgPool2d(1), nn.Flatten(), diff --git a/pyproject.toml b/pyproject.toml index 3e304c55..92765560 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,19 +3,19 @@ name = "neural-pipeline-search" version = "v0.12.2" description = "Neural 
Pipeline Search helps deep learning experts find the best neural pipeline." authors = [ - "Danny Stoll ", - "Neeratyoy Mallik ", - "Simon Schrodi", - "Eddie Bergman", - "Maciej Janowski", - "Samir Garibov", - "Tarek Abou Chakra", - "Daniel Rogalla", - "Carl Hvarfner", - "Binxin Ru", - "Nils Kober", - "Théophane Vallaeys", - "Frank Hutter", + "Danny Stoll ", + "Neeratyoy Mallik ", + "Simon Schrodi", + "Eddie Bergman", + "Maciej Janowski", + "Samir Garibov", + "Tarek Abou Chakra", + "Daniel Rogalla", + "Carl Hvarfner", + "Binxin Ru", + "Nils Kober", + "Théophane Vallaeys", + "Frank Hutter", ] readme = "README.md" license = "Apache-2.0" @@ -23,10 +23,10 @@ homepage = "https://github.com/automl/neps" repository = "https://github.com/automl/neps" documentation = "https://automl.github.io/neps/" keywords = [ - "Neural Pipeline Search", - "Neural Architecture Search", - "Hyperparameter Optimization", - "AutoML", + "Neural Pipeline Search", + "Neural Architecture Search", + "Hyperparameter Optimization", + "AutoML", ] classifiers = [ "Development Status :: 4 - Beta", @@ -97,33 +97,34 @@ src = ["neps"] # TODO(eddiebergman): Include more of these as we go on in migration exclude = [ - "neps/search_spaces/architecture/**/*.py", - "neps/search_spaces/yaml_search_space_utils.py", - "neps/utils/run_args_from_yaml.py", - "neps/utils/common.py", - "neps/api.py", - "tests", - "neps_examples", - ".bzr", - ".direnv", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".nox", - ".pants.d", - ".ruff_cache", - ".svn", - ".tox", - ".venv", - "__pypackages__", - "_build", - "buck-out", - "build", - "dist", - "node_modules", - "venv", - "docs", + "neps/optimizers/**/*.py", + "neps/search_spaces/architecture/**/*.py", + "neps/search_spaces/yaml_search_space_utils.py", + "neps/search_spaces/architecture", + "neps/utils/run_args_from_yaml.py", + "neps/api.py", + "tests", + "neps_examples", + ".bzr", + ".direnv", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", + "docs", ] [tool.ruff.lint] @@ -137,52 +138,52 @@ extend-safe-fixes = ["ALL"] dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" select = [ - "A", - # "ANN", # Handled by mypy - "ARG", - "B", - "BLE", - "COM", - "C4", - "D", - # "DTZ", # One day I should know how to utilize timezones and dates... - "E", - # "EXE", Meh - "ERA", - "F", - "FBT", - "I", - # "ISC", # Favours implicit string concatenation - "INP", - # "INT", # I don't understand this one - "N", - "NPY", - "PD", - "PLC", - "PLE", - "PLR", - "PLW", - "PIE", - "PT", - "PTH", - # "PYI", # Specific to .pyi files for type stubs - "Q", - "PGH004", - "RET", - "RUF", - "C90", - "S", - # "SLF", # Private member accessed (sure, it's python) - "SIM", - # "TRY", # Good in principle, would take a lot of work to statisfy - "T10", - "T20", - "TID", - "TCH", - "UP", - "N", - "W", - "YTT", + "A", + # "ANN", # Handled by mypy + "ARG", + "B", + "BLE", + "COM", + "C4", + "D", + # "DTZ", # One day I should know how to utilize timezones and dates... 
+ "E", + # "EXE", Meh + "ERA", + "F", + "FBT", + "I", + # "ISC", # Favours implicit string concatenation + "INP", + # "INT", # I don't understand this one + "N", + "NPY", + "PD", + "PLC", + "PLE", + "PLR", + "PLW", + "PIE", + "PT", + "PTH", + # "PYI", # Specific to .pyi files for type stubs + "Q", + "PGH004", + "RET", + "RUF", + "C90", + "S", + # "SLF", # Private member accessed (sure, it's python) + "SIM", + # "TRY", # Good in principle, would take a lot of work to statisfy + "T10", + "T20", + "TID", + "TCH", + "UP", + "N", + "W", + "YTT", ] ignore = [ @@ -213,31 +214,31 @@ ignore = [ # Exclude a variety of commonly ignored directories. [tool.ruff.lint.per-file-ignores] "tests/*.py" = [ - "S101", - "D101", - "D102", - "D103", - "ANN001", - "ANN201", - "FBT001", - "D100", - "PD901", # X is a bad variable name. (pandas) - "TCH", - "N803", - "C901", # Too complex + "S101", + "D101", + "D102", + "D103", + "ANN001", + "ANN201", + "FBT001", + "D100", + "PD901", # X is a bad variable name. (pandas) + "TCH", + "N803", + "C901", # Too complex ] "__init__.py" = ["I002"] "neps_examples/*" = [ - "INP001", - "I002", - "E741", - "D101", - "D103", - "T20", - "D415", - "ERA001", - "E402", - "E501", + "INP001", + "I002", + "E741", + "D101", + "D103", + "T20", + "D415", + "ERA001", + "E402", + "E501", ] "docs/*" = ["INP001"] @@ -260,12 +261,12 @@ max-args = 10 # Changed from default of 5 [tool.pytest.ini_options] addopts = "--basetemp ./tests_tmpdir -m 'not ci_examples'" markers = [ - "ci_examples", - "core_examples", - "regression_all", - "runtime", - "neps_api", - "summary_csv", + "ci_examples", + "core_examples", + "regression_all", + "runtime", + "neps_api", + "summary_csv", ] filterwarnings = "ignore::DeprecationWarning:torch.utils.tensorboard.*:" @@ -296,10 +297,10 @@ check_untyped_defs = true # TODO(eddiebergman): Improve coverage on these modules [[tool.mypy.overrides]] module = [ - "neps.api", - "neps.optimizers.*", - "neps.search_spaces.architecture.*", - "neps.utils.run_args_from_yaml", + "neps.api", + "neps.optimizers.*", + "neps.search_spaces.architecture.*", + "neps.utils.run_args_from_yaml", ] ignore_errors = true From 6303a650d1bfe6adb05089c6b7073a41e9a1f47b Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 15:03:48 +0200 Subject: [PATCH 40/63] reapply stash --- neps/optimizers/bayesian_optimization/models/gp.py | 6 +++--- neps_examples/basic_usage/hyperparameters.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 0281cd1a..b3f5c2b2 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -7,9 +7,8 @@ from functools import reduce from typing import TYPE_CHECKING, Any, TypeVar -import gpytorch -import gpytorch.constraints import torch +import gpytorch.constraints from botorch.acquisition.analytic import SingleTaskGP from botorch.models.gp_regression import ( get_covar_module_with_dim_scaled_prior, @@ -18,7 +17,8 @@ from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import ScaleKernel -from torch._dynamo.utils import product +from botorch.optim import optimize_acqf, optimize_acqf_mixed +from itertools import product from neps.search_spaces.encoding import ( CategoricalToIntegerTransformer, diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py 
index 6ea897f8..3f346949 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -48,5 +48,5 @@ def run_pipeline(float1, float2, float3, integer1, integer2): root_directory="results/hyperparameters_example", post_run_summary=True, max_evaluations_total=50, - use_prior=True, + use_priors=True, ) From 93943503b61d71c53aaa504782d9093031fc63a8 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 18:20:51 +0200 Subject: [PATCH 41/63] refactor: Simpler ifbo --- neps/optimizers/base_optimizer.py | 18 +- .../acquisition_functions/mf_pi.py | 8 +- .../freeze_thaw_sampler.py | 2 +- .../kernels/grakel_replace/utils.py | 4 +- .../grakel_replace/weisfeiler_lehman.py | 2 +- .../bayesian_optimization/models/ftpfn.py | 95 +-- .../bayesian_optimization/optimizer.py | 11 +- .../bayesian_optimization.yaml | 2 +- .../optimizers/default_searchers/mobster.yaml | 2 +- neps/optimizers/default_searchers/pibo.yaml | 2 +- .../default_searchers/priorband_bo.yaml | 2 +- neps/optimizers/multi_fidelity/ifbo.py | 547 +++++---------- neps/optimizers/multi_fidelity/mf_bo.py | 76 +-- neps/optimizers/multi_fidelity/utils.py | 45 ++ .../multi_fidelity_prior/priorband.py | 4 - neps/utils/common.py | 4 +- ...erarchical_architecture_hierarchical_GP.py | 143 ---- .../user_priors_from_arbitrary_densities.py | 151 ----- .../testing_scripts/default_neps.py | 5 +- tests/test_settings/test_settings.py | 621 +++++++++--------- .../test_yaml_run_args/test_yaml_run_args.py | 75 ++- 21 files changed, 671 insertions(+), 1148 deletions(-) delete mode 100644 neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py delete mode 100644 neps_examples/experimental/user_priors_from_arbitrary_densities.py diff --git a/neps/optimizers/base_optimizer.py b/neps/optimizers/base_optimizer.py index a80f9f75..8dd9e96f 100644 --- a/neps/optimizers/base_optimizer.py +++ b/neps/optimizers/base_optimizer.py @@ -6,7 +6,7 @@ from dataclasses import asdict, dataclass from typing import TYPE_CHECKING, Any -from neps.state.trial import Trial +from neps.state.trial import Report, Trial from neps.utils.data_loading import _get_cost, _get_learning_curve, _get_loss from neps.utils.types import ERROR, ConfigResult, RawConfig, ResultDict @@ -144,16 +144,14 @@ def update_state_post_evaluation( # state["key"] = "value" return state - def get_loss( - self, result: ERROR | ResultDict | float | Trial.Report - ) -> float | ERROR: + def get_loss(self, result: ERROR | ResultDict | float | Report) -> float | ERROR: """Calls result.utils.get_loss() and passes the error handling through. Please use self.get_loss() instead of get_loss() in all optimizer classes. """ # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call. We can also then just use `None` instead of the string `"error"` - if isinstance(result, Trial.Report): + if isinstance(result, Report): return result.loss if result.loss is not None else "error" return _get_loss( @@ -162,16 +160,14 @@ def get_loss( ignore_errors=self.ignore_errors, ) - def get_cost( - self, result: ERROR | ResultDict | float | Trial.Report - ) -> float | ERROR: + def get_cost(self, result: ERROR | ResultDict | float | Report) -> float | ERROR: """Calls result.utils.get_cost() and passes the error handling through. Please use self.get_cost() instead of get_cost() in all optimizer classes. 
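The get_loss/get_cost helpers above normalise several result representations (an error marker, a raw dict, a float, or a Report object) into a single float-or-"error" value. A small sketch of that dispatch with a hypothetical stand-in for Report; the real class lives in neps.state.trial and is not reproduced here.

    # Hypothetical stand-in illustrating the result-normalisation dispatch.
    from __future__ import annotations
    from dataclasses import dataclass

    @dataclass
    class FakeReport:
        loss: float | None = None

    def normalise_loss(result) -> float | str:
        if isinstance(result, FakeReport):
            return result.loss if result.loss is not None else "error"
        if isinstance(result, dict):
            return float(result["loss"])
        if isinstance(result, (int, float)):
            return float(result)
        return "error"

    assert normalise_loss(FakeReport(loss=0.25)) == 0.25
    assert normalise_loss({"loss": 0.5}) == 0.5
    assert normalise_loss("error") == "error"

One thing to double-check in the hunk below: the unchanged body of get_cost still returns result.loss for Report objects, which looks like it was meant to be result.cost.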
""" # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call - if isinstance(result, Trial.Report): + if isinstance(result, Report): return result.loss if result.loss is not None else "error" return _get_cost( @@ -181,7 +177,7 @@ def get_cost( ) def get_learning_curve( - self, result: str | dict | float | Trial.Report + self, result: str | dict | float | Report ) -> list[float] | Any: """Calls result.utils.get_loss() and passes the error handling through. Please use self.get_loss() instead of get_loss() in all optimizer classes. @@ -189,7 +185,7 @@ def get_learning_curve( # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call - if isinstance(result, Trial.Report): + if isinstance(result, Report): return result.learning_curve return _get_learning_curve( diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py index ba2e886b..75c7f1e3 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py @@ -125,10 +125,12 @@ def set_state( surrogate_model: Any, observations: MFObservedData, b_step: int | float, - **kwargs, - ): + seed: int = 42, + ) -> None: # set RNG - self.rng = np.random.RandomState(seed=42) + self.rng = np.random.RandomState(seed=seed) + + # TODO: wut is this? for _i in range(len(observations.completed_runs)): self.rng.uniform(-4, -1) self.rng.randint(1, 51) diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py index 3021bfe0..ea22c5b1 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py @@ -120,7 +120,7 @@ def sample( acquisition_function: Callable | None = None, n: int | None = None, set_new_sample_fidelity: int | float | None = None, - ) -> pd.DataFrame: + ) -> pd.Series: """Samples a new set and returns the total set of observed + new configs.""" assert self.observations is not None assert self.pipeline_space is not None diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py index e0ad94f3..fe8f8d06 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py @@ -37,7 +37,7 @@ def calculate_kernel_matrix_as_tensor( K = se_kernel.forward(X, X) if se_kernel is not None else X @ X.t() if normalize: K_diag = torch.sqrt(torch.diag(K)) - K_diag_outer = torch.ger(K_diag, K_diag) + K_diag_outer = torch.outer(K_diag, K_diag) return K / K_diag_outer else: assert Y.shape[1] == X.shape[1], ( @@ -51,7 +51,7 @@ def calculate_kernel_matrix_as_tensor( Kyy = calculate_kernel_matrix_as_tensor( Y, Y, oa=oa, se_kernel=se_kernel, normalize=False ) - K_diag_outer = torch.ger( + K_diag_outer = torch.outer( torch.sqrt(torch.diag(Kyy)), torch.sqrt(torch.diag(Kxx)) ) return K / K_diag_outer diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py 
b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py index f62d0ca0..be35c02a 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py @@ -620,7 +620,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): if self.normalize: X_diag, Y_diag = self.diagonal() if self.as_tensor: - div_ = torch.sqrt(torch.ger(Y_diag, X_diag)) + div_ = torch.sqrt(torch.outer(Y_diag, X_diag)) K /= div_ else: old_settings = np.seterr(divide="ignore") diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 95b02ba0..6f697033 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -58,11 +58,21 @@ def _download_workaround_for_ifbo_issue_10(path: Path | None, version: str) -> P return target_path +def _cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor: + if len(x.shape) == 3 and x.shape[1] == 1: + return x + if len(x.shape) == 2: + return x.reshape(x.shape[0], 1, x.shape[1]) + if len(x.shape) == 1: + return x.reshape(x.shape[0], 1) + raise ValueError(f"Shape not recognized: {x.shape}") + + _CACHED_FTPFN_MODEL: dict[tuple[str, str], FTPFN] = {} -class FTPFNSurrogate: - """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" +class FTPFNModel: + """Wrapper around the IfBO model.""" def __init__( self, @@ -85,57 +95,64 @@ def __init__( _CACHED_FTPFN_MODEL[key] = ftpfn self.ftpfn = ftpfn - self.target_path = self.ftpfn.target_path - self.version = self.ftpfn.version - self.train_x: torch.Tensor | None = None - self.train_y: torch.Tensor | None = None - - @property - def device(self): - return self.ftpfn.device - - def _get_logits(self, test_x: torch.Tensor) -> torch.Tensor: - assert self.train_x is not None, "Train data is not set." - assert self.train_y is not None, "Train data is not set." + self.device = self.ftpfn.device + + def _get_logits( + self, train_x: torch.Tensor, train_y: torch.Tensor, test_x: torch.Tensor + ) -> torch.Tensor: return self.ftpfn.model( - self._cast_tensor_shapes(self.train_x), - self._cast_tensor_shapes(self.train_y), - self._cast_tensor_shapes(test_x), + _cast_tensor_shapes(train_x), + _cast_tensor_shapes(train_y), + _cast_tensor_shapes(test_x), ) - def _cast_tensor_shapes(self, x: torch.Tensor) -> torch.Tensor: - if len(x.shape) == 3 and x.shape[1] == 1: - return x - if len(x.shape) == 2: - return x.reshape(x.shape[0], 1, x.shape[1]) - if len(x.shape) == 1: - return x.reshape(x.shape[0], 1) - raise ValueError(f"Shape not recognized: {x.shape}") - @torch.no_grad() - def get_mean_performance(self, test_x: torch.Tensor) -> torch.Tensor: - logits = self._get_logits(test_x).squeeze() + def get_mean_performance( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + ) -> torch.Tensor: + logits = self._get_logits(train_x, train_y, test_x).squeeze() return self.ftpfn.model.criterion.mean(logits) @torch.no_grad() - def get_pi(self, test_x: torch.Tensor, y_best: torch.Tensor) -> torch.Tensor: - logits = self._get_logits(test_x) + def get_pi( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + # TODO: just calculate from train_y? 
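The module-level _cast_tensor_shapes helper introduced above normalises feature and target tensors into the (n, 1, d) and (n, 1) layouts before they are handed to the FT-PFN transformer. A quick self-contained check of the three accepted shapes; the helper body mirrors the hunk above, only the example tensors are made up.

    import torch

    def cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor:
        # Same logic as _cast_tensor_shapes above.
        if len(x.shape) == 3 and x.shape[1] == 1:
            return x
        if len(x.shape) == 2:
            return x.reshape(x.shape[0], 1, x.shape[1])
        if len(x.shape) == 1:
            return x.reshape(x.shape[0], 1)
        raise ValueError(f"Shape not recognized: {x.shape}")

    assert cast_tensor_shapes(torch.zeros(5, 7)).shape == (5, 1, 7)     # feature matrix
    assert cast_tensor_shapes(torch.zeros(5)).shape == (5, 1)           # target vector
    assert cast_tensor_shapes(torch.zeros(5, 1, 7)).shape == (5, 1, 7)  # already batched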
+ y_best: torch.Tensor, + ) -> torch.Tensor: + logits = self._get_logits(train_x, train_y, test_x) return self.ftpfn.model.criterion.pi( - logits.squeeze(), best_f=(1 - y_best).unsqueeze(1) + logits.squeeze(), + best_f=(1 - y_best).unsqueeze(1), ) @torch.no_grad() - def get_ei(self, test_x: torch.Tensor, y_best: torch.Tensor) -> torch.Tensor: - logits = self._get_logits(test_x) + def get_ei( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + y_best: torch.Tensor, + ) -> torch.Tensor: + logits = self._get_logits(train_x, train_y, test_x) return self.ftpfn.model.criterion.ei( logits.squeeze(), best_f=(1 - y_best).unsqueeze(1) ) @torch.no_grad() def get_lcb( - self, test_x: torch.Tensor, beta: float = (1 - 0.682) / 2 + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + beta: float = (1 - 0.682) / 2, ) -> torch.Tensor: - logits = self._get_logits(test_x) + logits = self._get_logits(train_x, train_y, test_x) return self.ftpfn.model.criterion.ucb( logits=logits, best_f=None, @@ -145,9 +162,13 @@ def get_lcb( @torch.no_grad() def get_ucb( - self, test_x: torch.Tensor, beta: float = (1 - 0.682) / 2 + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + beta: float = (1 - 0.682) / 2, ) -> torch.Tensor: - logits = self._get_logits(test_x) + logits = self._get_logits(train_x, train_y, test_x) return self.ftpfn.model.criterion.ucb( logits=logits, best_f=None, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index d9bd10e3..4188a5fe 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -180,7 +180,6 @@ def __init__( # noqa: D417 device: torch.device | None = None, encoder: TensorEncoder | None = None, treat_fidelity_as_hyperparameters: bool = False, - **kwargs: Any, # TODO: Remove ): """Initialise the BO loop. @@ -250,9 +249,9 @@ def ask( "Seed is not yet implemented for BayesianOptimization" ) - n_trials_completed = len(trials) + n_trials_sampled = len(trials) space = self.pipeline_space - config_id = str(n_trials_completed + 1) + config_id = str(n_trials_sampled + 1) # Fill intitial design data if we don't have any... if self.initial_design_ is None: @@ -278,8 +277,8 @@ def ask( self.initial_design_.extend(configs) # If we havn't passed the intial design phase - if n_trials_completed < len(self.initial_design_): - config = self.initial_design_[n_trials_completed] + if n_trials_sampled < len(self.initial_design_): + config = self.initial_design_[n_trials_sampled] sample = SampledConfig(id=config_id, config=config, previous_config_id=None) return sample, optimizer_state @@ -346,7 +345,7 @@ def ask( # the probability of it being sampled from the prior. 
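For context on the prior-weighting comment above: in piBO-style acquisition weighting, the prior density is raised to an exponent that decays as more configurations are evaluated, so the prior guides early sampling and fades out later. The exact form computed by _pibo_exp_term is not visible in this hunk; the sketch below uses the beta / n decay from the piBO paper purely as an illustration and is not NePS's implementation.

    # Illustrative only: a generic piBO-style weighting, not NePS's _pibo_exp_term.
    import torch

    def pibo_weighted_acq(acq: torch.Tensor, prior_density: torch.Tensor,
                          n_sampled: int, beta: float = 10.0) -> torch.Tensor:
        exponent = beta / max(n_sampled, 1)  # decays as more trials are sampled
        return acq * prior_density.pow(exponent)

    acq = torch.tensor([0.2, 0.5])
    prior = torch.tensor([0.9, 0.1])
    early = pibo_weighted_acq(acq, prior, n_sampled=1)     # prior reshapes the ranking
    late = pibo_weighted_acq(acq, prior, n_sampled=1000)   # prior is nearly neutral
    assert early.argmax().item() == 0
    assert late.argmax().item() == 1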
if self.prior: pibo_exp_term = _pibo_exp_term( - n_trials_completed, + n_trials_sampled, self.encoder.ncols, self.n_initial_design, ) diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml index fb43f97b..9b5a3f37 100644 --- a/neps/optimizers/default_searchers/bayesian_optimization.yaml +++ b/neps/optimizers/default_searchers/bayesian_optimization.yaml @@ -1,6 +1,6 @@ strategy: bayesian_optimization # Arguments that can be modified by the user -surrogate_model: gp # or {"gp_hierarchy"} +surrogate_model: gp acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false acquisition_sampler: mutation # or {"random", "evolution"} diff --git a/neps/optimizers/default_searchers/mobster.yaml b/neps/optimizers/default_searchers/mobster.yaml index 9ce821b3..81afaabb 100644 --- a/neps/optimizers/default_searchers/mobster.yaml +++ b/neps/optimizers/default_searchers/mobster.yaml @@ -8,7 +8,7 @@ sample_default_first: false sample_default_at_target: false # arguments for model -surrogate_model: gp # or {"gp_hierarchy"} +surrogate_model: gp acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false acquisition_sampler: random # or {"mutation", "evolution"} diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml index 8b514ba8..0dc7a7db 100644 --- a/neps/optimizers/default_searchers/pibo.yaml +++ b/neps/optimizers/default_searchers/pibo.yaml @@ -1,6 +1,6 @@ strategy: pibo # Arguments that can be modified by the user -surrogate_model: gp # or {"gp_hierarchy"} +surrogate_model: gp acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false acquisition_sampler: mutation # or {"random", "evolution"} diff --git a/neps/optimizers/default_searchers/priorband_bo.yaml b/neps/optimizers/default_searchers/priorband_bo.yaml index 5a9fd3a9..3deb61d7 100644 --- a/neps/optimizers/default_searchers/priorband_bo.yaml +++ b/neps/optimizers/default_searchers/priorband_bo.yaml @@ -16,7 +16,7 @@ inc_style: dynamic model_based: true # crucial argument to set to allow model-search modelling_type: joint initial_design_size: 10 -surrogate_model: gp # or {"gp_hierarchy"} +surrogate_model: gp acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false acquisition_sampler: mutation # or {"random", "evolution"} diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index c1a87862..e8b34d25 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,462 +1,223 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Any -from typing_extensions import override +from typing import TYPE_CHECKING, Any, Mapping import numpy as np -import pandas as pd -from neps.optimizers.base_optimizer import BaseOptimizer -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, +from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig +from neps.optimizers.bayesian_optimization.acquisition_functions.mf_pi import MFPI_Random +from neps.optimizers.bayesian_optimization.acquisition_samplers.freeze_thaw_sampler import ( + FreezeThawSampler, ) -from neps.optimizers.multi_fidelity.mf_bo import FreezeThawModel, PFNSurrogate +from neps.optimizers.multi_fidelity.mf_bo import PFNSurrogate from neps.optimizers.multi_fidelity.utils import MFObservedData from 
neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace -from neps.utils.common import instance_from_map +from neps.state.trial import Trial if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) from neps.state.optimizer import BudgetInfo - from neps.utils.types import ConfigResult + + +def _adjust_fidelity_for_freeze_thaw_steps( + pipeline_space: SearchSpace, + step_size: int, +) -> SearchSpace: + """Adjusts the fidelity range to be divisible by `step_size` for Freeze-Thaw.""" + assert pipeline_space.fidelity is not None + + # Check if the fidelity range is divided into equal sized steps by `step_size` + fid_range = pipeline_space.fidelity.upper - pipeline_space.fidelity.lower + remainder = fid_range % step_size + if remainder == 0: + return pipeline_space + + # Adjust the fidelity lower bound to be divisible by `step_size` into equal steps + # Pushing the lower bound of the fidelity space by an offset to ensure equal-sized steps + offset = step_size - remainder + pipeline_space.fidelity.lower += offset + + warnings.warn( + f"Adjusted fidelity lower bound to {pipeline_space.fidelity.lower} " + f"for equal-sized steps of {step_size}.", + UserWarning, + stacklevel=3, + ) + return pipeline_space + + +# TODO: Maybe make this a part of searchspace functionality +def get_budget_value( + space: SearchSpace, + step_size: int, + budget_level: int | float, +) -> int | float: + assert space.fidelity is not None + match space.fidelity: + case IntegerParameter(): + return int(step_size * budget_level + space.fidelity.lower) + case FloatParameter(): + return step_size * budget_level + space.fidelity.lower + case _: + raise NotImplementedError( + f"Fidelity parameter: {space.fidelity}" + f"must be one of the types: " + f"[IntegerParameter, FloatParameter], but is type:" + f"{type(space.fidelity)}" + ) class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" - acquisition: str = "MFPI-random" - def __init__( self, pipeline_space: SearchSpace, - budget: int | None = None, - step_size: int | float = 1, - optimal_assignment: bool = False, # pylint: disable=unused-argument + step_size: int = 1, use_priors: bool = False, sample_default_first: bool = False, sample_default_at_target: bool = False, - loss_value_on_error: None | float = None, - cost_value_on_error: None | float = None, patience: int = 100, - ignore_errors: bool = False, - logger=None, # arguments for model - surrogate_model: str | Any = "ftpfn", surrogate_model_args: dict | None = None, - domain_se_kernel: str | None = None, - graph_kernels: list | None = None, - hp_kernels: list | None = None, - acquisition: str | BaseAcquisition = acquisition, - acquisition_args: dict | None = None, - acquisition_sampler: str | AcquisitionSampler = "freeze-thaw", - acquisition_sampler_args: dict | None = None, - model_policy: Any = PFNSurrogate, initial_design_size: int = 1, ): """Initialise. 
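Since _adjust_fidelity_for_freeze_thaw_steps above only documents itself through its warning message, here is a worked numeric example of the adjustment and of get_budget_value, using plain integers instead of a SearchSpace.

    # Plain-number version of the adjustment above: lower=1, upper=10, step_size=2.
    def adjusted_lower(lower: int, upper: int, step_size: int) -> int:
        remainder = (upper - lower) % step_size
        if remainder == 0:
            return lower
        return lower + (step_size - remainder)  # push lower up so the range splits evenly

    assert adjusted_lower(1, 10, 2) == 2   # range 9 -> offset 1 -> four steps of size 2
    assert adjusted_lower(1, 10, 3) == 1   # range 9 already divisible by 3, unchanged

    # get_budget_value then maps a discrete budget level back onto the fidelity scale:
    lower, step_size = 2, 2
    assert lower + step_size * 3 == 8      # budget level 3 on the adjusted scale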
Args: pipeline_space: Space in which to search - budget: Maximum budget use_priors: Allows random samples to be generated from a default Samples generated from a Gaussian centered around the default value sampling_policy: The type of sampling procedure to use promotion_policy: The type of promotion procedure to use - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error during bayesian optimization and will use given loss - value instead. default: None - cost_value_on_error: Setting this and loss_value_on_error to any float will - supress any error during bayesian optimization and will use given cost - value instead. default: None - logger: logger object, or None to use the neps logger sample_default_first: Whether to sample the default configuration first initial_design_size: Number of configurations to sample before starting optimization """ + assert self.pipeline_space.fidelity is not None + # Adjust pipeline space fidelity steps to be equally spaced - pipeline_space = self._adjust_fidelity_for_freeze_thaw_steps( - pipeline_space, step_size - ) - # Super constructor call - super().__init__( - pipeline_space=pipeline_space, - budget=budget, - patience=patience, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - ) - self.raw_tabular_space = None # placeholder, can be populated using pre_load_hook - self._budget_list: list[int | float] = [] - self.step_size: int | float = step_size - self.min_budget = self.pipeline_space.fidelity.lower - # TODO: generalize this to work with real data (not benchmarks) - self.max_budget = self.pipeline_space.fidelity.upper - self._initial_design_size = initial_design_size + pipeline_space = _adjust_fidelity_for_freeze_thaw_steps(pipeline_space, step_size) + super().__init__(pipeline_space=pipeline_space, patience=patience) - # TODO: Write use cases for these parameters - self._model_update_failed = False + self.step_size = step_size + self.use_priors = use_priors + self.surrogate_model_args = surrogate_model_args self.sample_default_first = sample_default_first self.sample_default_at_target = sample_default_at_target - self.surrogate_model_name = surrogate_model + self._initial_design_size = initial_design_size - self.use_priors = use_priors - self.total_fevals: int = 0 + self.min_budget: int | float = self.pipeline_space.fidelity.lower + self.max_budget: int | float = self.pipeline_space.fidelity.upper - self.observed_configs = MFObservedData( - columns=["config", "perf", "learning_curves"], - index_names=["config_id", "budget_id"], - ) + fidelity_name = self.pipeline_space.fidelity_name + assert isinstance(fidelity_name, str) + self.fidelity_name: str = fidelity_name - # Preparing model - self.graph_kernels, self.hp_kernels = get_default_kernels( - pipeline_space=pipeline_space, - domain_se_kernel=domain_se_kernel, - graph_kernels=graph_kernels, - hp_kernels=hp_kernels, - optimal_assignment=optimal_assignment, - ) - self.surrogate_model_args = ( - {} if surrogate_model_args is None else surrogate_model_args - ) - self._prep_model_args(self.hp_kernels, self.graph_kernels, pipeline_space) + self._model_update_failed = False - # TODO: Better solution than branching based on the surrogate name is needed - if surrogate_model in ["gp", "gp_hierarchy"]: - model_policy = FreezeThawModel - elif surrogate_model == "ftpfn": - model_policy = PFNSurrogate - else: - raise ValueError("Invalid model option selected!") + def ask( + self, + trials: 
Mapping[str, Trial], + budget_info: BudgetInfo, + optimizer_state: dict[str, Any], + seed: int | None = None, + ) -> tuple[SampledConfig, dict[str, Any]]: + if seed is not None: + raise NotImplementedError("Seed is not yet implemented for IFBO") + + observed_configs = MFObservedData.from_trials(trials) - # The surrogate model is initalized here - self.model_policy = model_policy( - pipeline_space=pipeline_space, - surrogate_model=surrogate_model, + in_initial_design_phase = ( + len(observed_configs.completed_runs) < self._initial_design_size + ) + if in_initial_design_phase: + # TODO: Copy BO setup where we can sample SOBOL or from Prior + self.logger.debug("Sampling from initial design...") + config = self.pipeline_space.sample( + patience=self.patience, user_priors=True, ignore_fidelity=False + ) + _config_dict = config.hp_values() + _config_dict.update({self.fidelity_name: self.min_budget}) + config.set_hyperparameters_from_dict(_config_dict) + _config_id = observed_configs.next_config_id() + return SampledConfig( + config=config.hp_values(), id=_config_id, previous_config_id=None + ), optimizer_state + + # TODO: Maybe just remove `PFNSurrogate` as a whole and use FTPFN directly... + # this depends on whether we can actually create a proper surrogate model abstraction + # TODO: Really all of this should just be passed into an __init__ instead of 3 stage process + model_policy = PFNSurrogate( + pipeline_space=self.pipeline_space, surrogate_model_args=self.surrogate_model_args, step_size=self.step_size, ) - self.acquisition_args = {} if acquisition_args is None else acquisition_args - self.acquisition_args.update( - { - "pipeline_space": self.pipeline_space, - "surrogate_model_name": self.surrogate_model_name, - } - ) - self.acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - kwargs=self.acquisition_args, - ) - self.acquisition_sampler_args = ( - {} if acquisition_sampler_args is None else acquisition_sampler_args - ) - self.acquisition_sampler_args.update( - {"patience": self.patience, "pipeline_space": self.pipeline_space} - ) - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs=self.acquisition_sampler_args, - ) - self.count = 0 - - def _adjust_fidelity_for_freeze_thaw_steps( - self, pipeline_space: SearchSpace, step_size: int - ) -> SearchSpace: - """Adjusts the fidelity range to be divisible by `step_size` for Freeze-Thaw.""" - if not pipeline_space.has_fidelity: - return pipeline_space - # Check if the fidelity range is divided into equal sized steps by `step_size` - remainder = ( - pipeline_space.fidelity.upper - pipeline_space.fidelity.lower - ) % step_size - if remainder == 0: - return pipeline_space - # Adjust the fidelity lower bound to be divisible by `step_size` into equal steps - offset = step_size - remainder - # Pushing the lower bound of the fidelity space by an offset to ensure equal-sized steps - pipeline_space.fidelity.lower += offset - warnings.warn( - f"Adjusted fidelity lower bound to {pipeline_space.fidelity.lower} " - f"for equal-sized steps of {step_size}." 
- ) - return pipeline_space - - def _prep_model_args(self, hp_kernels, graph_kernels, pipeline_space): - if self.surrogate_model_name in ["gp", "gp_hierarchy"]: - # setup for GP implemented in NePS - self.surrogate_model_args.update( - { - # domain_se_kernel=domain_se_kernel, - "hp_kernels": hp_kernels, - "graph_kernels": graph_kernels, - } - ) - if not self.surrogate_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - # if "vectorial_features" not in self.surrogate_model_args: - self.surrogate_model_args["vectorial_features"] = ( - pipeline_space.raw_tabular_space.get_vectorial_dim() - if pipeline_space.has_tabular - else pipeline_space.get_vectorial_dim() - ) + model_policy.observed_configs = observed_configs + model_policy.update_model() - def get_budget_level(self, config: SearchSpace) -> int: - """Calculates the discretized (int) budget level for a given configuration.""" - return int( - np.ceil((config.fidelity.value - config.fidelity.lower) / self.step_size) + # TODO: Replace with more efficient samplers we have from BO + # TODO: Just make this take in everything at __init__ instead of a 2 stage init + acquisition_sampler = FreezeThawSampler( + pipeline_space=self.pipeline_space, patience=self.patience ) - - def get_budget_value(self, budget_level: int | float) -> int | float: - if isinstance(self.pipeline_space.fidelity, IntegerParameter): - budget_val = int( - self.step_size * budget_level + self.pipeline_space.fidelity.lower - ) - elif isinstance(self.pipeline_space.fidelity, FloatParameter): - budget_val = ( - self.step_size * budget_level + self.pipeline_space.fidelity.lower - ) - else: - raise NotImplementedError( - f"Fidelity parameter: {self.pipeline_space.fidelity}" - f"must be one of the types: " - f"[IntegerParameter, FloatParameter], but is type:" - f"{type(self.pipeline_space.fidelity)}" - ) - self._budget_list.append(budget_val) - return budget_val - - def total_budget_spent(self) -> int | float: - """Calculates the toal budget spent so far, in the unit of fidelity specified. - - This is calculated as a function of the fidelity range provided, that takes into - account the minimum budget and the step size. - """ - if len(self.observed_configs.df) == 0: - return 0 - - n_configs = len(self.observed_configs.seen_config_ids) - total_budget_level = sum(self.observed_configs.seen_budget_levels) - total_initial_budget_spent = n_configs * self.pipeline_space.fidelity.lower - return total_initial_budget_spent + total_budget_level * self.step_size - - def is_init_phase(self) -> bool: - return self.num_train_configs < self._initial_design_size - - @property - def num_train_configs(self): - return len(self.observed_configs.completed_runs) - - @override - def load_optimization_state( - self, - previous_results: dict[str, ConfigResult], - pending_evaluations: dict[str, SearchSpace], - budget_info: BudgetInfo | None, - optimizer_state: dict[str, Any], - ) -> None: - """This is basically the fit method. 
- - Args: - previous_results (dict[str, ConfigResult]): [description] - pending_evaluations (dict[str, ConfigResult]): [description] - """ - self.observed_configs = MFObservedData( - columns=["config", "perf", "learning_curves"], - index_names=["config_id", "budget_id"], + acquisition_sampler.set_state( + self.pipeline_space, observed_configs, self.step_size ) - # previous optimization run exists and needs to be loaded - self._load_previous_observations(previous_results) - self.total_fevals = len(previous_results) + len(pending_evaluations) - # account for pending evaluations - self._handle_pending_evaluations(pending_evaluations) + samples = acquisition_sampler.sample(set_new_sample_fidelity=self.min_budget) - # an aesthetic choice more than a functional choice - self.observed_configs.df = self.observed_configs.df.sort_index( - level=self.observed_configs.df.index.names + # TODO: See if we can get away from `set_state` style things + # and just instantiate it with what it needs + acquisition = MFPI_Random( + pipeline_space=self.pipeline_space, surrogate_model_name="ftpfn" ) - # TODO: can we do better than keeping a copy of the observed configs? - # TODO: can we not hide this in load_results and have something that pops out - # more, like a set_state or policy_args - self.model_policy.observed_configs = self.observed_configs - # fit any model/surrogates - init_phase = self.is_init_phase() - if not init_phase: - self._fit_models() - - @classmethod - def _get_config_id_split(cls, config_id: str) -> tuple[str, str]: - # assumes config IDs of the format `[unique config int ID]_[int rung ID]` - ids = config_id.split("_") - _config, _budget = ids[0], ids[1] - return _config, _budget - - def _load_previous_observations(self, previous_results): - def index_data_split(config_id: str, config_val): - _config_id, _budget_id = IFBO._get_config_id_split(config_id) - index = int(_config_id), int(_budget_id) - _data = [ - config_val.config, - self.get_loss(config_val.result), - self.get_learning_curve(config_val.result), - ] - return index, _data - - if len(previous_results) > 0: - index_row = [ - tuple(index_data_split(config_id, config_val)) - for config_id, config_val in previous_results.items() - ] - indices, rows = zip(*index_row, strict=False) - self.observed_configs.add_data(data=list(rows), index=list(indices)) - - def _handle_pending_evaluations(self, pending_evaluations): - for config_id, config_val in pending_evaluations.items(): - _config, _budget_level = config_id.split("_") - index = (int(_config), int(_budget_level)) - - if index not in self.observed_configs.df.index: - # TODO: Validate this - self.observed_configs.add_data( - [config_val, np.nan, [np.nan]], index=index - ) - else: - self.observed_configs.update_data( - { - self.observed_configs.config_col: config_val, - self.observed_configs.perf_col: np.nan, - self.observed_configs.lc_col_name: [np.nan], - }, - index=index, - ) - - def _fit_models(self): - # TODO: Once done with development catch the model update exceptions - # and skip model based suggestions if failed (karibbov) - self._prep_model_args(self.hp_kernels, self.graph_kernels, self.pipeline_space) - self.model_policy.set_state(self.pipeline_space, self.surrogate_model_args) - self.model_policy.update_model() - self.acquisition.set_state( + acquisition.set_state( self.pipeline_space, - self.model_policy.surrogate_model, - self.observed_configs, + model_policy.surrogate_model, + observed_configs, self.step_size, ) - self.acquisition_sampler.set_state( - self.pipeline_space, 
self.observed_configs, self.step_size - ) - def _randomly_promote(self) -> tuple[SearchSpace, int]: - """Samples the initial design. + # `_samples` should have new configs with fidelities set to as required + acq, _samples = acquisition.eval(x=samples, asscalar=True) + # NOTE: len(samples) need not be equal to len(_samples) as `samples` contain + # all (partials + new) configurations obtained from the sampler, but + # in `_samples`, configs are removed that have reached maximum epochs allowed - With an unbiased coin toss (p=0.5) it decides whether to sample a new - configuration or continue a partial configuration, until initial_design_size - configurations have been sampled. - """ - # sampling a configuration ID from the observed ones - _config_ids = np.unique( - self.observed_configs.df.index.get_level_values("config_id").values - ) - _config_id = np.random.choice(_config_ids) - # extracting the config - config = self.observed_configs.df.loc[ - _config_id, self.observed_configs.config_col - ].iloc[0] - # extracting the budget level - budget = self.observed_configs.df.loc[_config_id].index.values[-1] - # calculating fidelity value - new_fidelity = self.get_budget_value(budget + 1) - # setting the config fidelity - config.update_hp_values({config.fidelity_name: new_fidelity}) - return config, _config_id - - def get_config_and_ids( # pylint: disable=no-self-use - self, - ) -> tuple[SearchSpace, str, str | None]: - """...and this is the method that decides which point to query. + best_idx = acq.argmax() + _config_id = best_idx - Returns: - [type]: [description] - """ - config_id = None - previous_config_id = None - if self.is_init_phase(): - # sample a new config till initial design size is satisfied - self.logger.info("sampling...") - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - _config_dict = config.hp_values() - _config_dict.update({config.fidelity_name: self.min_budget}) - config.set_hyperparameters_from_dict(_config_dict) - _config_id = self.observed_configs.next_config_id() - elif self.is_init_phase() or self._model_update_failed: - # promote a config randomly if initial design size is satisfied but the - # initial design budget has not been exhausted - self.logger.info("promoting...") - config, _config_id = self._randomly_promote() + # NOTE: `samples` and `_samples` should share the same index values, hence, + # avoid using `.iloc` and work with `.loc` on these pandas DataFrame/Series + config: SearchSpace = samples.loc[_config_id] + config = config.clone() + + # IMPORTANT: setting the fidelity value appropriately + if best_idx > max(observed_configs.seen_config_ids): + next_fid_value = self.min_budget else: - if self.count == 0: - self.logger.info("\nPartial learning curves as initial design:\n") - self.logger.info(f"{self.observed_configs.get_learning_curves()}\n") - self.count += 1 - # main acquisition call here after initial design is turned off - self.logger.info("acquiring...") - # generates candidate samples for acquisition calculation - samples = self.acquisition_sampler.sample( - set_new_sample_fidelity=self.pipeline_space.fidelity.lower - ) # fidelity values here should be the observations or min. 
fidelity - - # calculating acquisition function values for the candidate samples - acq, _samples = self.acquisition.eval( # type: ignore[attr-defined] - x=samples, asscalar=True + max_observed_fids = ( + observed_configs.get_max_observed_fidelity_level_per_config() ) - acq = pd.Series(acq, index=_samples.index) - - # maximizing acquisition function - best_idx = acq.sort_values().index[-1] - # extracting the config ID for the selected maximizer - _config_id = best_idx # samples.index[_samples.index.values[_idx]] - # `_samples` should have new configs with fidelities set to as required - # NOTE: len(samples) need not be equal to len(_samples) as `samples` contain - # all (partials + new) configurations obtained from the sampler, but - # in `_samples`, configs are removed that have reached maximum epochs allowed - # NOTE: `samples` and `_samples` should share the same index values, hence, - # avoid using `.iloc` and work with `.loc` on these pandas DataFrame/Series - - # assigning config hyperparameters - config = samples.loc[_config_id] - # IMPORTANT: setting the fidelity value appropriately - _fid_value = ( - config.fidelity.lower - if best_idx > max(self.observed_configs.seen_config_ids) - else ( - self.get_budget_value( - self.observed_configs.get_max_observed_fidelity_level_per_config().loc[ - best_idx - ] - ) - + self.step_size # ONE-STEP FIDELITY QUERY for freeze-thaw - ) + best_configs_max_fid = max_observed_fids.loc[best_idx] + budget_value = get_budget_value( + space=self.pipeline_space, + step_size=self.step_size, + budget_level=best_configs_max_fid, ) - config.update_hp_values({config.fidelity_name: _fid_value}) - # generating correct IDs - if _config_id in self.observed_configs.seen_config_ids: - config_id = f"{_config_id}_{self.get_budget_level(config)}" - previous_config_id = f"{_config_id}_{self.get_budget_level(config) - 1}" + next_fid_value = budget_value + self.step_size + + config.update_hp_values({self.fidelity_name: next_fid_value}) + + # Lastly, we need to generate config id for it. 
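The id bookkeeping below follows the <config>_<budget-level> convention that MFObservedData.from_trials, elsewhere in this patch, parses back apart with config_id.split("_"). A quick numeric check of the level computation with min_budget=2 and step_size=2; note also that the previous id below is built from budget_value - 1, where budget_level - 1 looks like the intended quantity.

    # Worked example of the budget-level arithmetic used for config ids below.
    import numpy as np

    min_budget, step_size = 2, 2
    next_fid_value = 8
    budget_level = int(np.ceil((next_fid_value - min_budget) / step_size))
    assert budget_level == 3

    config_id = f"7_{budget_level}"        # continuing existing config 7 one step deeper
    previous_config_id = f"7_{budget_level - 1}"
    assert (config_id, previous_config_id) == ("7_3", "7_2")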
+ budget_level = int(np.ceil((next_fid_value - self.min_budget) / self.step_size)) + if _config_id in observed_configs.seen_config_ids: + config_id = f"{_config_id}_{budget_level}" + previous_config_id = f"{_config_id}_{budget_value - 1}" else: - config_id = f"{self.observed_configs.next_config_id()}_{self.get_budget_level(config)}" + config_id = f"{observed_configs.next_config_id()}_{budget_level}" - return config.hp_values(), config_id, previous_config_id # type: ignore + return SampledConfig( + config=config.hp_values(), id=config_id, previous_config_id=previous_config_id + ), optimizer_state diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index 790c833a..729e3718 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -6,7 +6,9 @@ import torch from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping +from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate from neps.optimizers.multi_fidelity.utils import ( + MFObservedData, get_tokenized_data, get_training_data_for_freeze_thaw, ) @@ -183,66 +185,30 @@ def sample_new_config( return config -class FreezeThawModel: - """Designed to work with model search in unit step multi-fidelity algorithms.""" +class PFNSurrogate: + """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" def __init__( self, - pipeline_space, + pipeline_space: SearchSpace, surrogate_model: str = "ftpfn", surrogate_model_args: dict | None = None, step_size: int = 1, ): - self.observed_configs = None + self.train_x = None + self.train_y = None + self.observed_configs: MFObservedData | None = None self.pipeline_space = pipeline_space self.surrogate_model_name = surrogate_model self.surrogate_model_args = ( surrogate_model_args if surrogate_model_args is not None else {} ) - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - self.surrogate_model_name, - name="surrogate model", - kwargs=self.surrogate_model_args, - ) - self.step_size = step_size - - def _fantasize_pending(self, train_x, train_y, pending_x): - raise NotImplementedError("Fantasization not implemented yet!") - - def _fit(self, train_x, train_y, train_lcs): - raise NotImplementedError("Predict not implemented yet!") - - def _predict(self, test_x) -> torch.Tensor: - raise NotImplementedError("Predict not implemented yet!") - def set_state( - self, - pipeline_space, - surrogate_model_args, - **kwargs, # pylint: disable=unused-argument - ): - self.pipeline_space = pipeline_space - self.surrogate_model_args = ( - surrogate_model_args if surrogate_model_args is not None else {} - ) - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - self.surrogate_model_name, - name="surrogate model", - kwargs=self.surrogate_model_args, - ) - - -class PFNSurrogate(FreezeThawModel): - """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.train_x = None - self.train_y = None + # TODO: Lift this into the responsility of the caller of this function. 
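A note on the wrapper below: _fit performs no gradient training, since the FT-PFN is a prior-fitted network, so "fitting" only means storing (train_x, train_y) as the context that later forward passes condition on. Also worth reconciling: ftpfn.py above renames FTPFNSurrogate to FTPFNModel and makes its methods take train_x/train_y per call, while this file still imports FTPFNSurrogate and calls get_mean_performance(test_x) without context arguments. A minimal sketch of the stateless call pattern; FakeFTPFN is a made-up stand-in, not the real ifbo API.

    # Made-up stand-in showing the "context in, prediction out" usage pattern.
    import torch

    class FakeFTPFN:
        def get_mean_performance(self, train_x, train_y, test_x):
            # A real FT-PFN conditions a transformer on (train_x, train_y); here we
            # just return the train mean so the sketch runs without model weights.
            return torch.ones(test_x.shape[0]) * train_y.mean()

    model = FakeFTPFN()
    train_x, train_y = torch.rand(10, 5), torch.rand(10)
    test_x = torch.rand(3, 5)
    mean = model.get_mean_performance(train_x, train_y, test_x)
    assert mean.shape == (3,)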
+ self.surrogate_model = FTPFNSurrogate(**surrogate_model_args) + self.step_size = step_size - def update_model(self): + def update_model(self) -> None: # tokenize the observations idxs, steps, configs, performance = get_training_data_for_freeze_thaw( self.observed_configs.df.loc[self.observed_configs.completed_runs_index], @@ -295,7 +261,7 @@ def update_model(self): # refit the model, on completed runs + fantasized pending runs self._fit(train_x, train_y) - def _fit(self, train_x: torch.Tensor, train_y: torch.Tensor): # pylint: disable=unused-argument + def _fit(self, train_x: torch.Tensor, train_y: torch.Tensor) -> None: # no training required,, only preprocessing the training data as context during inference assert self.surrogate_model is not None, "Surrogate model not set!" self.surrogate_model.train_x = train_x @@ -306,10 +272,12 @@ def _predict(self, test_x: torch.Tensor) -> torch.Tensor: self.surrogate_model.train_x is not None and self.surrogate_model.train_y is not None ), "Model not trained yet!" - if self.surrogate_model_name == "ftpfn": - mean = self.surrogate_model.get_mean_performance(test_x) - if mean.is_cuda: - mean = mean.cpu() - return mean - # check neps/optimizers/bayesian_optimization/models/__init__.py for options - raise ValueError(f"Surrogate model {self.surrogate_model_name} not supported!") + return self.surrogate_model.get_mean_performance(test_x) + + def set_state( + self, + pipeline_space, + surrogate_model_args, + **kwargs, # pylint: disable=unused-argument + ): + self.pipeline_space = pipeline_space diff --git a/neps/optimizers/multi_fidelity/utils.py b/neps/optimizers/multi_fidelity/utils.py index 0158fbdf..8e7b4910 100644 --- a/neps/optimizers/multi_fidelity/utils.py +++ b/neps/optimizers/multi_fidelity/utils.py @@ -4,6 +4,7 @@ from collections.abc import Sequence from copy import deepcopy from typing import TYPE_CHECKING, Any +from typing_extensions import Self import numpy as np import pandas as pd @@ -353,6 +354,50 @@ def get_max_observed_fidelity_level_per_config(self) -> pd.Series: def token_ids(self) -> np.ndarray: return self.df.index.values + @classmethod + def from_trials(cls, trials: Mapping[str, Trial]) -> Self: + observed_configs = MFObservedData( + columns=["config", "perf", "learning_curves"], + index_names=["config_id", "budget_id"], + ) + + def _data(trial: Trial) -> Any: + # Considered pending + if report is None: + loss = np.nan + lc = [np.nan] + else: + loss = report.loss if report.loss is not None else "error" + lc = ( + report.learning_curve + if report.learning_curve is not None + else "error" + ) + + return [trial.config, loss, lc] + + # previous optimization run exists and needs to be loaded + def index_data_split( + config_id: str, trial: Trial + ) -> tuple[tuple[int, int], list]: + _config_id, _budget_id = config_id.split("_") + index = int(_config_id), int(_budget_id) + return index, _data(trial) + + if len(trials) > 0: + index_row = [ + tuple(index_data_split(trial_id, trial)) + for trial_id, trial in trials.items() + ] + indices, rows = zip(*index_row, strict=True) + observed_configs.add_data(data=list(rows), index=list(indices)) + + # an aesthetic choice more than a functional choice + observed_configs.df = observed_configs.df.sort_index( + level=self.observed_configs.df.index.names, inplace=True + ) + return observed_configs + if __name__ == "__main__": # TODO: Either delete these or convert them to tests (karibbov) diff --git a/neps/optimizers/multi_fidelity_prior/priorband.py 
b/neps/optimizers/multi_fidelity_prior/priorband.py index bdf3a567..f4bc067b 100644 --- a/neps/optimizers/multi_fidelity_prior/priorband.py +++ b/neps/optimizers/multi_fidelity_prior/priorband.py @@ -296,8 +296,6 @@ def __init__( initial_design_size: int | None = None, model_policy: typing.Any = ModelPolicy, surrogate_model: str | typing.Any = "gp", - domain_se_kernel: str | None = None, - hp_kernels: list | None = None, surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, @@ -339,8 +337,6 @@ def __init__( bo_args = { "surrogate_model": surrogate_model, - "domain_se_kernel": domain_se_kernel, - "hp_kernels": hp_kernels, "surrogate_model_args": surrogate_model_args, "acquisition": acquisition, "log_prior_weighted": log_prior_weighted, diff --git a/neps/utils/common.py b/neps/utils/common.py index 2c6f9d35..d0fb2137 100644 --- a/neps/utils/common.py +++ b/neps/utils/common.py @@ -53,7 +53,7 @@ def load_checkpoint( if not checkpoint_path.exists(): return None - checkpoint = torch.load(checkpoint_path) + checkpoint = torch.load(checkpoint_path, weights_only=True) if model is not None and "model_state_dict" in checkpoint: model.load_state_dict(checkpoint["model_state_dict"]) @@ -141,7 +141,7 @@ def load_lightning_checkpoint( assert len(ckpt_files) == 1 checkpoint_path = ckpt_files[0] - checkpoint = torch.load(checkpoint_path) + checkpoint = torch.load(checkpoint_path, weights_only=True) return checkpoint_path, checkpoint diff --git a/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py b/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py deleted file mode 100644 index 0d2acfb0..00000000 --- a/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py +++ /dev/null @@ -1,143 +0,0 @@ -import logging -import time - -from torch import nn - -import neps -from neps.optimizers.bayesian_optimization.kernels import GraphKernelMapping -from neps.optimizers.bayesian_optimization.models.gp_hierarchy import ( - ComprehensiveGPHierarchy, -) -from neps.search_spaces.architecture import primitives as ops -from neps.search_spaces.architecture import topologies as topos - -primitives = { - "id": ops.Identity(), - "conv3x3": {"op": ops.ReLUConvBN, "kernel_size": 3, "stride": 1, "padding": 1}, - "conv1x1": {"op": ops.ReLUConvBN, "kernel_size": 1}, - "avg_pool": {"op": ops.AvgPool1x1, "kernel_size": 3, "stride": 1}, - "downsample": {"op": ops.ResNetBasicblock, "stride": 2}, - "residual": topos.Residual, - "diamond": topos.Diamond, - "linear": topos.get_sequential_n_edge(2), - "diamond_mid": topos.DiamondMid, -} - -structure = { - "S": [ - "diamond D2 D2 D1 D1", - "diamond D1 D2 D2 D1", - "diamond D1 D1 D2 D2", - "linear D2 D1", - "linear D1 D2", - "diamond_mid D1 D2 D1 D2 D1", - "diamond_mid D2 D2 Cell D1 D1", - ], - "D2": [ - "diamond D1 D1 D1 D1", - "linear D1 D1", - "diamond_mid D1 D1 Cell D1 D1", - ], - "D1": [ - "diamond D1Helper D1Helper Cell Cell", - "diamond Cell Cell D1Helper D1Helper", - "diamond D1Helper Cell Cell D1Helper", - "linear D1Helper Cell", - "linear Cell D1Helper", - "diamond_mid D1Helper D1Helper Cell Cell Cell", - "diamond_mid Cell D1Helper D1Helper D1Helper Cell", - ], - "D1Helper": ["linear Cell downsample"], - "Cell": [ - "residual OPS OPS OPS", - "diamond OPS OPS OPS OPS", - "linear OPS OPS", - "diamond_mid OPS OPS OPS OPS OPS", - ], - "OPS": ["conv3x3", "conv1x1", "avg_pool", "id"], -} - - -def set_recursive_attribute(op_name, predecessor_values): - in_channels = 64 if 
predecessor_values is None else predecessor_values["c_out"] - out_channels = in_channels * 2 if op_name == "ResNetBasicblock" else in_channels - return dict(c_in=in_channels, c_out=out_channels) - - -def run_pipeline(architecture): - start = time.time() - - in_channels = 3 - n_classes = 20 - base_channels = 64 - out_channels = 512 - - model = architecture.to_pytorch() - model = nn.Sequential( - ops.Stem(base_channels, c_in=in_channels), - model, - nn.AdaptiveAvgPool2d(1), - nn.Flatten(), - nn.Linear(out_channels, n_classes), - ) - - number_of_params = sum(p.numel() for p in model.parameters()) - y = abs(1.5e7 - number_of_params) / 1.5e7 - - end = time.time() - - return { - "loss": y, - "info_dict": { - "test_score": y, - "train_time": end - start, - }, - } - - -pipeline_space = dict( - architecture=neps.FunctionParameter( - set_recursive_attribute=set_recursive_attribute, - structure=structure, - primitives=primitives, - name="makrograph", - return_graph_per_hierarchy=True, - ) -) - -early_hierarchies_considered = "0_1_2_3" -hierarchy_considered = [int(hl) for hl in early_hierarchies_considered.split("_")] -graph_kernels = ["wl"] * (len(hierarchy_considered) + 1) -wl_h = [2, 1] + [2] * (len(hierarchy_considered) - 1) -graph_kernels = [ - GraphKernelMapping[kernel]( - h=wl_h[j], - oa=False, - se_kernel=None, - ) - for j, kernel in enumerate(graph_kernels) -] -surrogate_model = ComprehensiveGPHierarchy -surrogate_model_args = { - "graph_kernels": graph_kernels, - "hp_kernels": [], - "verbose": False, - "hierarchy_consider": hierarchy_considered, - "d_graph_features": 0, - "vectorial_features": None, -} - -logging.basicConfig(level=logging.INFO) -neps.run( - run_pipeline=run_pipeline, - pipeline_space=pipeline_space, - root_directory="results/hierarchical_architecture_example_new", - max_evaluations_total=15, - searcher="bayesian_optimization", - surrogate_model=surrogate_model, - surrogate_model_args=surrogate_model_args, -) - -previous_results, pending_configs = neps.status( - "results/hierarchical_architecture_example_new" -) diff --git a/neps_examples/experimental/user_priors_from_arbitrary_densities.py b/neps_examples/experimental/user_priors_from_arbitrary_densities.py deleted file mode 100644 index 4c734cd2..00000000 --- a/neps_examples/experimental/user_priors_from_arbitrary_densities.py +++ /dev/null @@ -1,151 +0,0 @@ -import neps - -def run_pipeline(some_float, some_integer, some_cat): - if some_cat != "a": - y = some_float + some_integer - else: - y = -some_float - some_integer - return y - -# ======================================================================================== -# Current API -# User prior is given as a default value and a confidence level specified in the parameter itself -pipeline_space = dict( - some_float=neps.FloatParameter( - lower=1, upper=1000, log=True, default=900, default_confidence="medium" - ), - some_integer=neps.IntegerParameter( - lower=0, upper=50, default=35, default_confidence="low" - ), - some_cat=neps.CategoricalParameter( - choices=["a", "b", "c"], default="a", default_confidence="high" - ) -) -neps.run( - run_pipeline=run_pipeline, - pipeline_space=pipeline_space, - root_directory="results", - max_evaluations_total=15, -) - -# ======================================================================================== -# New API, variant 01 -# User prior is passed to neps.run and not specified in the pipeline_space -# The prior is given as one of the following: -# 1) A (non-factorized) density function that returns the likelihood of a given 
parameter configuration -# 2) A dicttionary of marginal densities for each parameter. Then the factorized density is used. -# 3) A dictionary of default values and confidence levels for each parameter. Then a gaussian prior is used. - -pipeline_space = dict( - some_float=neps.FloatParameter(lower=1, upper=1000, log=True), - some_integer=neps.IntegerParameter(lower=0, upper=50), - some_cat=neps.CategoricalParameter(choices=["a", "b", "c"]) -) - -# 1) A (non-factorized) density function that returns the likelihood of a given parameter configuration -def prior_01(some_float, some_integer, some_cat): - # some exponential distribution - if some_cat != "a": - return np.exp(-(some_float + some_integer - 1)) - else: - return np.exp(-(-some_float - some_integer + 1050)) - -# 2) A dictionary of marginal densities for each parameter. Then the factorized density is used. -prior_02 = dict( - some_float=lambda x: 1/400 if 800 < x < 1000 else 1/1600, # prior on interval [800, 1000] - some_integer=lambda k: 30**k/np.math.factorial(k) * np.exp(-k), # poisson prior on integers k=30 - some_cat=lambda x: 1/2*(x=="b") + 1/3*(x=="c") + 1/6*(x=="a") -) - -# 3) A dictionary of default values and confidence levels for each parameter. Then a gaussian prior is used. -prior_03 = dict( - some_float=dict(default=900, default_confidence="medium"), - some_integer=dict(default=35, default_confidence="low"), - some_cat=dict(default="a", default_confidence="high") -) - -# Combination of 2) and 3) -prior_04 = dict( - some_float=dict(default=900, default_confidence="medium"), - some_integer=lambda k: 30**k/np.math.factorial(k) * np.exp(-k), # poisson prior on integers k=30 - some_cat=dict(default="a", default_confidence="high") -) - -# Pass the prior to neps.run - -neps.run( - prior=prior_01, # or prior_02 or prior_03 or prior_04 - run_pipeline=run_pipeline, - pipeline_space=pipeline_space, - root_directory="results", - max_evaluations_total=15, -) - -# ======================================================================================== -# New API, variant 02 -# User prior is specfied in the pipeline_space and not directly passed to neps.run -# Same possibiities for priors as in variant 01 - -# 1) A (non-factorized) density function that returns the likelihood of a given parameter configuration -def prior_01(some_float, some_integer, some_cat): - # some exponential distribution - if some_cat != "a": - return np.exp(-(some_float + some_integer - 1)) - else: - return np.exp(-(-some_float - some_integer + 1050)) - -pipeline_space_01 = dict( - some_float=neps.FloatParameter(lower=1, upper=1000, log=True), - some_integer=neps.IntegerParameter(lower=0, upper=50), - some_cat=neps.CategoricalParameter(choices=["a", "b", "c"]), - _prior=prior_01 -) - -# 2) A dictionary of marginal densities for each parameter. Then the factorized density is used. -pipeline_space_02 = dict( - some_float=neps.FloatParameter( - lower=1, upper=1000, log=True, - prior_fun=lambda x: 1/400 if 800 < x < 1000 else 1/1600 - ), - some_integer=neps.IntegerParameter(lower=0, upper=50, - prior_fun=lambda k: 30**k/np.math.factorial(k) * np.exp(-k) -), - some_cat=neps.CategoricalParameter(choices=["a", "b", "c"], - prior_fun=lambda x: 1/2*(x=="b") + 1/3*(x=="c") + 1/6*(x=="a") - ) -) - -# 3) A dictionary of default values and confidence levels for each parameter. Then a gaussian prior is used. 
-# Same as in the current API -pipeline_space_03 = dict( - some_float=neps.FloatParameter( - lower=1, upper=1000, log=True, default=900, default_confidence="medium" - ), - some_integer=neps.IntegerParameter( - lower=0, upper=50, default=35, default_confidence="low" - ), - some_cat=neps.CategoricalParameter( - choices=["a", "b", "c"], default="a", default_confidence="high" - ) -) - -# Combination of 2) and 3) -pipeline_space_04 = dict( - some_float=neps.FloatParameter( - lower=1, upper=1000, log=True, default=900, default_confidence="medium", - ), - some_integer=neps.IntegerParameter( - lower=0, upper=50, - prior_fun=lambda k: 30**k/np.math.factorial(k) * np.exp(-k) - ), - some_cat=neps.CategoricalParameter( - choices=["a", "b", "c"], default="a", default_confidence="high") -) - -# Pass the pipeline_space to neps.run -neps.run( - run_pipeline=run_pipeline, - pipeline_space=pipeline_space_01, # or pipeline_space_02 or pipeline_space_03 or pipeline_space_04 - root_directory="results", - max_evaluations_total=15, -) diff --git a/tests/test_neps_api/testing_scripts/default_neps.py b/tests/test_neps_api/testing_scripts/default_neps.py index 5384042a..370c6255 100644 --- a/tests/test_neps_api/testing_scripts/default_neps.py +++ b/tests/test_neps_api/testing_scripts/default_neps.py @@ -2,9 +2,6 @@ import neps from neps.optimizers.bayesian_optimization.kernels import GraphKernelMapping -from neps.optimizers.bayesian_optimization.models.gp_hierarchy import ( - ComprehensiveGPHierarchy, -) pipeline_space_fidelity_priors = dict( val1=neps.FloatParameter(lower=-10, upper=10, default=1), @@ -63,7 +60,7 @@ def run_pipeline(val1, val2): ) for j, kernel in enumerate(graph_kernels) ] -surrogate_model = ComprehensiveGPHierarchy +surrogate_model = surrogate_model_args = { "graph_kernels": graph_kernels, "hp_kernels": [], diff --git a/tests/test_settings/test_settings.py b/tests/test_settings/test_settings.py index fcdac758..fe649563 100644 --- a/tests/test_settings/test_settings.py +++ b/tests/test_settings/test_settings.py @@ -2,8 +2,12 @@ import pytest import neps from neps.utils.run_args import get_run_args_from_yaml -from tests.test_yaml_run_args.test_yaml_run_args import (run_pipeline, hook1, hook2, - pipeline_space) +from tests.test_yaml_run_args.test_yaml_run_args import ( + run_pipeline, + hook1, + hook2, + pipeline_space, +) from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization from typing import Union, Callable, Dict, List, Type @@ -16,276 +20,291 @@ @pytest.mark.neps_api -@pytest.mark.parametrize("func_args, yaml_args, expected_output", [ - ( - { # only essential arguments provided by func_args, no yaml - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "run_args": Default(None), - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": 10, - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {}, - } - , - Default(None), - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": False, - "post_run_summary": 
True, - "development_stage_id": None, - "task_id": None, - "max_evaluations_total": 10, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": "default", - "searcher_kwargs": {} - } - ), - ({ # only required elements of run_args - "run_pipeline": Default(None), - "root_directory": Default(None), - "pipeline_space": Default(None), - "run_args": Default(None), - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": Default(None), - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {}, - }, - "/run_args_required.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": None, - "task_id": None, - "max_evaluations_total": 10, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": "default", - "searcher_kwargs": {} - }), - ({ # required via func_args, optional via yaml - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "run_args": "tests/path/to/run_args", # will be ignored by Settings - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": 10, - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {}, - }, - "/run_args_optional.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": True, - "post_run_summary": False, - "development_stage_id": None, - "task_id": None, - "max_evaluations_total": 10, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": "hyperband", - "searcher_kwargs": {} - }), - ({ # overwrite all yaml values - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "run_args": "test", - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": 5, - "task_id": None, - "max_evaluations_total": 17, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - 
"pre_load_hooks": None, - "searcher": "default", - "searcher_kwargs": {}, - } - , - "/overwrite_run_args.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": 5, - "task_id": None, - "max_evaluations_total": 17, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": "default", - "searcher_kwargs": {}, - } - ), - ({ # optimizer args special case - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "run_args": "test", - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": 5, - "task_id": None, - "max_evaluations_total": 17, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": Default("default"), - "searcher_kwargs": {"initial_design_type": "max_budget", - "use_priors": False, - "random_interleave_prob": 0.0, - "sample_default_first": False, - "sample_default_at_target": False}, - } - , - "/run_args_optimizer_settings.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": 5, - "task_id": None, - "max_evaluations_total": 17, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": { - "strategy": "hyperband", - "eta": 3, - "initial_design_type": "max_budget", - "use_priors": False, - "random_interleave_prob": 0.0, - "sample_default_first": False, - "sample_default_at_target": False}, - "searcher_kwargs": {"initial_design_type": "max_budget", - "use_priors": False, - "random_interleave_prob": 0.0, - "sample_default_first": False, - "sample_default_at_target": False}, - }), -({ # load optimizer with args - "run_pipeline": Default(None), - "root_directory": Default(None), - "pipeline_space": Default(None), - "run_args": Default(None), - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": Default(None), - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {"random_interleave_prob": 0.2, - "initial_design_size": 9}, - } - , - "/run_args_optimizer_outside.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": True, - "post_run_summary": True, - "development_stage_id": None, - "task_id": None, - "max_evaluations_total": 10, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - 
"max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": my_bayesian, - "searcher_kwargs": {"acquisition": "EI", - "acquisition_sampler": "random", - "random_interleave_prob": 0.2, - "initial_design_size": 9, - "surrogate_model": "gp" - }, - }) -]) +@pytest.mark.parametrize( + "func_args, yaml_args, expected_output", + [ + ( + { # only essential arguments provided by func_args, no yaml + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "run_args": Default(None), + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": 10, + "max_evaluations_per_run": Default(None), + "continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": Default(None), + "searcher": Default("default"), + "searcher_kwargs": {}, + }, + Default(None), + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": None, + "task_id": None, + "max_evaluations_total": 10, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "default", + "searcher_kwargs": {}, + }, + ), + ( + { # only required elements of run_args + "run_pipeline": Default(None), + "root_directory": Default(None), + "pipeline_space": Default(None), + "run_args": Default(None), + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": Default(None), + "max_evaluations_per_run": Default(None), + "continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": Default(None), + "searcher": Default("default"), + "searcher_kwargs": {}, + }, + "/run_args_required.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": None, + "task_id": None, + "max_evaluations_total": 10, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "default", + "searcher_kwargs": {}, + }, + ), + ( + { # required via func_args, optional via yaml + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "run_args": "tests/path/to/run_args", # will be ignored by Settings + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": 10, + "max_evaluations_per_run": Default(None), + 
"continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": Default(None), + "searcher": Default("default"), + "searcher_kwargs": {}, + }, + "/run_args_optional.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": True, + "post_run_summary": False, + "development_stage_id": None, + "task_id": None, + "max_evaluations_total": 10, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "hyperband", + "searcher_kwargs": {}, + }, + ), + ( + { # overwrite all yaml values + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "run_args": "test", + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": 5, + "task_id": None, + "max_evaluations_total": 17, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "default", + "searcher_kwargs": {}, + }, + "/overwrite_run_args.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": 5, + "task_id": None, + "max_evaluations_total": 17, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "default", + "searcher_kwargs": {}, + }, + ), + ( + { # optimizer args special case + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "run_args": "test", + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": 5, + "task_id": None, + "max_evaluations_total": 17, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": Default("default"), + "searcher_kwargs": { + "initial_design_type": "max_budget", + "use_priors": False, + "random_interleave_prob": 0.0, + "sample_default_first": False, + "sample_default_at_target": False, + }, + }, + "/run_args_optimizer_settings.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": 5, + "task_id": None, + "max_evaluations_total": 17, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": { + "strategy": "hyperband", + "eta": 3, + "initial_design_type": "max_budget", + "use_priors": False, + "random_interleave_prob": 0.0, + 
"sample_default_first": False, + "sample_default_at_target": False, + }, + "searcher_kwargs": { + "initial_design_type": "max_budget", + "use_priors": False, + "random_interleave_prob": 0.0, + "sample_default_first": False, + "sample_default_at_target": False, + }, + }, + ), + ( + { # load optimizer with args + "run_pipeline": Default(None), + "root_directory": Default(None), + "pipeline_space": Default(None), + "run_args": Default(None), + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": Default(None), + "max_evaluations_per_run": Default(None), + "continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": Default(None), + "searcher": Default("default"), + "searcher_kwargs": { + "random_interleave_prob": 0.2, + "initial_design_size": 9, + }, + }, + "/run_args_optimizer_outside.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": True, + "post_run_summary": True, + "development_stage_id": None, + "task_id": None, + "max_evaluations_total": 10, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": my_bayesian, + "searcher_kwargs": { + "acquisition": "EI", + "acquisition_sampler": "random", + "random_interleave_prob": 0.2, + "initial_design_size": 9, + }, + }, + ), + ], +) def test_check_settings(func_args: Dict, yaml_args: str, expected_output: Dict) -> None: """ Check if expected settings are set @@ -299,33 +318,37 @@ def test_check_settings(func_args: Dict, yaml_args: str, expected_output: Dict) @pytest.mark.neps_api -@pytest.mark.parametrize("func_args, yaml_args, error", [ - ( - { - "root_directory": Default(None), - "pipeline_space": Default(None), - "run_args": Default(None), - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": Default(None), - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {}, - }, - Default(None), - ValueError - ) -]) -def test_settings_initialization_error(func_args: Dict, yaml_args: Union[str, Default], - error: Exception) -> None: +@pytest.mark.parametrize( + "func_args, yaml_args, error", + [ + ( + { + "root_directory": Default(None), + "pipeline_space": Default(None), + "run_args": Default(None), + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": Default(None), + "max_evaluations_per_run": Default(None), + "continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": 
Default(None), + "searcher": Default("default"), + "searcher_kwargs": {}, + }, + Default(None), + ValueError, + ) + ], +) +def test_settings_initialization_error( + func_args: Dict, yaml_args: Union[str, Default], error: Exception +) -> None: """ Test if Settings raises Error when essential arguments are missing """ diff --git a/tests/test_yaml_run_args/test_yaml_run_args.py b/tests/test_yaml_run_args/test_yaml_run_args.py index 5a0c5d22..8200b2fd 100644 --- a/tests/test_yaml_run_args/test_yaml_run_args.py +++ b/tests/test_yaml_run_args/test_yaml_run_args.py @@ -5,11 +5,12 @@ from typing import Union, Callable, Dict, List, Type BASE_PATH = "tests/test_yaml_run_args/" -pipeline_space = dict(lr=neps.FloatParameter(lower=1e-3, upper=0.1), - optimizer=neps.CategoricalParameter(choices=["adam", "sgd", - "adamw"]), - epochs=neps.IntegerParameter(lower=1, upper=10), - batch_size=neps.ConstantParameter(value=64)) +pipeline_space = dict( + lr=neps.FloatParameter(lower=1e-3, upper=0.1), + optimizer=neps.CategoricalParameter(choices=["adam", "sgd", "adamw"]), + epochs=neps.IntegerParameter(lower=1, upper=10), + batch_size=neps.ConstantParameter(value=64), +) def run_pipeline(): @@ -44,8 +45,9 @@ def check_run_args(yaml_path_run_args: str, expected_output: Dict) -> None: """ output = get_run_args_from_yaml(BASE_PATH + yaml_path_run_args) - def are_functions_equivalent(f1: Union[Callable, List[Callable]], - f2: Union[Callable, List[Callable]]) -> bool: + def are_functions_equivalent( + f1: Union[Callable, List[Callable]], f2: Union[Callable, List[Callable]] + ) -> bool: """ Compares functions or lists of functions for equivalence by their bytecode, useful when identical functions have different memory addresses. This method @@ -111,8 +113,10 @@ def are_functions_equivalent(f1: Union[Callable, List[Callable]], "loss_value_on_error": 4.2, "cost_value_on_error": 3.7, "ignore_errors": True, - "searcher": {"strategy": "bayesian_optimization", - "initial_design_size": 5, "surrogate_model": "gp"}, + "searcher": { + "strategy": "bayesian_optimization", + "initial_design_size": 5, + }, "pre_load_hooks": [hook1, hook2], }, ), @@ -133,8 +137,10 @@ def are_functions_equivalent(f1: Union[Callable, List[Callable]], "loss_value_on_error": 2.4, "cost_value_on_error": 2.1, "ignore_errors": False, - "searcher": {"strategy": "bayesian_optimization", - "initial_design_size": 5, "surrogate_model": "gp"}, + "searcher": { + "strategy": "bayesian_optimization", + "initial_design_size": 5, + }, "pre_load_hooks": [hook1], }, ), @@ -147,8 +153,10 @@ def are_functions_equivalent(f1: Union[Callable, List[Callable]], "overwrite_working_directory": True, "post_run_summary": False, "continue_until_max_evaluation_completed": False, - "searcher": {"strategy": "bayesian_optimization", - "initial_design_size": 5, "surrogate_model": "gp"}, + "searcher": { + "strategy": "bayesian_optimization", + "initial_design_size": 5, + }, }, ), ( @@ -164,26 +172,27 @@ def are_functions_equivalent(f1: Union[Callable, List[Callable]], }, ), ("run_args_empty.yaml", {}), - ("run_args_optional_loading_format.yaml", { - "run_pipeline": run_pipeline, - "pipeline_space": pipeline_space, - "root_directory": "test_yaml", - "max_evaluations_total": 20, - "max_cost_total": 4.2, - "overwrite_working_directory": True, - "post_run_summary": False, - "development_stage_id": 9, - "max_evaluations_per_run": 5, - "continue_until_max_evaluation_completed": True, - "loss_value_on_error": 2.4, - "cost_value_on_error": 2.1, - "ignore_errors": False, - "searcher": 
BayesianOptimization, - "searcher_kwargs": {'initial_design_size': 5, - 'surrogate_model': 'gp'}, - "pre_load_hooks": [hook1] - - }) + ( + "run_args_optional_loading_format.yaml", + { + "run_pipeline": run_pipeline, + "pipeline_space": pipeline_space, + "root_directory": "test_yaml", + "max_evaluations_total": 20, + "max_cost_total": 4.2, + "overwrite_working_directory": True, + "post_run_summary": False, + "development_stage_id": 9, + "max_evaluations_per_run": 5, + "continue_until_max_evaluation_completed": True, + "loss_value_on_error": 2.4, + "cost_value_on_error": 2.1, + "ignore_errors": False, + "searcher": BayesianOptimization, + "searcher_kwargs": {"initial_design_size": 5}, + "pre_load_hooks": [hook1], + }, + ), ], ) def test_yaml_config(yaml_path: str, expected_output: Dict) -> None: From d1c7a859d0eccfbb259b4fe0a1f708c84ae9da61 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 24 Sep 2024 19:03:38 +0200 Subject: [PATCH 42/63] refactor: Ifbo --- neps/optimizers/base_optimizer.py | 6 +- .../freeze_thaw_sampler.py | 56 --- .../bayesian_optimization/models/__init__.py | 6 +- .../bayesian_optimization/models/ftpfn.py | 6 +- .../bayesian_optimization/optimizer.py | 100 +---- neps/optimizers/intial_design.py | 130 ++++++ neps/optimizers/multi_fidelity/ifbo.py | 424 +++++++++++------- neps/optimizers/multi_fidelity/utils.py | 78 ++-- neps/sampling/priors.py | 59 ++- neps/sampling/samplers.py | 29 +- neps/search_spaces/domain.py | 45 +- neps/search_spaces/encoding.py | 73 ++- neps/state/neps_state.py | 8 +- neps/state/trial.py | 2 +- 14 files changed, 669 insertions(+), 353 deletions(-) create mode 100644 neps/optimizers/intial_design.py diff --git a/neps/optimizers/base_optimizer.py b/neps/optimizers/base_optimizer.py index 8dd9e96f..41dc6962 100644 --- a/neps/optimizers/base_optimizer.py +++ b/neps/optimizers/base_optimizer.py @@ -19,7 +19,7 @@ class SampledConfig: id: Trial.ID config: Mapping[str, Any] - previous_config_id: Trial.ID | None + previous_config_id: Trial.ID | None = None class BaseOptimizer: @@ -76,7 +76,7 @@ def ask( trials: Mapping[str, Trial], budget_info: BudgetInfo | None, optimizer_state: dict[str, Any], - ) -> tuple[SampledConfig, dict[str, Any]]: + ) -> SampledConfig | tuple[SampledConfig, dict[str, Any]]: """Sample a new configuration. !!! 
note @@ -134,7 +134,7 @@ def ask( config, config_id, previous_config_id = self.get_config_and_ids() return SampledConfig( id=config_id, config=config, previous_config_id=previous_config_id - ), optimizer_state + ) def update_state_post_evaluation( self, state: dict[str, Any], report: Trial.Report diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py index ea22c5b1..cae5bee8 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py @@ -59,62 +59,6 @@ def _sample_new( new_configs, index=range(index_from, index_from + len(new_configs)) ) - def _sample_new_unique( - self, - index_from: int, - n: int | None = None, - patience: int = 10, - ignore_fidelity: bool = False, - ) -> pd.Series: - n = n if n is not None else self.samples_to_draw - assert ( - patience > 0 and n > 0 - ), "Patience and `samples_to_draw` must be larger than 0" - - assert self.observations is not None - assert self.pipeline_space is not None - - existing_configs = self.observations.all_configs_list() - new_configs = [] - for _ in range(n): - # Sample patience times for an unobserved configuration - for _ in range(patience): - _config = self.pipeline_space.sample( - patience=self.patience, - user_priors=False, - ignore_fidelity=ignore_fidelity, - ) - # # Convert continuous into tabular if the space is tabular - # _config = continuous_to_tabular(_config, self.tabular_space) - # Iterate over all observed configs - for config in existing_configs: - if _config.is_equal_value( - config, include_fidelity=not ignore_fidelity - ): - # if the sampled config already exists - # do the next iteration of patience - break - else: - # If the new sample is not equal to any previous - # then it's a new config - new_config = _config - break - else: - # TODO: use logger.warn here instead (karibbov) - warnings.warn( - f"Couldn't find an unobserved configuration in {patience} " - f"iterations. 
Using an observed config instead" - ) - # patience budget exhausted use the last sampled config anyway - new_config = _config - - # append the new config to the list - new_configs.append(new_config) - - return pd.Series( - new_configs, index=range(index_from, index_from + len(new_configs)) - ) - def sample( self, acquisition_function: Callable | None = None, diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 35ae2120..49ac7258 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,5 +1,9 @@ -from .ftpfn import FTPFNSurrogate +from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate +# TODO: Need the GP back here +# * What actually uses the GP SurrogateModelMapping = { "ftpfn": FTPFNSurrogate, } + +__all__ = ["FTPFNSurrogate", "SurrogateModelMapping"] diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 6f697033..2041396f 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -71,14 +71,14 @@ def _cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor: _CACHED_FTPFN_MODEL: dict[tuple[str, str], FTPFN] = {} -class FTPFNModel: +class FTPFNSurrogate: """Wrapper around the IfBO model.""" def __init__( self, target_path: Path | None = None, version: str = "0.0.1", - **kwargs: Any, + device: torch.device | None = None, ): if target_path is None: # TODO: We also probably want to link this to the actual root directory @@ -91,7 +91,7 @@ def __init__( key = (str(target_path), version) ftpfn = _CACHED_FTPFN_MODEL.get(key) if ftpfn is None: - ftpfn = FTPFN(target_path=target_path, version=version) + ftpfn = FTPFN(target_path=target_path, version=version, device=device) _CACHED_FTPFN_MODEL[key] = ftpfn self.ftpfn = ftpfn diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 4188a5fe..11eca577 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -22,7 +22,8 @@ default_single_obj_gp, optimize_acq, ) -from neps.sampling import Prior, Sampler +from neps.optimizers.intial_design import make_initial_design +from neps.sampling import Prior from neps.search_spaces.encoding import TensorEncoder from neps.search_spaces.hyperparameters.categorical import CategoricalParameter @@ -129,43 +130,6 @@ def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float: raise ValueError("No cost budget provided!") -# TODO: This needs to be moved to the search space class, however -# to not break the current prior based APIs used elsewhere, we can -# just manually create this here. -# We use confidence here where `0` means no confidence and `1` means -# absolute confidence. 
This gets translated in to std's and weights -# accordingly in a `CenteredPrior` -def _make_prior( - parameters: dict[str, CategoricalParameter | FloatParameter | IntegerParameter], -) -> Prior: - _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} - - domains: dict[str, Domain] = {} - centers: dict[str, tuple[Any, float]] = {} - categoricals: set[str] = set() - for name, hp in parameters.items(): - domains[name] = hp.domain # type: ignore - - if isinstance(hp, CategoricalParameter): - categoricals.add(name) - - if hp.default is None: - continue - - confidence_str = hp.default_confidence_choice - confidence_score = _mapping[confidence_str] - center = hp._default_index if isinstance(hp, CategoricalParameter) else hp.default - - centers[name] = (center, confidence_score) - - # Uses truncnorms for numerical and weighted choices categoricals - return Prior.make_centered( - domains=domains, - centers=centers, - categoricals=categoricals, - ) - - class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" @@ -179,6 +143,7 @@ def __init__( # noqa: D417 sample_default_first: bool = False, device: torch.device | None = None, encoder: TensorEncoder | None = None, + seed: int | None = None, treat_fidelity_as_hyperparameters: bool = False, ): """Initialise the BO loop. @@ -186,8 +151,7 @@ def __init__( # noqa: D417 Args: pipeline_space: Space in which to search initial_design_size: Number of samples used before using the surrogate model. - If None, it will take `int(log(N) ** 2)` samples where `N` is the number - of parameters in the search space. + If None, it will use the number of parameters in the search space. use_priors: Whether to use priors set on the hyperparameters during search. use_cost: Whether to consider reported "cost" from configurations in decision making. If True, the optimizer will weigh potential candidates by how much @@ -199,6 +163,7 @@ def __init__( # noqa: D417 If using `cost`, cost must be provided in the reports of the trials. sample_default_first: Whether to sample the default configuration first. + seed: Seed to use for the random number generator of samplers. device: Device to use for the optimization. encoder: Encoder to use for encoding the configurations. If None, it will will use the default encoder. @@ -221,17 +186,11 @@ def __init__( # noqa: D417 if treat_fidelity_as_hyperparameters: params.update(pipeline_space.fidelities) - if initial_design_size is None: - # As we have fairly regularized GPs, who start with a more smooth landscape - # model, we don't need a high level of initial samples. 
- ndims = len(params) - initial_design_size = max(2, int(math.log(ndims) ** 2)) - elif initial_design_size < 1: - raise ValueError("Initial_design_size to be at least 1") - self.encoder = TensorEncoder.default(params) if encoder is None else encoder + self.prior = Prior.from_parameters(params) if use_priors is True else None + self.treat_fidelity_as_hyperparameters = treat_fidelity_as_hyperparameters + self.seed = seed self.use_cost = use_cost - self.prior = _make_prior(params) if use_priors is True else None self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size @@ -243,44 +202,34 @@ def ask( budget_info: BudgetInfo, optimizer_state: dict[str, Any], seed: int | None = None, - ) -> tuple[SampledConfig, dict[str, Any]]: + ) -> SampledConfig: if seed is not None: raise NotImplementedError( "Seed is not yet implemented for BayesianOptimization" ) n_trials_sampled = len(trials) - space = self.pipeline_space config_id = str(n_trials_sampled + 1) - # Fill intitial design data if we don't have any... + # If we havn't passed the intial design phase if self.initial_design_ is None: - self.initial_design_ = [] - - if self.sample_default_first: - config = space.sample_default_configuration() - self.initial_design_.append(config.hp_values()) - - sampler = ( - self.prior if self.prior else Sampler.sobol(self.encoder.ncols, seed=seed) - ) - n_samples = self.n_initial_design - len(self.initial_design_) - - x = sampler.sample( - n_samples * 2, - to=self.encoder.domains, + self.initial_design_ = make_initial_design( + space=self.pipeline_space, + encoder=self.encoder, + sample_default_first=self.sample_default_first, + sampler=self.prior if self.prior is not None else "sobol", seed=seed, - device=self.device, + sample_size=( + "ndim" if self.n_initial_design is None else self.n_initial_design + ), + sample_fidelity=( + "max" if not self.treat_fidelity_as_hyperparameters else True + ), ) - uniq_x = torch.unique(x, dim=0) - configs = self.encoder.unpack(uniq_x[:n_samples]) - self.initial_design_.extend(configs) - # If we havn't passed the intial design phase if n_trials_sampled < len(self.initial_design_): config = self.initial_design_[n_trials_sampled] - sample = SampledConfig(id=config_id, config=config, previous_config_id=None) - return sample, optimizer_state + return SampledConfig(id=config_id, config=config) # Now we actually do the BO loop, start by encoding the data # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... @@ -347,7 +296,7 @@ def ask( pibo_exp_term = _pibo_exp_term( n_trials_sampled, self.encoder.ncols, - self.n_initial_design, + len(self.initial_design_), ) # If the amount of weight derived from the pibo exponent becomes @@ -398,5 +347,4 @@ def ask( assert len(candidates) == 1, "Expected only one candidate!" 
config = self.encoder.unpack(candidates)[0] - sample = SampledConfig(id=config_id, config=config, previous_config_id=None) - return sample, optimizer_state + return SampledConfig(id=config_id, config=config) diff --git a/neps/optimizers/intial_design.py b/neps/optimizers/intial_design.py new file mode 100644 index 00000000..f2109f00 --- /dev/null +++ b/neps/optimizers/intial_design.py @@ -0,0 +1,130 @@ +from collections.abc import Sequence +from dataclasses import dataclass, field + +from typing import Literal, Any, Mapping + +from neps.sampling import Sampler +from neps.sampling.priors import Prior +from neps.search_spaces.encoding import TensorEncoder +from neps.search_spaces.search_space import SearchSpace +import torch + + +def make_initial_design( + space: SearchSpace, + encoder: TensorEncoder, + sampler: Literal["sobol", "prior", "uniform"] | Sampler, + sample_size: int | Literal["ndim"] | None = "ndim", + sample_default_first: bool = True, + sample_fidelity: ( + Literal["min", "max", True] | int | float | dict[str, int | float] + ) = True, + seed: int | None = None, +) -> list[dict[str, Any]]: + """Generate the initial design of the optimization process. + + Args: + space: The search space to use. + encoder: The encoder to use for encoding/decoding configurations. + sampler: The sampler to use for the initial design. + + If set to "sobol", a Sobol sequence will be used. + If set to "uniform", a uniform random sampler will be used. + If set to "prior", a prior sampler will be used, based on the defaults, + and confidence scores of the hyperparameters. + If set to a custom sampler, the sampler will be used directly. + + sample_size: + The number of configurations to sample. + + If "ndim", the number of configurations will be equal to the number of dimensions. + If None, no configurations will be sampled. + + sample_default_first: Whether to sample the default configuration first. + sample_fidelity: + At what fidelity to sample the configurations, including the default. + + If set to "min" or "max", the configuration will be sampled + at the minimum or maximum fidelity, respectively. If set to an integer + or a float, the configuration will be sampled at that fidelity. + When specified as a dictionary, the keys should be the names of the + fidelity parameters and the values should be the target fidelities. + If set to `True`, the configuration will have its fidelity randomly sampled. + seed: The seed to use for the random number generator of samplers. + + """ + configs: list[dict[str, Any]] = [] + + # First, we establish what fidelity to apply to them. + match sample_fidelity: + case "min": + fids = {name: fid.lower for name, fid in space.fidelities.items()} + case "max": + fids = {name: fid.upper for name, fid in space.fidelities.items()} + case True: + fids = {name: hp.sample_value() for name, hp in space.fidelities.items()} + case int() | float(): + if len(space.fidelities) != 1: + raise ValueError( + "The target fidelity should be specified as a dictionary" + " if there are multiple fidelities or no fidelity should" + " be specified." 
+ " Current search space has fidelities: " + f"{list(space.fidelities.keys())}" + ) + name = next(iter(space.fidelities.keys())) + fids = {name: sample_fidelity} + case Mapping(): + missing_keys = set(space.fidelities.keys()) - set(sample_fidelity.keys()) + if any(missing_keys): + raise ValueError( + f"Missing target fidelities for the following fidelities: " + f"{missing_keys}" + ) + fids = sample_fidelity + case _: + raise ValueError( + "Invalid value for `sample_default_at_target`. " + "Expected 'min', 'max', True, int, float, or dict." + ) + + if sample_default_first: + # TODO: No way to pass a seed to the sampler + default = { + name: hp.default if hp.default is not None else hp.sample_value() + for name, hp in space.hyperparameters.items() + } + configs.append({**default, **fids}) + + params = {**space.numerical, **space.categoricals} + ndims = len(params) + + if sample_size == "ndim": + sample_size = ndims + elif sample_size is not None and not sample_size > 0: + raise ValueError( + "The sample size should be a positive integer if passing an int." + ) + + print("sample", sample_size, ndims) + if sample_size is not None: + match sampler: + case "sobol": + sampler = Sampler.sobol(ndim=len(params)) + case "uniform": + sampler = Sampler.uniform(ndim=len(params)) + case "prior": + sampler = Prior.from_parameters(params) + case _: + sampler = sampler + + encoded_configs = sampler.sample( + sample_size * 2, + to=encoder.domains, + seed=seed, + ) + uniq_x = torch.unique(encoded_configs, dim=0) + sample_configs = encoder.unpack(uniq_x[:sample_size]) + configs.extend([{**config, **fids} for config in sample_configs]) + + return configs diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index e8b34d25..066a0f99 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,70 +1,61 @@ -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING, Any, Mapping +from typing import Any, Mapping +import math import numpy as np +import torch from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.acquisition_functions.mf_pi import MFPI_Random -from neps.optimizers.bayesian_optimization.acquisition_samplers.freeze_thaw_sampler import ( - FreezeThawSampler, -) -from neps.optimizers.multi_fidelity.mf_bo import PFNSurrogate -from neps.optimizers.multi_fidelity.utils import MFObservedData +from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate +from neps.optimizers.intial_design import make_initial_design +from neps.sampling.samplers import Sampler +from neps.search_spaces.domain import Domain +from neps.search_spaces.encoding import CategoricalToUnitNorm, TensorEncoder from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace from neps.state.trial import Trial +from neps.state.optimizer import BudgetInfo + + +def _select_trials(trials: Mapping[str, Trial]) -> Mapping[str, Trial]: + return { + trial_id: trial + for trial_id, trial in trials.items() + if trial.state + not in ( + Trial.State.FAILED, + Trial.State.CRASHED, + Trial.State.UNKNOWN, + Trial.State.CORRUPTED, + ) + } -if TYPE_CHECKING: - from neps.state.optimizer import BudgetInfo - - -def _adjust_fidelity_for_freeze_thaw_steps( - pipeline_space: SearchSpace, - step_size: int, -) -> SearchSpace: - """Adjusts the fidelity range to be divisible by `step_size` for Freeze-Thaw.""" - assert pipeline_space.fidelity is not None - - 
# Check if the fidelity range is divided into equal sized steps by `step_size` - fid_range = pipeline_space.fidelity.upper - pipeline_space.fidelity.lower - remainder = fid_range % step_size - if remainder == 0: - return pipeline_space - - # Adjust the fidelity lower bound to be divisible by `step_size` into equal steps - # Pushing the lower bound of the fidelity space by an offset to ensure equal-sized steps - offset = step_size - remainder - pipeline_space.fidelity.lower += offset - - warnings.warn( - f"Adjusted fidelity lower bound to {pipeline_space.fidelity.lower} " - f"for equal-sized steps of {step_size}.", - UserWarning, - stacklevel=3, - ) - return pipeline_space - - -# TODO: Maybe make this a part of searchspace functionality -def get_budget_value( - space: SearchSpace, - step_size: int, - budget_level: int | float, -) -> int | float: - assert space.fidelity is not None - match space.fidelity: - case IntegerParameter(): - return int(step_size * budget_level + space.fidelity.lower) - case FloatParameter(): - return step_size * budget_level + space.fidelity.lower - case _: - raise NotImplementedError( - f"Fidelity parameter: {space.fidelity}" - f"must be one of the types: " - f"[IntegerParameter, FloatParameter], but is type:" - f"{type(space.fidelity)}" - ) + +def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: + # Does a lexsort, same as if we sorted by (config_id, budget), where + # theyre are sorted according to increasing config_id and then increasing budget. + # x[i2] -> sorted by config id and budget + i1 = torch.argsort(x[:, 1]) + i2 = i1[torch.argsort(x[i1][:, 0], stable=True)] + sorted_x = x[i2] + + # Now that it's sorted, we essentially want to count the occurence of each id into counts + _, counts = torch.unique_consecutive(sorted_x[:, 0], return_counts=True) + + # Now we can use these counts to get to the last occurence of each id + # The -1 is because we want to index from 0 but sum starts at 1. + ii = counts.cumsum(0) - 1 + return sorted_x[ii] + + +# NOTE: Ifbo was trained using 32 bit +FTPFN_DTYPE = torch.float32 + + +def tokenize( + ids: torch.Tensor, + budgets: torch.Tensor, + configs: torch.Tensor, +) -> torch.Tensor: + return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) class IFBO(BaseOptimizer): @@ -77,10 +68,11 @@ def __init__( use_priors: bool = False, sample_default_first: bool = False, sample_default_at_target: bool = False, - patience: int = 100, # arguments for model surrogate_model_args: dict | None = None, - initial_design_size: int = 1, + initial_design_size: int | None = None, + device: torch.device | None = None, + **kwargs: Any, # TODO: Remove this ): """Initialise. @@ -92,29 +84,66 @@ def __init__( promotion_policy: The type of promotion procedure to use sample_default_first: Whether to sample the default configuration first initial_design_size: Number of configurations to sample before starting optimization - """ - assert self.pipeline_space.fidelity is not None - # Adjust pipeline space fidelity steps to be equally spaced - pipeline_space = _adjust_fidelity_for_freeze_thaw_steps(pipeline_space, step_size) - super().__init__(pipeline_space=pipeline_space, patience=patience) + If None, the number of configurations will be equal to the number of dimensions. 
+ + device: Device to use for the model + """ + assert pipeline_space.fidelity is not None + assert isinstance(pipeline_space.fidelity_name, str) + super().__init__(pipeline_space=pipeline_space) self.step_size = step_size self.use_priors = use_priors - self.surrogate_model_args = surrogate_model_args self.sample_default_first = sample_default_first self.sample_default_at_target = sample_default_at_target - - self._initial_design_size = initial_design_size - - self.min_budget: int | float = self.pipeline_space.fidelity.lower - self.max_budget: int | float = self.pipeline_space.fidelity.upper - - fidelity_name = self.pipeline_space.fidelity_name - assert isinstance(fidelity_name, str) - self.fidelity_name: str = fidelity_name - - self._model_update_failed = False + self.surrogate_model_args = surrogate_model_args or {} + self.device = device + self.n_initial_design: int | None = initial_design_size + + self._min_budget: int | float = pipeline_space.fidelity.lower + self._max_budget: int | float = pipeline_space.fidelity.upper + self._fidelity_name: str = pipeline_space.fidelity_name + self._ftpfn_encoder: TensorEncoder = TensorEncoder.default( + { + **self.pipeline_space.numerical, + **self.pipeline_space.categoricals, + }, + custom_transformers={ + cat_name: CategoricalToUnitNorm(choices=cat.choices) + for cat_name, cat in self.pipeline_space.categoricals.items() + }, + ) + self._initial_design: list[dict[str, Any]] | None = None + + # TODO: We want it to be evenly divided by step size, so we need + # to add something to the minimum fidelity to ensure this. + + # NOTE: The PFN model expects fidelities to be normalized between 0 and 1, + # hence, we make sure to do min-max normalization but include the number of bins. + # Also, it expects it specifically in column 1 so we can't include it with the configs + maybe_bins = math.ceil((self._max_budget - self._min_budget) / self.step_size) + 1 + match pipeline_space.fidelity: + case IntegerParameter(): + assert pipeline_space.fidelity.domain.cardinality is not None + bins = max(maybe_bins, pipeline_space.fidelity.domain.cardinality) + case FloatParameter(): + bins = maybe_bins + case _: + raise NotImplementedError( + f"Fidelity type {type(pipeline_space.fidelity)} not supported" + ) + + # Domain of fidelity values, i.e. what is given in the configs that we + # give to the user to evaluate at. 
+        self._fid_domain = pipeline_space.fidelity.domain
+
+        # Domain in which we should pass budgets to ifbo model
+        self._budget_domain = Domain.float(1 / self._max_budget, 1)
+
+        # Domain from which we assign an index to each budget
+        # Automatically takes care of rounding
+        self._budget_index_domain = Domain.indices(bins)
 
     def ask(
         self,
@@ -122,102 +151,187 @@ def ask(
         budget_info: BudgetInfo,
         optimizer_state: dict[str, Any],
         seed: int | None = None,
-    ) -> tuple[SampledConfig, dict[str, Any]]:
+    ) -> SampledConfig:
         if seed is not None:
            raise NotImplementedError("Seed is not yet implemented for IFBO")
 
-        observed_configs = MFObservedData.from_trials(trials)
+        trials = _select_trials(trials)
 
-        in_initial_design_phase = (
-            len(observed_configs.completed_runs) < self._initial_design_size
-        )
-        if in_initial_design_phase:
-            # TODO: Copy BO setup where we can sample SOBOL or from Prior
-            self.logger.debug("Sampling from initial design...")
-            config = self.pipeline_space.sample(
-                patience=self.patience, user_priors=True, ignore_fidelity=False
+        ids = [
+            int(trial.metadata.id.split("_", maxsplit=1)[0]) for trial in trials.values()
+        ]
+        n_unique_ids = len(set(ids))
+        new_id = max(ids) + 1 if len(ids) > 0 else 0
+
+        # If we haven't passed the initial design phase
+        if self._initial_design is None:
+            self._initial_design = make_initial_design(
+                space=self.pipeline_space,
+                encoder=self._ftpfn_encoder,
+                sample_default_first=self.sample_default_first,
+                sampler="sobol",
+                seed=seed,
+                sample_fidelity="min",
+                sample_size=(
+                    "ndim" if self.n_initial_design is None else self.n_initial_design
+                ),
             )
-            _config_dict = config.hp_values()
-            _config_dict.update({self.fidelity_name: self.min_budget})
-            config.set_hyperparameters_from_dict(_config_dict)
-            _config_id = observed_configs.next_config_id()
-            return SampledConfig(
-                config=config.hp_values(), id=_config_id, previous_config_id=None
-            ), optimizer_state
-
-        # TODO: Maybe just remove `PFNSurrogate` as a whole and use FTPFN directly...
- # this depends on whether we can actually create a proper surrogate model abstraction - # TODO: Really all of this should just be passed into an __init__ instead of 3 stage process - model_policy = PFNSurrogate( - pipeline_space=self.pipeline_space, - surrogate_model_args=self.surrogate_model_args, - step_size=self.step_size, + + if n_unique_ids < len(self._initial_design): + config = self._initial_design[n_unique_ids] + return SampledConfig(id=f"{new_id}_0", config=config) + + # Otherwise, we proceed to surrogate phase + ftpfn = FTPFNSurrogate( + target_path=self.surrogate_model_args.get("target_path", None), + version=self.surrogate_model_args.get("version", "0.0.1"), + device=self.device, ) - model_policy.observed_configs = observed_configs - model_policy.update_model() - # TODO: Replace with more efficient samplers we have from BO - # TODO: Just make this take in everything at __init__ instead of a 2 stage init - acquisition_sampler = FreezeThawSampler( - pipeline_space=self.pipeline_space, patience=self.patience + # NOTE: `0` is reserved in PFN, we add an additional +1 to all ids + train_ids = torch.tensor(ids, dtype=FTPFN_DTYPE, device=self.device) + 1 + train_configs = self._ftpfn_encoder.encode([t.config for t in trials.values()]) + + train_fidelities = [t.config[self._fidelity_name] for t in trials.values()] + train_budgets = self._budget_domain.cast( + torch.tensor(train_fidelities, device=self.device, dtype=FTPFN_DTYPE), + frm=self._fid_domain, ) - acquisition_sampler.set_state( - self.pipeline_space, observed_configs, self.step_size + x_train = tokenize(ids=train_ids, budgets=train_budgets, configs=train_configs) + x_train = x_train.to(FTPFN_DTYPE) + + # TODO: Document that it's on the user to ensure these are already all bounded + # We could possibly include some bounded transform to assert this. + minimize_ys = torch.tensor( + [ + trial.report.loss + if trial.report is not None and trial.report.loss is not None + else np.nan + for trial in trials.values() + ], + device=self.device, + dtype=FTPFN_DTYPE, ) + if not all(0 <= y <= 1.0 for y in minimize_ys): + raise RuntimeError( + "ifBO requires that all loss values reported lie in the interval [0, 1]" + " but recieved loss value outside of that range!" + f"\n{minimize_ys}" + ) + # Invert the ys + maximize_ys = 1 - minimize_ys + maximize_best_y = maximize_ys.max().item() + is_pending = minimize_ys.isnan() + + maximize_ys[is_pending] = ftpfn.get_mean_performance( + train_x=x_train[~is_pending], + train_y=maximize_ys[~is_pending], + test_x=x_train[is_pending], + ) + + rng = np.random.RandomState(seed) + n_rand = 1_000 # TODO: parametrize + + # TODO: Could also sample from a prior... + uniform = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) - samples = acquisition_sampler.sample(set_new_sample_fidelity=self.min_budget) + # We sample the horizon in terms of step numbers to take + lower_index = self._budget_index_domain.lower + upper_index = self._budget_index_domain.upper + # The plus 1 here is because we want to sample that at least one step + # should be taken. 
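As a rough numeric illustration of the index-to-budget mapping used in the next few lines: the bounds below are made up (an integer fidelity 1..20 with step_size=1, giving bins=20), and it assumes `Domain.cast_one` amounts to a plain linear rescaling between the two domains' bounds.

# Illustration only -- the numbers and the linear-rescaling assumption are not from the patch.
bins = 20
max_budget = 20
budget_lo, budget_hi = 1 / max_budget, 1.0       # the [1/max_budget, 1] budget domain
index_lo, index_hi = 0, bins - 1                 # the Domain.indices(bins) index domain

def index_to_budget(i: float) -> float:
    unit = (i - index_lo) / (index_hi - index_lo)        # index -> [0, 1]
    return budget_lo + unit * (budget_hi - budget_lo)    # [0, 1] -> budget domain

horizon_steps = 5                                 # a sampled horizon of 5 index steps
print(index_to_budget(horizon_steps))             # ~0.30, later added to the budget column and clamped at 1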
+ horizon_index_increment = rng.randint(lower_index, upper_index) + 1 - # TODO: See if we can get away from `set_state` style things - # and just instantiate it with what it needs - acquisition = MFPI_Random( - pipeline_space=self.pipeline_space, surrogate_model_name="ftpfn" + # We then normalize it to FTPFN normalized budget domain + horizon = self._budget_domain.cast_one( + horizon_index_increment, + frm=self._budget_index_domain, ) - acquisition.set_state( - self.pipeline_space, - model_policy.surrogate_model, - observed_configs, - self.step_size, + + # Now let's create the random configurations + rand_configs = uniform.sample( + n=n_rand, + to=self._ftpfn_encoder.domains, + seed=None, # TODO + device=self.device, + ).to(FTPFN_DTYPE) + + # We give them all the special 0 id, as well as set the budget accordinly + acq_new = tokenize( + ids=torch.zeros(n_rand, dtype=FTPFN_DTYPE, device=self.device), + budgets=torch.zeros(n_rand, dtype=FTPFN_DTYPE, device=self.device), + configs=rand_configs, ) - # `_samples` should have new configs with fidelities set to as required - acq, _samples = acquisition.eval(x=samples, asscalar=True) - # NOTE: len(samples) need not be equal to len(_samples) as `samples` contain - # all (partials + new) configurations obtained from the sampler, but - # in `_samples`, configs are removed that have reached maximum epochs allowed + # Construct all our samples for acqusition: + # 1. Take all non-pending configs + acq_train = x_train[~is_pending].clone().detach() + + # 2. We only want to include the configuration rows + # that are at their highest budget, + # i.e. don't include config_0_0 and config_0_1 + acq_train = _remove_duplicates(acq_train) + + # 3. Sub select all that are at a partial budget i.e. can evaluate further + # Note, it's important to do this after the above + partial_eval_mask = acq_train[:, 1] < 1 + acq_train = acq_train[partial_eval_mask] + + # 4. Add in the new sampled configurations + acq_samples = torch.vstack([acq_train, acq_new]) + + # 5. 
Add on the horizon to the budget, and clamping to maximum + # Note that we hold onto the intermediate unclamped budget for later + unclamped_budgets = acq_samples[:, 1] + horizon + acq_samples[:, 1] = torch.clamp(unclamped_budgets, max=1) + + # Now get the PI of these samples + lu = 10 ** rng.uniform(-4, -1) + f_inc = maximize_best_y * (1 - lu) + pi_new_samples = ftpfn.get_pi( + train_x=x_train.to(FTPFN_DTYPE), + train_y=maximize_ys.to(FTPFN_DTYPE), + test_x=acq_samples.to(FTPFN_DTYPE), + y_best=torch.full( + size=(len(acq_samples),), + fill_value=f_inc, + dtype=FTPFN_DTYPE, + ), + ) + best_ix = pi_new_samples.argmax() - best_idx = acq.argmax() - _config_id = best_idx + # Extract out the row which had the best PI + best_id = int(acq_samples[best_ix, 0].round().item()) + best_vector = acq_samples[best_ix, 2:].unsqueeze(0) + best_config = self._ftpfn_encoder.unpack(best_vector)[0] - # NOTE: `samples` and `_samples` should share the same index values, hence, - # avoid using `.iloc` and work with `.loc` on these pandas DataFrame/Series - config: SearchSpace = samples.loc[_config_id] - config = config.clone() + if best_id == 0: + # A newly sampled configuration was deemed more promising + config_id = f"{new_id}_0" + best_config[self._fidelity_name] = self._min_budget + previous_config_id = None + return SampledConfig(config_id, best_config, previous_config_id) - # IMPORTANT: setting the fidelity value appropriately - if best_idx > max(observed_configs.seen_config_ids): - next_fid_value = self.min_budget else: - max_observed_fids = ( - observed_configs.get_max_observed_fidelity_level_per_config() + # To calculate the next step to take in fidelity space, we remove the horizon + previous_budget_of_acquired_config = unclamped_budgets[best_ix] - horizon + + # Then we transform this: + # 1. Back to budget_index space + # 2. Increment it by one + # 3. Transform back to fidelity space + budget_ix = self._budget_index_domain.cast_one( + float(previous_budget_of_acquired_config), frm=self._budget_domain ) - best_configs_max_fid = max_observed_fids.loc[best_idx] - budget_value = get_budget_value( - space=self.pipeline_space, - step_size=self.step_size, - budget_level=best_configs_max_fid, + budget_ix += 1 + fid_value = self._fid_domain.cast_one( + budget_ix, frm=self._budget_index_domain ) - next_fid_value = budget_value + self.step_size - config.update_hp_values({self.fidelity_name: next_fid_value}) + real_best_id = best_id - 1 # NOTE: Remove the +1 we added to all ids + best_config[self._fidelity_name] = fid_value - # Lastly, we need to generate config id for it. 
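For reference, the `<config index>_<budget level>` id convention relied on both here and in the new `ask()` can be sketched with a made-up helper (`continuation_ids` does not exist in the codebase; it only illustrates the naming scheme):

def continuation_ids(config_index: int, budget_level: int) -> tuple[str, str | None]:
    # id of the trial to evaluate, plus the id of the trial it continues from (if any)
    trial_id = f"{config_index}_{budget_level}"
    previous_id = f"{config_index}_{budget_level - 1}" if budget_level > 0 else None
    return trial_id, previous_id

print(continuation_ids(3, 5))   # ('3_5', '3_4') -> continue config 3 one step further
print(continuation_ids(7, 0))   # ('7_0', None)  -> a freshly sampled configuration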
- budget_level = int(np.ceil((next_fid_value - self.min_budget) / self.step_size)) - if _config_id in observed_configs.seen_config_ids: - config_id = f"{_config_id}_{budget_level}" - previous_config_id = f"{_config_id}_{budget_value - 1}" - else: - config_id = f"{observed_configs.next_config_id()}_{budget_level}" + config_id = f"{real_best_id}_{budget_ix}" + previous_config_id = f"{real_best_id}_{budget_ix - 1}" - return SampledConfig( - config=config.hp_values(), id=config_id, previous_config_id=previous_config_id - ), optimizer_state + return SampledConfig(config_id, best_config, previous_config_id) diff --git a/neps/optimizers/multi_fidelity/utils.py b/neps/optimizers/multi_fidelity/utils.py index 8e7b4910..b9d6e174 100644 --- a/neps/optimizers/multi_fidelity/utils.py +++ b/neps/optimizers/multi_fidelity/utils.py @@ -355,47 +355,55 @@ def token_ids(self) -> np.ndarray: return self.df.index.values @classmethod - def from_trials(cls, trials: Mapping[str, Trial]) -> Self: - observed_configs = MFObservedData( + def from_trials( + cls, + trials: Mapping[str, Trial], + *, + # TODO: We should store dicts, not the SearchSpace object... + # Once done, we can remove this + space: SearchSpace, + on_error: Literal["ignore"] | float = "ignore", + ) -> Self: + observed_configs = cls( columns=["config", "perf", "learning_curves"], index_names=["config_id", "budget_id"], ) - def _data(trial: Trial) -> Any: - # Considered pending - if report is None: + records: list[dict[str, Any]] = [] + for trial_id, trial in trials.items(): + _config_id, _budget_id = trial_id.split("_") + + if trial.report is None: loss = np.nan lc = [np.nan] - else: - loss = report.loss if report.loss is not None else "error" - lc = ( - report.learning_curve - if report.learning_curve is not None - else "error" - ) - - return [trial.config, loss, lc] - - # previous optimization run exists and needs to be loaded - def index_data_split( - config_id: str, trial: Trial - ) -> tuple[tuple[int, int], list]: - _config_id, _budget_id = config_id.split("_") - index = int(_config_id), int(_budget_id) - return index, _data(trial) - - if len(trials) > 0: - index_row = [ - tuple(index_data_split(trial_id, trial)) - for trial_id, trial in trials.items() - ] - indices, rows = zip(*index_row, strict=True) - observed_configs.add_data(data=list(rows), index=list(indices)) - - # an aesthetic choice more than a functional choice - observed_configs.df = observed_configs.df.sort_index( - level=self.observed_configs.df.index.names, inplace=True - ) + elif trial.report.loss is None: + assert trial.report.err is not None + if on_error == "ignore": + return None + + loss = on_error + lc = [on_error] + elif trial.report.loss is not None: + loss = trial.report.loss + assert trial.report.learning_curve is not None + lc = trial.report.learning_curve + + records.append( + { + "config_id": int(_config_id), + "budget_id": int(_budget_id), + # NOTE: Behavoiour around data in this requires that the dataframe stores + # `SearchSpace` objects and not dictionaries + "config": space.from_dict(trial.config), + "perf": loss, + "learning_curves": lc, + } + ) + + observed_configs.df = pd.DataFrame.from_records( + records, + index=["config_id", "budget_id"], + ).sort_index() return observed_configs diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 83c40e68..62c81ed8 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -22,11 +22,14 @@ TruncatedNormal, ) from neps.sampling.samplers import Sampler, WeightedSampler +from neps.search_spaces 
import CategoricalParameter from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain if TYPE_CHECKING: from torch.distributions import Distribution + from neps.search_spaces import FloatParameter, IntegerParameter + class Prior(Sampler, Protocol): """A protocol for priors over search spaces. @@ -103,7 +106,50 @@ def uniform(cls, ncols: int) -> UniformPrior: Args: ncols: The number of columns in the tensor to sample. """ - return UniformPrior(ncols=ncols) + return UniformPrior(ndims=ncols) + + @classmethod + def from_parameters( + cls, + parameters: dict[str, CategoricalParameter | FloatParameter | IntegerParameter], + ) -> Prior: + """Please refer to [`make_centered()`][neps.priors.Prior.make_centered] + for more details. This is a shortcut method. + """ + # TODO: This needs to be moved to the search space class, however + # to not break the current prior based APIs used elsewhere, we can + # just manually create this here. + # We use confidence here where `0` means no confidence and `1` means + # absolute confidence. This gets translated in to std's and weights + # accordingly in a `CenteredPrior` + _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} + + domains: dict[str, Domain] = {} + centers: dict[str, tuple[Any, float]] = {} + categoricals: set[str] = set() + for name, hp in parameters.items(): + domains[name] = hp.domain # type: ignore + + if isinstance(hp, CategoricalParameter): + categoricals.add(name) + + if hp.default is None: + continue + + confidence_str = hp.default_confidence_choice + confidence_score = _mapping[confidence_str] + center = ( + hp._default_index if isinstance(hp, CategoricalParameter) else hp.default + ) + + centers[name] = (center, confidence_score) + + # Uses truncnorms for numerical and weighted choices categoricals + return Prior.make_centered( + domains=domains, + centers=centers, + categoricals=categoricals, + ) @classmethod def make_centered( @@ -356,9 +402,14 @@ class UniformPrior(Prior): Uses a UnitUniform under the hood before converting to the value domain. """ - ncols: int + ndims: int """The number of columns in the tensor to sample from.""" + @property + @override + def ncols(self) -> int: + return self.ndims + @override def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: # NOTE: We just assume everything is in bounds... @@ -378,9 +429,9 @@ def sample( raise NotImplementedError("Seeding is not yet implemented.") _n = ( - torch.Size((n, self.ncols)) + torch.Size((n, self.ndims)) if isinstance(n, int) - else torch.Size((*n, self.ncols)) + else torch.Size((*n, self.ndims)) ) samples = torch.rand(_n, device=device, dtype=torch.float64) return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index 43758094..64105534 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -9,7 +9,7 @@ from collections.abc import Sequence from dataclasses import dataclass, field from functools import reduce -from typing import Protocol +from typing import TYPE_CHECKING, Protocol from typing_extensions import override import torch @@ -17,6 +17,9 @@ from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain +if TYPE_CHECKING: + from neps.sampling.priors import UniformPrior + class Sampler(Protocol): """A protocol for sampling tensors and vonerting them to a given domain.""" @@ -53,18 +56,31 @@ def sample( ... 
@classmethod - def sobol(cls, ndim: int, *, scramble: bool = True, seed: int | None = None) -> Sobol: + def sobol(cls, ndim: int, *, scramble: bool = True) -> Sobol: """Create a Sobol sampler. Args: ndim: The number of columns to sample. scramble: Whether to scramble the Sobol sequence. - seed: The seed for the Sobol sequence. Returns: A Sobol sampler. """ - return Sobol(ndim=ndim, scramble=scramble, seed=seed) + return Sobol(ndim=ndim, scramble=scramble) + + @classmethod + def uniform(cls, ndim: int) -> UniformPrior: + """Create a uniform sampler. + + Args: + ndim: The number of columns to sample. + + Returns: + A uniform sampler. + """ + from neps.sampling.priors import UniformPrior + + return UniformPrior(ndims=ndim) # Technically this could be a prior with a uniform distribution @@ -75,9 +91,6 @@ class Sobol(Sampler): ndim: int """The number of dimensions to sample for.""" - seed: int | None = None - """The seed for the Sobol sequence.""" - scramble: bool = True """Whether to scramble the Sobol sequence.""" @@ -113,7 +126,7 @@ def sample( sobol = torch.quasirandom.SobolEngine( dimension=self.ndim, scramble=self.scramble, - seed=self.seed, + seed=seed, ) out = torch.empty(_n, self.ncols, dtype=torch.float64, device=device) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index c1a10196..a5151aab 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -179,6 +179,15 @@ def int( bins=bins, ) + def next_value(self, x: Tensor) -> Tensor: + """Get the next value for a tensor of values.""" + if self.cardinality is None: + raise ValueError("Domain is non-finite, cannot get next value.") + cardinality_domain = Domain.indices(self.cardinality) + current_step = cardinality_domain.cast(x, frm=self) + bounded_next_step = (current_step + 1).clamp_max(self.cardinality - 1) + return self.cast(bounded_next_step, frm=cardinality_domain) + @classmethod def indices(cls, n: int) -> Domain[int]: """Create a domain for a range of indices. @@ -348,7 +357,7 @@ def translate( raise ValueError( "The number of domains in `to` must match the number of tensors" " if provided as a list." - f" Expected {ndims} from last dimension of {x.shape}, got {len(to)}." + f" Expected {ndims} from last dimension of {x.shape=}, got {len(to)}." ) out = torch.empty_like(x) @@ -357,5 +366,39 @@ def translate( return out + def cast_one(self, x: float | int, frm: Domain) -> float | int: + """Cast a single value from the domain `frm` to this domain. + + Args: + x: Value in the `frm` domain to cast to this domain. + frm: The domain to cast from. + + Returns: + Value cast to this domain. + """ + return self.cast(torch.tensor(x), frm=frm).item() + + def from_unit_one(self, x: float) -> float | int: + """Transform a single value from the unit interval [0, 1] to this domain. + + Args: + x: A value in the unit interval [0, 1] to convert. + + Returns: + Value lifted into this domain. + """ + return self.from_unit(torch.tensor(x)).item() + + def to_unit_one(self, x: float | int) -> float: + """Transform a single value from this domain to the unit interval [0, 1]. + + Args: + x: Value in this domain to convert. + + Returns: + Value normalized to the unit interval [0, 1]. 
+ """ + return self.to_unit(torch.tensor(x)).item() + UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 5f68aff9..eef1b25b 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -102,16 +102,56 @@ def decode(self, x: torch.Tensor) -> list[Any]: return [self.choices[int(i)] for i in torch.round(x).tolist()] +@dataclass +class CategoricalToUnitNorm(TensorTransformer): + choices: Sequence[Any] + + domain: Domain = field(init=False) + _integer_transformer: CategoricalToIntegerTransformer = field(init=False) + + def __post_init__(self): + self._integer_transformer = CategoricalToIntegerTransformer(self.choices) + + @override + def encode( + self, + x: Sequence[Any], + *, + out: torch.Tensor | None = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + integers = self._integer_transformer.encode( + x, + dtype=dtype if dtype is not None else torch.float64, + device=device, + out=out, + ) + if out is not None: + return integers.div_(len(self.choices) - 1) + + return integers / (len(self.choices) - 1) + + @override + def decode(self, x: torch.Tensor) -> list[Any]: + x = torch.round(x * (len(self.choices) - 1)).type(torch.int64) + return self._integer_transformer.decode(x) + + # TODO: Maybe add a shift argument, could be useful to have `0` as midpoint # and `-0.5` as lower bound with `0.5` as upper bound. @dataclass class MinMaxNormalizer(TensorTransformer, Generic[V]): original_domain: Domain[V] + bins: int | None = None domain: Domain[float] = field(init=False) def __post_init__(self): - self.domain = UNIT_FLOAT_DOMAIN + if self.bins is None: + self.domain = UNIT_FLOAT_DOMAIN + else: + self.domain = Domain.float(0.0, 1.0, bins=self.bins) @override def encode( @@ -128,7 +168,7 @@ def encode( else: dtype = torch.float64 if dtype is None else dtype - values = torch.tensor(list(x), dtype=dtype, device=device) + values = torch.tensor(x, dtype=dtype, device=device) values = self.domain.cast(values, frm=self.original_domain) if out is None: return values @@ -251,7 +291,10 @@ def encode( return buffer def pack( - self, x: Sequence[Mapping[str, Any]], *, device: torch.device | None = None + self, + x: Sequence[Mapping[str, Any]], + *, + device: torch.device | None = None, ) -> TensorPack: return TensorPack(self.encode(x, device=device), self) @@ -269,15 +312,27 @@ def unpack(self, x: torch.Tensor) -> list[dict[str, Any]]: ] @classmethod - def default(cls, parameters: Mapping[str, Parameter]) -> TensorEncoder: + def default( + cls, + parameters: Mapping[str, Parameter], + *, + custom_transformers: dict[str, TensorTransformer] | None = None, + ) -> TensorEncoder: + custom = custom_transformers or {} sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} for name, hp in sorted_params: - if isinstance(hp, FloatParameter | IntegerParameter): - transformers[name] = MinMaxNormalizer(hp.domain) - else: - assert isinstance(hp, CategoricalParameter) - transformers[name] = CategoricalToIntegerTransformer(hp.choices) + if name in custom: + transformers[name] = custom[name] + continue + + match hp: + case FloatParameter() | IntegerParameter(): + transformers[name] = MinMaxNormalizer(hp.domain) + case CategoricalParameter(): + transformers[name] = CategoricalToIntegerTransformer(hp.choices) + case _: + raise ValueError(f"Unsupported parameter type: {type(hp)}") return TensorEncoder(transformers) diff --git a/neps/state/neps_state.py 
b/neps/state/neps_state.py index dd7d9279..056df9b4 100644 --- a/neps/state/neps_state.py +++ b/neps/state/neps_state.py @@ -111,12 +111,18 @@ def sample_trial( # NOTE: We don't want optimizers mutating this before serialization budget = opt_state.budget.clone() if opt_state.budget is not None else None - sampled_config, new_opt_state = optimizer.ask( + sampled_config_maybe_new_opt_state = optimizer.ask( trials=trials, budget_info=budget, optimizer_state=opt_state.shared_state, ) + if isinstance(sampled_config_maybe_new_opt_state, tuple): + sampled_config, new_opt_state = sampled_config_maybe_new_opt_state + else: + sampled_config = sampled_config_maybe_new_opt_state + new_opt_state = opt_state.shared_state + if sampled_config.previous_config_id is not None: previous_trial = trials.get(sampled_config.previous_config_id) if previous_trial is None: diff --git a/neps/state/trial.py b/neps/state/trial.py index 0360300c..05d2e129 100644 --- a/neps/state/trial.py +++ b/neps/state/trial.py @@ -132,7 +132,7 @@ class Trial: MetaData: ClassVar = MetaData NotReportedYetError: ClassVar = NotReportedYetError - config: dict[str, Any] + config: Mapping[str, Any] metadata: MetaData state: State report: Report | None From e9318785662de3dd1702837b6efa40867f6c09b3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 09:42:17 +0200 Subject: [PATCH 43/63] fix(ifbo): bin count for budget index --- neps/optimizers/multi_fidelity/ifbo.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 066a0f99..42838a52 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -118,15 +118,11 @@ def __init__( # TODO: We want it to be evenly divided by step size, so we need # to add something to the minimum fidelity to ensure this. - - # NOTE: The PFN model expects fidelities to be normalized between 0 and 1, - # hence, we make sure to do min-max normalization but include the number of bins. - # Also, it expects it specifically in column 1 so we can't include it with the configs maybe_bins = math.ceil((self._max_budget - self._min_budget) / self.step_size) + 1 match pipeline_space.fidelity: case IntegerParameter(): assert pipeline_space.fidelity.domain.cardinality is not None - bins = max(maybe_bins, pipeline_space.fidelity.domain.cardinality) + bins = min(maybe_bins, pipeline_space.fidelity.domain.cardinality) case FloatParameter(): bins = maybe_bins case _: From becd324eac9b0f985c7d5ab989a37711bae44a21 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 09:43:36 +0200 Subject: [PATCH 44/63] optim: Use torch operation for validation --- neps/optimizers/multi_fidelity/ifbo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 42838a52..5ff2e03d 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -208,7 +208,7 @@ def ask( device=self.device, dtype=FTPFN_DTYPE, ) - if not all(0 <= y <= 1.0 for y in minimize_ys): + if minimize_ys.max() > 1 or minimize_ys.min() < 0: raise RuntimeError( "ifBO requires that all loss values reported lie in the interval [0, 1]" " but recieved loss value outside of that range!" 
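One way to read the bin-count fix above: an integer fidelity should never get more budget levels than it has distinct values, so the step-derived count is capped with `min`. A small self-contained sketch, with made-up numbers chosen so the two quantities differ:

import math

min_budget, max_budget, step_size = 1, 9, 0.5     # made-up integer fidelity 1..9
cardinality = max_budget - min_budget + 1         # 9 distinct integer values

maybe_bins = math.ceil((max_budget - min_budget) / step_size) + 1   # 17 grid points
bins = min(maybe_bins, cardinality)               # capped at 9; the previous `max` would give 17

print(maybe_bins, bins)                           # 17 9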
From 5149034252bcbff3221636972b36168b9e9bc8dc Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 13:24:01 +0200 Subject: [PATCH 45/63] refactor: Cleanup --- .../acquisition_functions/mf_pi.py | 208 ------------------ .../freeze_thaw_sampler.py | 146 ------------ neps/optimizers/multi_fidelity/ifbo.py | 167 +++++++------- neps/optimizers/multi_fidelity/utils.py | 53 ----- 4 files changed, 90 insertions(+), 484 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py deleted file mode 100644 index 75c7f1e3..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py +++ /dev/null @@ -1,208 +0,0 @@ -# type: ignore -from __future__ import annotations - -from collections.abc import Iterable -from typing import TYPE_CHECKING, Any - -import numpy as np -import torch - -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.multi_fidelity.utils import ( - MFObservedData, - get_freeze_thaw_normalized_step, - get_tokenized_data, -) -from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids - -if TYPE_CHECKING: - import pandas as pd - - from neps.search_spaces.search_space import SearchSpace - - -class MFPI(BaseAcquisition): - def __init__( - self, - pipeline_space: SearchSpace, - surrogate_model_name: str | None = None, - ): - super().__init__() - self.pipeline_space = pipeline_space - self.surrogate_model_name = surrogate_model_name - self.surrogate_model = None - self.observations = None - self.b_step = None - - def set_state( - self, - pipeline_space: SearchSpace, - surrogate_model: Any, - observations: MFObservedData, - b_step: int | float, - **kwargs, - ): - # overload to select incumbent differently through observations - self.pipeline_space = pipeline_space - self.surrogate_model = surrogate_model - self.observations = observations - self.b_step = b_step - - def preprocess(self, x: pd.Series) -> tuple[pd.Series, torch.Tensor]: - """Prepares the configurations for appropriate EI calculation. - - Takes a set of points and computes the budget and incumbent for each point, as - required by the multi-fidelity Expected Improvement acquisition function. 
- """ - raise NotImplementedError - - def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Series]: - # deepcopy - # _x = pd.Series([deepcopy(x.loc[idx]) for idx in x.index.values], index=x.index) - if self.surrogate_model_name == "ftpfn": - # preprocesses configs to have the appropriate fidelity values for acquisition - _x, inc_list = self.preprocess(x.copy()) - _x_tok = get_tokenized_data(_x.values, ignore_fidelity=True) - # padding IDs - _idx = torch.Tensor(_x.index.values + 1) - idx_mask = np.where(_idx > max(self.observations.seen_config_ids))[0] - _idx[idx_mask] = 0 - # normalizing steps - _steps = torch.Tensor( - [ - get_freeze_thaw_normalized_step( - _conf.fidelity.value, - self.pipeline_space.fidelity.lower, - self.pipeline_space.fidelity.upper, - self.b_step, - ) - for _conf in _x - ] - ) - _x_tok = torch.hstack( - ((_idx).reshape(-1, 1), _steps.reshape(-1, 1), torch.Tensor(_x_tok)) - ) - pi = self.eval_pfn_pi(_x_tok, inc_list) - else: - raise ValueError( - f"Unrecognized surrogate model name: {self.surrogate_model_name}" - ) - if pi.is_cuda: - pi = pi.cpu() - if len(_x) > 1 and asscalar: - return pi.detach().numpy(), _x - return pi.detach().numpy().item(), _x - - def eval_pfn_pi( - self, x: Iterable, inc_list: Iterable - ) -> np.ndarray | torch.Tensor | float: - """PFN-PI modified to preprocess samples and accept list of incumbents.""" - pi = self.surrogate_model.get_pi(x.to(self.surrogate_model.device), inc_list) - if len(pi.shape) == 2: - pi = pi.flatten() - return pi - - -class MFPI_Random(MFPI): - BUDGET = 1000 - - def __init__( - self, - pipeline_space: SearchSpace, - horizon: str = "random", - threshold: str = "random", - surrogate_model_name: str | None = None, - ): - super().__init__(pipeline_space, surrogate_model_name) - self.horizon = horizon - self.threshold = threshold - - def set_state( - self, - pipeline_space: SearchSpace, - surrogate_model: Any, - observations: MFObservedData, - b_step: int | float, - seed: int = 42, - ) -> None: - # set RNG - self.rng = np.random.RandomState(seed=seed) - - # TODO: wut is this? - for _i in range(len(observations.completed_runs)): - self.rng.uniform(-4, -1) - self.rng.randint(1, 51) - - return super().set_state(pipeline_space, surrogate_model, observations, b_step) - - def sample_horizon(self, steps_passed): - if self.horizon == "random": - shortest = self.pipeline_space.fidelity.lower - longest = min(self.pipeline_space.fidelity.upper, self.BUDGET - steps_passed) - return self.rng.randint(shortest, longest + 1) - if self.horizon == "max": - return min(self.pipeline_space.fidelity.upper, self.BUDGET - steps_passed) - return int(self.horizon) - - def sample_performance_threshold(self, f_inc): - if self.threshold == "random": - lu = 10 ** self.rng.uniform(-4, -1) # % of gap closed - else: - lu = float(self.threshold) - return f_inc * (1 - lu) - - def preprocess(self, x: pd.Series) -> tuple[pd.Series, torch.Tensor]: - """Prepares the configurations for appropriate EI calculation. - - Takes a set of points and computes the budget and incumbent for each point, as - required by the multi-fidelity acquisition function. 
- """ - if self.pipeline_space.has_tabular: - # preprocess tabular space differently - # expected input: IDs pertaining to the tabular data - x = map_real_hyperparameters_from_tabular_ids(x, self.pipeline_space) - - indices_to_drop = [] - inc_list = [] - - steps_passed = len(self.observations.completed_runs) - - # Like EI-AtMax, use the global incumbent as a basis for the EI threshold - inc_value = min(self.observations.get_best_performance_for_each_budget()) - - # Extension: Add a random min improvement threshold to encourage high risk high gain - t_value = self.sample_performance_threshold(inc_value) - inc_value = t_value - - # Like MFEI: set fidelities to query using horizon as self.b_step - # Extension: Unlike DyHPO, we sample the horizon randomly over the full range - horizon = self.sample_horizon(steps_passed) - - for i, config in x.items(): - if i <= max(self.observations.seen_config_ids): - if np.equal(config.fidelity.value, config.fidelity.upper): - # this training run has ended, drop it from future selection - indices_to_drop.append(i) - else: - # a candidate partial training run to continue - config.update_hp_values( - { - config.fidelity_name: min( - config.fidelity.value + horizon, config.fidelity.upper - ) # if horizon exceeds max, query at max - } - ) - inc_list.append(inc_value) - else: - # a candidate new training run that we would need to start - config.update_hp_values({config.fidelity_name: horizon}) - inc_list.append(inc_value) - - # Drop unused configs - x = x.drop(labels=indices_to_drop) - - assert len(inc_list) == len(x) - - return x, torch.Tensor(inc_list) diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py deleted file mode 100644 index cae5bee8..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py +++ /dev/null @@ -1,146 +0,0 @@ -from __future__ import annotations - -import warnings -from collections.abc import Callable -from copy import deepcopy -from typing import TYPE_CHECKING - -import numpy as np -import pandas as pd - -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) - -if TYPE_CHECKING: - from neps.optimizers.multi_fidelity.utils import MFObservedData - from neps.search_spaces.search_space import SearchSpace - -SAMPLES_TO_DRAW = ( - 100 # number of random samples to draw for optimizing acquisition function -) - - -class FreezeThawSampler(AcquisitionSampler): - def __init__(self, samples_to_draw: int | None = None, **kwargs): - super().__init__(**kwargs) - self.observations = None - self.b_step = None - self.n = None - self.pipeline_space = None - # args to manage tabular spaces/grid - self.is_tabular = False # flag is set by `set_state()` - self.sample_full_table = None - self.samples_to_draw = ( - samples_to_draw if samples_to_draw is not None else SAMPLES_TO_DRAW - ) - self.set_sample_full_tabular(True) # sets flag that samples full table - - def set_sample_full_tabular(self, flag: bool = False): - if self.is_tabular: - self.sample_full_table = flag - - def _sample_new( - self, - index_from: int, - n: int | None = None, - ignore_fidelity: bool = False, - ) -> pd.Series: - n = n if n is not None else self.samples_to_draw - assert self.pipeline_space is not None - new_configs = [ - self.pipeline_space.sample( - patience=self.patience, user_priors=False, ignore_fidelity=ignore_fidelity - ) - for _ in range(n) - ] - - return 
pd.Series( - new_configs, index=range(index_from, index_from + len(new_configs)) - ) - - def sample( - self, - acquisition_function: Callable | None = None, - n: int | None = None, - set_new_sample_fidelity: int | float | None = None, - ) -> pd.Series: - """Samples a new set and returns the total set of observed + new configs.""" - assert self.observations is not None - assert self.pipeline_space is not None - - partial_configs = self.observations.get_partial_configs_at_max_seen() - - _n = n if n is not None else self.samples_to_draw - if self.is_tabular: - assert self.pipeline_space.custom_grid_table is not None - # handles tabular data such that the entire unseen set of configs from the - # table is considered to be the new set of candidates - _partial_ids = {conf["id"].value for conf in partial_configs} - _all_ids = set(self.pipeline_space.custom_grid_table.keys()) - - # accounting for unseen configs only, samples remaining table if flag is set - max_n = len(_all_ids) + 1 if self.sample_full_table else _n - _n = min(max_n, len(_all_ids - _partial_ids)) - - _new_configs = np.random.choice( - list(_all_ids - _partial_ids), size=_n, replace=False - ) - placeholder_config = self.pipeline_space.sample( - patience=self.patience, user_priors=False, ignore_fidelity=False - ) - _configs = [placeholder_config.clone() for _id in _new_configs] - for _i, val in enumerate(_new_configs): - _configs[_i]["id"].set_value(val) - - new_configs = pd.Series( - _configs, - index=np.arange( - len(partial_configs), len(partial_configs) + len(_new_configs) - ), - ) - else: - # handles sampling new configurations for continuous spaces - new_configs = self._sample_new( - index_from=self.observations.next_config_id(), n=_n, ignore_fidelity=False - ) - # Continuous benchmarks need to deepcopy individual configs here, - # because in contrast to tabular benchmarks - # they are not reset in every sampling step - - # TODO: I do not know what the f p_config_ is meant to be so I don't know - # if we have a specific clone method or not... 
- partial_configs = pd.Series( - [deepcopy(p_config_) for idx, p_config_ in partial_configs.items()], - index=partial_configs.index, - ) - - # Updating fidelity values - new_fid = ( - set_new_sample_fidelity - if set_new_sample_fidelity is not None - else self.pipeline_space.fidelity.lower - ) - for config in new_configs: - config.update_hp_values({config.fidelity_name: new_fid}) - - return pd.concat([deepcopy(partial_configs), new_configs]) - - def set_state( - self, - pipeline_space: SearchSpace, - observations: MFObservedData, - b_step: int, - n: int | None = None, - ) -> None: - # overload to select incumbent differently through observations - self.pipeline_space = pipeline_space - self.observations = observations - self.b_step = b_step - self.n = n if n is not None else self.samples_to_draw - if ( - hasattr(self.pipeline_space, "custom_grid_table") - and self.pipeline_space.custom_grid_table is not None - ): - self.is_tabular = True - self.set_sample_full_tabular(True) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 5ff2e03d..e7caa38d 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -15,8 +15,29 @@ from neps.state.optimizer import BudgetInfo -def _select_trials(trials: Mapping[str, Trial]) -> Mapping[str, Trial]: - return { +# NOTE: Ifbo was trained using 32 bit +FTPFN_DTYPE = torch.float32 + + +def tokenize( + ids: torch.Tensor, + budgets: torch.Tensor, + configs: torch.Tensor, +) -> torch.Tensor: + return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) + + +def _encode_ftpfn( + trials: Mapping[str, Trial], + encoder: TensorEncoder, + space: SearchSpace, + budget_domain: Domain, + device: torch.device | None = None, + dtype: torch.dtype = FTPFN_DTYPE, +) -> tuple[torch.Tensor, torch.Tensor]: + # TODO: Currently we do not handle error cases, we can't use NaN as that + # is what we use for trials that have no loss yet, i.e. pending trials. + selected = { trial_id: trial for trial_id, trial in trials.items() if trial.state @@ -27,6 +48,45 @@ def _select_trials(trials: Mapping[str, Trial]) -> Mapping[str, Trial]: Trial.State.CORRUPTED, ) } + assert space.fidelity_name is not None + assert space.fidelity is not None + train_configs = encoder.encode([t.config for t in selected.values()], device=device) + ids = torch.tensor( + [int(config_id.split("_", maxsplit=1)[0]) for config_id in selected.keys()], + device=device, + dtype=torch.float64, + ) + ids = ids + train_fidelities = torch.tensor( + [t.config[space.fidelity_name] for t in selected.values()], + device=device, + dtype=torch.float64, + ) + train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) + X = tokenize( + ids=torch.tensor(ids, device=device), budgets=train_budgets, configs=train_configs + ).to(dtype) + + # TODO: Document that it's on the user to ensure these are already all bounded + # We could possibly include some bounded transform to assert this. + minimize_ys = torch.tensor( + [ + trial.report.loss + if trial.report is not None and trial.report.loss is not None + else np.nan + for trial in trials.values() + ], + device=device, + dtype=FTPFN_DTYPE, + ) + if minimize_ys.max() > 1 or minimize_ys.min() < 0: + raise RuntimeError( + "ifBO requires that all loss values reported lie in the interval [0, 1]" + " but recieved loss value outside of that range!" 
+ f"\n{minimize_ys}" + ) + maximize_ys = 1 - minimize_ys + return X, maximize_ys def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: @@ -46,18 +106,6 @@ def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: return sorted_x[ii] -# NOTE: Ifbo was trained using 32 bit -FTPFN_DTYPE = torch.float32 - - -def tokenize( - ids: torch.Tensor, - budgets: torch.Tensor, - configs: torch.Tensor, -) -> torch.Tensor: - return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) - - class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" @@ -71,6 +119,7 @@ def __init__( # arguments for model surrogate_model_args: dict | None = None, initial_design_size: int | None = None, + n_acquisition_new_configs: int = 1_000, device: torch.device | None = None, **kwargs: Any, # TODO: Remove this ): @@ -100,6 +149,7 @@ def __init__( self.surrogate_model_args = surrogate_model_args or {} self.device = device self.n_initial_design: int | None = initial_design_size + self.n_acquisition_new_configs = n_acquisition_new_configs self._min_budget: int | float = pipeline_space.fidelity.lower self._max_budget: int | float = pipeline_space.fidelity.upper @@ -151,12 +201,7 @@ def ask( if seed is not None: raise NotImplementedError("Seed is not yet implemented for IFBO") - trials = _select_trials(trials) - - ids = [ - int(trial.metadata.id.split("_", maxsplit=1)[0]) for trial in trials.values() - ] - n_unique_ids = len(set(ids)) + ids = [int(config_id.split("_", maxsplit=1)[0]) for config_id in trials.keys()] new_id = max(ids) + 1 if len(ids) > 0 else 0 # If we havn't passed the intial design phase @@ -173,9 +218,8 @@ def ask( ), ) - if n_unique_ids < len(self._initial_design): - config = self._initial_design[n_unique_ids] - return SampledConfig(id=f"{new_id}_0", config=config) + if new_id < len(self._initial_design): + return SampledConfig(id=f"{new_id}_0", config=self._initial_design[new_id]) # Otherwise, we proceed to surrogate phase ftpfn = FTPFNSurrogate( @@ -183,42 +227,20 @@ def ask( version=self.surrogate_model_args.get("version", "0.0.1"), device=self.device, ) - - # NOTE: `0` is reserved in PFN, we add an additional +1 to all ids - train_ids = torch.tensor(ids, dtype=FTPFN_DTYPE, device=self.device) + 1 - train_configs = self._ftpfn_encoder.encode([t.config for t in trials.values()]) - - train_fidelities = [t.config[self._fidelity_name] for t in trials.values()] - train_budgets = self._budget_domain.cast( - torch.tensor(train_fidelities, device=self.device, dtype=FTPFN_DTYPE), - frm=self._fid_domain, - ) - x_train = tokenize(ids=train_ids, budgets=train_budgets, configs=train_configs) - x_train = x_train.to(FTPFN_DTYPE) - - # TODO: Document that it's on the user to ensure these are already all bounded - # We could possibly include some bounded transform to assert this. - minimize_ys = torch.tensor( - [ - trial.report.loss - if trial.report is not None and trial.report.loss is not None - else np.nan - for trial in trials.values() - ], + x_train, maximize_ys = _encode_ftpfn( + trials=trials, + encoder=self._ftpfn_encoder, + space=self.pipeline_space, + budget_domain=self._budget_domain, device=self.device, - dtype=FTPFN_DTYPE, ) - if minimize_ys.max() > 1 or minimize_ys.min() < 0: - raise RuntimeError( - "ifBO requires that all loss values reported lie in the interval [0, 1]" - " but recieved loss value outside of that range!" 
- f"\n{minimize_ys}" - ) - # Invert the ys - maximize_ys = 1 - minimize_ys + x_train[:, 1] = x_train[:, 1] + 1 # PFN uses `0` id for test configurations + + # Get the best performance so far maximize_best_y = maximize_ys.max().item() - is_pending = minimize_ys.isnan() + # Fantasize the result of pending trials + is_pending = maximize_ys.isnan() maximize_ys[is_pending] = ftpfn.get_mean_performance( train_x=x_train[~is_pending], train_y=maximize_ys[~is_pending], @@ -226,9 +248,6 @@ def ask( ) rng = np.random.RandomState(seed) - n_rand = 1_000 # TODO: parametrize - - # TODO: Could also sample from a prior... uniform = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) # We sample the horizon in terms of step numbers to take @@ -244,19 +263,16 @@ def ask( frm=self._budget_index_domain, ) - # Now let's create the random configurations - rand_configs = uniform.sample( - n=n_rand, - to=self._ftpfn_encoder.domains, - seed=None, # TODO - device=self.device, - ).to(FTPFN_DTYPE) - # We give them all the special 0 id, as well as set the budget accordinly acq_new = tokenize( - ids=torch.zeros(n_rand, dtype=FTPFN_DTYPE, device=self.device), - budgets=torch.zeros(n_rand, dtype=FTPFN_DTYPE, device=self.device), - configs=rand_configs, + ids=torch.zeros(self.n_acquisition_new_configs, device=self.device), + budgets=torch.zeros(self.n_acquisition_new_configs, device=self.device), + configs=uniform.sample( + n=self.n_acquisition_new_configs, + to=self._ftpfn_encoder.domains, + seed=None, # TODO + device=self.device, + ), ) # Construct all our samples for acqusition: @@ -284,15 +300,12 @@ def ask( # Now get the PI of these samples lu = 10 ** rng.uniform(-4, -1) f_inc = maximize_best_y * (1 - lu) + n_acq_samples = len(acq_samples) pi_new_samples = ftpfn.get_pi( - train_x=x_train.to(FTPFN_DTYPE), - train_y=maximize_ys.to(FTPFN_DTYPE), - test_x=acq_samples.to(FTPFN_DTYPE), - y_best=torch.full( - size=(len(acq_samples),), - fill_value=f_inc, - dtype=FTPFN_DTYPE, - ), + train_x=x_train, + train_y=maximize_ys, + test_x=acq_samples, + y_best=torch.full(size=(n_acq_samples,), fill_value=f_inc, dtype=FTPFN_DTYPE), ) best_ix = pi_new_samples.argmax() diff --git a/neps/optimizers/multi_fidelity/utils.py b/neps/optimizers/multi_fidelity/utils.py index b9d6e174..0158fbdf 100644 --- a/neps/optimizers/multi_fidelity/utils.py +++ b/neps/optimizers/multi_fidelity/utils.py @@ -4,7 +4,6 @@ from collections.abc import Sequence from copy import deepcopy from typing import TYPE_CHECKING, Any -from typing_extensions import Self import numpy as np import pandas as pd @@ -354,58 +353,6 @@ def get_max_observed_fidelity_level_per_config(self) -> pd.Series: def token_ids(self) -> np.ndarray: return self.df.index.values - @classmethod - def from_trials( - cls, - trials: Mapping[str, Trial], - *, - # TODO: We should store dicts, not the SearchSpace object... 
- # Once done, we can remove this - space: SearchSpace, - on_error: Literal["ignore"] | float = "ignore", - ) -> Self: - observed_configs = cls( - columns=["config", "perf", "learning_curves"], - index_names=["config_id", "budget_id"], - ) - - records: list[dict[str, Any]] = [] - for trial_id, trial in trials.items(): - _config_id, _budget_id = trial_id.split("_") - - if trial.report is None: - loss = np.nan - lc = [np.nan] - elif trial.report.loss is None: - assert trial.report.err is not None - if on_error == "ignore": - return None - - loss = on_error - lc = [on_error] - elif trial.report.loss is not None: - loss = trial.report.loss - assert trial.report.learning_curve is not None - lc = trial.report.learning_curve - - records.append( - { - "config_id": int(_config_id), - "budget_id": int(_budget_id), - # NOTE: Behavoiour around data in this requires that the dataframe stores - # `SearchSpace` objects and not dictionaries - "config": space.from_dict(trial.config), - "perf": loss, - "learning_curves": lc, - } - ) - - observed_configs.df = pd.DataFrame.from_records( - records, - index=["config_id", "budget_id"], - ).sort_index() - return observed_configs - if __name__ == "__main__": # TODO: Either delete these or convert them to tests (karibbov) From 307fc4508291c1fd283254f3085cf19f817bf30b Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 13:24:34 +0200 Subject: [PATCH 46/63] fix: Remove the removed MFPI_Random --- .../acquisition_functions/__init__.py | 7 ------- neps/optimizers/multi_fidelity/mf_bo.py | 1 - 2 files changed, 8 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py index a125997d..03d41f6a 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py @@ -4,7 +4,6 @@ from neps.optimizers.bayesian_optimization.acquisition_functions.ei import ( ComprehensiveExpectedImprovement, ) -from neps.optimizers.bayesian_optimization.acquisition_functions.mf_pi import MFPI_Random from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) @@ -31,11 +30,6 @@ in_fill="posterior", augmented_ei=True, ), - "MFPI-random": partial( - MFPI_Random, - threshold="random", - horizon="random", - ), "UCB": partial( UpperConfidenceBound, maximize=False, @@ -47,5 +41,4 @@ "ComprehensiveExpectedImprovement", "UpperConfidenceBound", "DecayingPriorWeightedAcquisition", - "MFPI_Random", ] diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index 729e3718..e6205d00 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -5,7 +5,6 @@ import torch -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate from neps.optimizers.multi_fidelity.utils import ( MFObservedData, From d162d1f89e7c776283b905c623fec9438f940ce5 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 14:01:49 +0200 Subject: [PATCH 47/63] fix: Use budget_domain bounds where possible --- neps/optimizers/multi_fidelity/ifbo.py | 184 +++++++++++++++---------- 1 file changed, 108 insertions(+), 76 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index e7caa38d..7b74a8d3 100755 --- 
a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,4 +1,4 @@ -from typing import Any, Mapping +from typing import Any, Mapping, Literal import math import numpy as np @@ -7,6 +7,7 @@ from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate from neps.optimizers.intial_design import make_initial_design +from neps.sampling.priors import Prior from neps.sampling.samplers import Sampler from neps.search_spaces.domain import Domain from neps.search_spaces.encoding import CategoricalToUnitNorm, TensorEncoder @@ -19,7 +20,7 @@ FTPFN_DTYPE = torch.float32 -def tokenize( +def _tokenize( ids: torch.Tensor, budgets: torch.Tensor, configs: torch.Tensor, @@ -27,7 +28,7 @@ def tokenize( return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) -def _encode_ftpfn( +def _encode_for_ftpfn( trials: Mapping[str, Trial], encoder: TensorEncoder, space: SearchSpace, @@ -63,7 +64,7 @@ def _encode_ftpfn( dtype=torch.float64, ) train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) - X = tokenize( + X = _tokenize( ids=torch.tensor(ids, device=device), budgets=train_budgets, configs=train_configs ).to(dtype) @@ -106,6 +107,36 @@ def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: return sorted_x[ii] +def _acquire_pfn( + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + ftpfn: FTPFNSurrogate, + y_to_beat: float, + how: Literal["pi", "ei", "ucb", "lcb"], +) -> torch.Tensor: + match how: + case "pi": + y_best = torch.full( + size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE + ) + return ftpfn.get_pi(train_x, train_y, test_x, y_best=y_best) + case "ei": + y_best = torch.full( + size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE + ) + return ftpfn.get_ei(train_x, train_y, test_x, y_best=y_best) + case "ucb": + y_best = torch.full( + size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE + ) + return ftpfn.get_ucb(train_x, train_y, test_x) + case "lcb": + return ftpfn.get_lcb(train_x, train_y, test_x) + case _: + raise ValueError(f"Unknown acquisition function {how}") + + class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" @@ -127,8 +158,7 @@ def __init__( Args: pipeline_space: Space in which to search - use_priors: Allows random samples to be generated from a default - Samples generated from a Gaussian centered around the default value + step_size: The size of the step to take in the fidelity domain. 
sampling_policy: The type of sampling procedure to use promotion_policy: The type of promotion procedure to use sample_default_first: Whether to sample the default configuration first @@ -154,17 +184,19 @@ def __init__( self._min_budget: int | float = pipeline_space.fidelity.lower self._max_budget: int | float = pipeline_space.fidelity.upper self._fidelity_name: str = pipeline_space.fidelity_name + self._initial_design: list[dict[str, Any]] | None = None + + params = {**self.pipeline_space.numerical, **self.pipeline_space.categoricals} + self._prior = Prior.from_parameters(params) if use_priors else None self._ftpfn_encoder: TensorEncoder = TensorEncoder.default( - { - **self.pipeline_space.numerical, - **self.pipeline_space.categoricals, - }, + params, + # FTPFN doesn't support categoricals and we were recomenned to just evenly distribute + # in the unit norm custom_transformers={ cat_name: CategoricalToUnitNorm(choices=cat.choices) for cat_name, cat in self.pipeline_space.categoricals.items() }, ) - self._initial_design: list[dict[str, Any]] | None = None # TODO: We want it to be evenly divided by step size, so we need # to add something to the minimum fidelity to ensure this. @@ -189,7 +221,7 @@ def __init__( # Domain from which we assign an index to each budget # Automatically takes care of rounding - self._budget_index_domain = Domain.indices(bins) + self._budget_ix_domain = Domain.indices(bins) def ask( self, @@ -210,7 +242,7 @@ def ask( space=self.pipeline_space, encoder=self._ftpfn_encoder, sample_default_first=self.sample_default_first, - sampler="sobol", + sampler="sobol" if self._prior is None else self._prior, seed=seed, sample_fidelity="min", sample_size=( @@ -227,17 +259,15 @@ def ask( version=self.surrogate_model_args.get("version", "0.0.1"), device=self.device, ) - x_train, maximize_ys = _encode_ftpfn( + x_train, maximize_ys = _encode_for_ftpfn( trials=trials, encoder=self._ftpfn_encoder, space=self.pipeline_space, budget_domain=self._budget_domain, device=self.device, ) - x_train[:, 1] = x_train[:, 1] + 1 # PFN uses `0` id for test configurations - - # Get the best performance so far - maximize_best_y = maximize_ys.max().item() + # PFN uses `0` id for test configurations, we remove this later + x_train[:, 1] = x_train[:, 1] + 1 # Fantasize the result of pending trials is_pending = maximize_ys.isnan() @@ -247,69 +277,74 @@ def ask( test_x=x_train[is_pending], ) + # We then sample a horizon, minimum one budget index increment and cast + # to the budget domain expected by the ftpfn model rng = np.random.RandomState(seed) - uniform = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) - - # We sample the horizon in terms of step numbers to take - lower_index = self._budget_index_domain.lower - upper_index = self._budget_index_domain.upper - # The plus 1 here is because we want to sample that at least one step - # should be taken. 
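A standalone sketch of the horizon sampling described above, assuming the budget-index domain is simply {0, ..., n_bins - 1} and the FTPFN budget domain is [1/max_budget, 1], with a plain linear rescale standing in for `Domain.cast_one` (all concrete numbers are made up):

import numpy as np

# Hypothetical setup: 20 fidelity steps, max_budget = 20, so the FTPFN budget
# domain is [1/20, 1] and the index domain is {0, ..., 19}.
n_bins, max_budget = 20, 20
idx_lo, idx_hi = 0, n_bins - 1
bud_lo, bud_hi = 1 / max_budget, 1.0

rng = np.random.RandomState(42)

# numpy's randint(low, high) excludes high; the +1 guarantees at least one step.
horizon_index = rng.randint(idx_lo, idx_hi) + 1

# Linear rescale from the index domain to the normalized budget domain
# (a simplification of what Domain.cast_one does in neps).
horizon = bud_lo + (horizon_index - idx_lo) / (idx_hi - idx_lo) * (bud_hi - bud_lo)
print(horizon_index, horizon)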
- horizon_index_increment = rng.randint(lower_index, upper_index) + 1 - - # We then normalize it to FTPFN normalized budget domain + lower_index = self._budget_ix_domain.lower + upper_index = self._budget_ix_domain.upper horizon = self._budget_domain.cast_one( - horizon_index_increment, - frm=self._budget_index_domain, + rng.randint(lower_index, upper_index) + 1, + frm=self._budget_ix_domain, ) - # We give them all the special 0 id, as well as set the budget accordinly - acq_new = tokenize( + # Now we sample some new configurations into the domain expected by the FTPFN + if self._prior is not None: + acq_sampler = self._prior + else: + acq_sampler = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) + + new_acq_configs = acq_sampler.sample( + self.n_acquisition_new_configs, + to=self._ftpfn_encoder.domains, + device=self.device, + seed=None, # TODO + ) + acq_new = _tokenize( ids=torch.zeros(self.n_acquisition_new_configs, device=self.device), - budgets=torch.zeros(self.n_acquisition_new_configs, device=self.device), - configs=uniform.sample( - n=self.n_acquisition_new_configs, - to=self._ftpfn_encoder.domains, - seed=None, # TODO + budgets=torch.full( + size=(self.n_acquisition_new_configs,), + fill_value=self._budget_domain.lower, device=self.device, ), + configs=new_acq_configs, ) # Construct all our samples for acqusition: # 1. Take all non-pending configs - acq_train = x_train[~is_pending].clone().detach() + acq_continue_existing = x_train[~is_pending].clone().detach() - # 2. We only want to include the configuration rows - # that are at their highest budget, - # i.e. don't include config_0_0 and config_0_1 - acq_train = _remove_duplicates(acq_train) + # 2. We only want to include the configuration at their highest + # budget evaluated, i.e. don't include config_0_0 if config_0_1 is highest + acq_continue_existing = _remove_duplicates(acq_continue_existing) - # 3. Sub select all that are at a partial budget i.e. can evaluate further - # Note, it's important to do this after the above - partial_eval_mask = acq_train[:, 1] < 1 - acq_train = acq_train[partial_eval_mask] + # 3. Sub select all that are not fully evaluated + acq_continue_existing = acq_continue_existing[acq_continue_existing[:, 1] < 1] # 4. Add in the new sampled configurations - acq_samples = torch.vstack([acq_train, acq_new]) + acq_samples = torch.vstack([acq_continue_existing, acq_new]) - # 5. Add on the horizon to the budget, and clamping to maximum - # Note that we hold onto the intermediate unclamped budget for later + # 5. Add on the horizon to the budget unclamped_budgets = acq_samples[:, 1] + horizon - acq_samples[:, 1] = torch.clamp(unclamped_budgets, max=1) - # Now get the PI of these samples + # 6. 
Clamp to the maximum of the budget domain + acq_samples[:, 1] = torch.clamp(unclamped_budgets, max=self._budget_domain.upper) + + # Now get the PI of these samples according to MFPI_Random + maximize_best_y = maximize_ys.max().item() lu = 10 ** rng.uniform(-4, -1) f_inc = maximize_best_y * (1 - lu) - n_acq_samples = len(acq_samples) - pi_new_samples = ftpfn.get_pi( + + acq_scores = _acquire_pfn( train_x=x_train, - train_y=maximize_ys, + train_y=maximize_ys[~is_pending], test_x=acq_samples, - y_best=torch.full(size=(n_acq_samples,), fill_value=f_inc, dtype=FTPFN_DTYPE), + ftpfn=ftpfn, + y_to_beat=f_inc, + how="pi", ) - best_ix = pi_new_samples.argmax() # Extract out the row which had the best PI + best_ix = acq_scores.argmax() best_id = int(acq_samples[best_ix, 0].round().item()) best_vector = acq_samples[best_ix, 2:].unsqueeze(0) best_config = self._ftpfn_encoder.unpack(best_vector)[0] @@ -321,26 +356,23 @@ def ask( previous_config_id = None return SampledConfig(config_id, best_config, previous_config_id) - else: - # To calculate the next step to take in fidelity space, we remove the horizon - previous_budget_of_acquired_config = unclamped_budgets[best_ix] - horizon - - # Then we transform this: - # 1. Back to budget_index space - # 2. Increment it by one - # 3. Transform back to fidelity space - budget_ix = self._budget_index_domain.cast_one( - float(previous_budget_of_acquired_config), frm=self._budget_domain - ) - budget_ix += 1 - fid_value = self._fid_domain.cast_one( - budget_ix, frm=self._budget_index_domain - ) + # To get to the next fidelity value to provide, + # 1. Get the budget before we added the horizon + budget = float(unclamped_budgets[best_ix] - horizon) - real_best_id = best_id - 1 # NOTE: Remove the +1 we added to all ids - best_config[self._fidelity_name] = fid_value + # 2. Cast to budget index domain + budget_ix = self._budget_ix_domain.cast_one(budget, frm=self._budget_domain) - config_id = f"{real_best_id}_{budget_ix}" - previous_config_id = f"{real_best_id}_{budget_ix - 1}" + # 3. Increment it to the next budget index + budget_ix += 1 - return SampledConfig(config_id, best_config, previous_config_id) + # 4. And finally convert back into the fidelity domain + fid_value = self._fid_domain.cast_one(budget_ix, frm=self._budget_ix_domain) + + real_best_id = best_id - 1 # NOTE: Remove the +1 we added to all ids earlier + best_config[self._fidelity_name] = fid_value + + config_id = f"{real_best_id}_{budget_ix}" + previous_config_id = f"{real_best_id}_{budget_ix - 1}" + + return SampledConfig(config_id, best_config, previous_config_id) From 7e2a048b6b27a9667664c03d6753d6b5811f498f Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 14:24:19 +0200 Subject: [PATCH 48/63] fix: Increment lower bound of fidelity space to make divisble --- neps/optimizers/multi_fidelity/ifbo.py | 99 ++++++++++++++++++-------- 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 7b74a8d3..2d041df1 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -20,6 +20,59 @@ FTPFN_DTYPE = torch.float32 +def _adjust_pipeline_space_to_match_stepsize( + pipeline_space: SearchSpace, + step_size: int | float, +) -> tuple[SearchSpace, int]: + """Adjust the pipeline space to be evenly divisible by the step size. + + This is done by incrementing the lower bound of the fidelity domain to the + that enables this. 
+ + Args: + pipeline_space: The pipeline space to adjust + step_size: The size of the step to take in the fidelity domain. + + Returns: + The adjusted pipeline space and the number of bins it can be divided into + """ + fidelity = pipeline_space.fidelity + fidelity_name = pipeline_space.fidelity_name + assert fidelity_name is not None + assert isinstance(fidelity, FloatParameter | IntegerParameter) + if fidelity.log: + raise NotImplementedError("Log fidelity not yet supported") + + # Can't use mod since it's quite innacurate for floats + # Use the fact that we can always write x = n*k + r + # where k = stepsize and x = (fid_upper - fid_lower) + # x = n*k + r + # n = x // k + # r = x - n*k + x = fidelity.upper - fidelity.lower + n = int(x // step_size) + + if n <= 0: + raise ValueError( + f"Step size ({step_size}) is too large for the fidelity domain {fidelity}." + "Considering lowering this parameter to ifBO." + ) + + r = x - n * step_size + new_lower = fidelity.lower + r + new_fid = fidelity.__class__( + lower=new_lower, + upper=fidelity.upper, + log=fidelity.log, + default=fidelity.default, + default_confidence=fidelity.default_confidence_choice, + ) + return ( + SearchSpace(**{**pipeline_space.hyperparameters, fidelity_name: new_fid}), + n, + ) + + def _tokenize( ids: torch.Tensor, budgets: torch.Tensor, @@ -143,7 +196,7 @@ class IFBO(BaseOptimizer): def __init__( self, pipeline_space: SearchSpace, - step_size: int = 1, + step_size: int | float = 1, use_priors: bool = False, sample_default_first: bool = False, sample_default_at_target: bool = False, @@ -168,25 +221,30 @@ def __init__( device: Device to use for the model """ - assert pipeline_space.fidelity is not None - assert isinstance(pipeline_space.fidelity_name, str) + # TODO: I'm not sure how this might effect tables, whose lowest fidelity + # might be below to possibly increased lower bound. 
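A hypothetical worked example of the x = n*k + r adjustment performed by `_adjust_pipeline_space_to_match_stepsize` (the bounds are invented for illustration): with a fidelity range of [1, 20] and step_size = 3 we get x = 19, n = 6 and r = 1, so the lower bound is raised to 2 and the remaining range of 18 divides into exactly 6 steps of size 3.

lower, upper, step_size = 1, 20, 3           # hypothetical fidelity bounds
x = upper - lower                            # 19
n = int(x // step_size)                      # 6 full steps fit
r = x - n * step_size                        # 1 left over
new_lower = lower + r                        # 2
assert (upper - new_lower) == n * step_size  # 18 == 6 * 3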
+ space, fid_bins = _adjust_pipeline_space_to_match_stepsize( + pipeline_space, step_size + ) + assert space.fidelity is not None + assert isinstance(space.fidelity_name, str) - super().__init__(pipeline_space=pipeline_space) + super().__init__(pipeline_space=space) self.step_size = step_size self.use_priors = use_priors self.sample_default_first = sample_default_first self.sample_default_at_target = sample_default_at_target - self.surrogate_model_args = surrogate_model_args or {} self.device = device - self.n_initial_design: int | None = initial_design_size + self.n_initial_design = initial_design_size self.n_acquisition_new_configs = n_acquisition_new_configs + self.surrogate_model_args = surrogate_model_args or {} - self._min_budget: int | float = pipeline_space.fidelity.lower - self._max_budget: int | float = pipeline_space.fidelity.upper - self._fidelity_name: str = pipeline_space.fidelity_name + self._min_budget: int | float = space.fidelity.lower + self._max_budget: int | float = space.fidelity.upper + self._fidelity_name: str = space.fidelity_name self._initial_design: list[dict[str, Any]] | None = None - params = {**self.pipeline_space.numerical, **self.pipeline_space.categoricals} + params = {**space.numerical, **space.categoricals} self._prior = Prior.from_parameters(params) if use_priors else None self._ftpfn_encoder: TensorEncoder = TensorEncoder.default( params, @@ -194,34 +252,19 @@ def __init__( # in the unit norm custom_transformers={ cat_name: CategoricalToUnitNorm(choices=cat.choices) - for cat_name, cat in self.pipeline_space.categoricals.items() + for cat_name, cat in space.categoricals.items() }, ) - # TODO: We want it to be evenly divided by step size, so we need - # to add something to the minimum fidelity to ensure this. - maybe_bins = math.ceil((self._max_budget - self._min_budget) / self.step_size) + 1 - match pipeline_space.fidelity: - case IntegerParameter(): - assert pipeline_space.fidelity.domain.cardinality is not None - bins = min(maybe_bins, pipeline_space.fidelity.domain.cardinality) - case FloatParameter(): - bins = maybe_bins - case _: - raise NotImplementedError( - f"Fidelity type {type(pipeline_space.fidelity)} not supported" - ) - # Domain of fidelity values, i.e. what is given in the configs that we # give to the user to evaluate at. - self._fid_domain = pipeline_space.fidelity.domain + self._fid_domain = space.fidelity.domain # Domain in which we should pass budgets to ifbo model self._budget_domain = Domain.float(1 / self._max_budget, 1) # Domain from which we assign an index to each budget - # Automatically takes care of rounding - self._budget_ix_domain = Domain.indices(bins) + self._budget_ix_domain = Domain.indices(fid_bins) def ask( self, From 995bf9c7fdfe594303f6863788c25d914080da3f Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 14:36:07 +0200 Subject: [PATCH 49/63] doc(ifbo): Document how encoding works --- neps/optimizers/multi_fidelity/ifbo.py | 44 +++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 2d041df1..c4ac5a0c 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -89,6 +89,43 @@ def _encode_for_ftpfn( device: torch.device | None = None, dtype: torch.dtype = FTPFN_DTYPE, ) -> tuple[torch.Tensor, torch.Tensor]: + """Encode the trials into a format that the FTPFN model can understand. + + !!! 
warning "loss values reported" + + The `ys` are a single dimension but consist of the losses inverted to scores. + As result, we have to assert that the loss values provided in the trials are + in the range [0, 1]. + + !!! note "X layout" + + The layout of the X is: + + ``` + | config_id | budget (normalized from fidelity) | hp_1 | hp_2 | ... | hp_n | + ``` + + Here the `budget` is normalized to the range [0, 1] while the hp parameters + are encoded according to the provided encoder, which should map the parameter + values from the original domain to some domain in [0, 1]. + + !!! warning "Pending and Error trials" + + We currently do not handle error cases, **and they are ignored**. + For trials which do not have a loss reported yet, they are considered pending + and will have `torch.nan` as their score inside the returned y values. + + Args: + trials: The trials to encode + encoder: The encoder to use + space: The search space + budget_domain: The domain to use for the budgets of the FTPFN + device: The device to use + dtype: The dtype to use + + Returns: + The encoded trials and their corresponding **scores** + """ # TODO: Currently we do not handle error cases, we can't use NaN as that # is what we use for trials that have no loss yet, i.e. pending trials. selected = { @@ -110,7 +147,7 @@ def _encode_for_ftpfn( device=device, dtype=torch.float64, ) - ids = ids + ids = ids + 1 # We add one to all ids to make room for the test configurations train_fidelities = torch.tensor( [t.config[space.fidelity_name] for t in selected.values()], device=device, @@ -143,7 +180,7 @@ def _encode_for_ftpfn( return X, maximize_ys -def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: +def _keep_highest_budget_evaluation(x: torch.Tensor) -> torch.Tensor: # Does a lexsort, same as if we sorted by (config_id, budget), where # theyre are sorted according to increasing config_id and then increasing budget. # x[i2] -> sorted by config id and budget @@ -358,7 +395,7 @@ def ask( # 2. We only want to include the configuration at their highest # budget evaluated, i.e. don't include config_0_0 if config_0_1 is highest - acq_continue_existing = _remove_duplicates(acq_continue_existing) + acq_continue_existing = _keep_highest_budget_evaluation(acq_continue_existing) # 3. 
Sub select all that are not fully evaluated acq_continue_existing = acq_continue_existing[acq_continue_existing[:, 1] < 1] @@ -417,5 +454,4 @@ def ask( config_id = f"{real_best_id}_{budget_ix}" previous_config_id = f"{real_best_id}_{budget_ix - 1}" - return SampledConfig(config_id, best_config, previous_config_id) From 9b1d050089b66ad8ed79c2e619065782c1351f4e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 15:19:55 +0200 Subject: [PATCH 50/63] fix(ifbo): Example running again --- .../acquisition_samplers/__init__.py | 2 - neps/optimizers/default_searchers/ifbo.yaml | 14 ++--- neps/optimizers/multi_fidelity/ifbo.py | 53 +++++++++++-------- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/__init__.py index e3b12572..7f53780c 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/__init__.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/__init__.py @@ -1,5 +1,4 @@ from .evolution_sampler import EvolutionSampler -from .freeze_thaw_sampler import FreezeThawSampler from .mutation_sampler import MutationSampler from .random_sampler import RandomSampler @@ -7,5 +6,4 @@ "random": RandomSampler, "mutation": MutationSampler, "evolution": EvolutionSampler, - "freeze-thaw": FreezeThawSampler, } diff --git a/neps/optimizers/default_searchers/ifbo.yaml b/neps/optimizers/default_searchers/ifbo.yaml index 76522922..dda4ee8c 100644 --- a/neps/optimizers/default_searchers/ifbo.yaml +++ b/neps/optimizers/default_searchers/ifbo.yaml @@ -1,9 +1,11 @@ strategy: ifbo -surrogate_model: ftpfn surrogate_model_args: version: "0.0.1" -acquisition: MFPI-random -acquisition_sampler: freeze-thaw -acquisition_sampler_args: - samples_to_draw: 250 -model_policy: PFNSurrogate \ No newline at end of file + target_path: null # Defaults to current_working_directory/.model +step_size: 1 # Step size to use for partial evaluations +use_priors: false # Whether to use priors set through `default` and `default_confidence` +sample_default_first: false # Whether to sample the default configuration first +sample_default_at_target: false # Whether to evaluate the default at the maximum fidelity or not +initial_design_size: "ndim" # How many initial samples to try before using the model +n_acquisition_new_configs: 1_000 # Number samples of new configs to include during acqusition +device: null # Device to load the model on with torch diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index c4ac5a0c..cf5f27f5 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,6 +1,5 @@ from typing import Any, Mapping, Literal -import math import numpy as np import torch @@ -18,6 +17,8 @@ # NOTE: Ifbo was trained using 32 bit FTPFN_DTYPE = torch.float32 +ID_COL = 0 +BUDGET_COL = 1 def _adjust_pipeline_space_to_match_stepsize( @@ -65,6 +66,7 @@ def _adjust_pipeline_space_to_match_stepsize( upper=fidelity.upper, log=fidelity.log, default=fidelity.default, + is_fidelity=True, default_confidence=fidelity.default_confidence_choice, ) return ( @@ -78,7 +80,9 @@ def _tokenize( budgets: torch.Tensor, configs: torch.Tensor, ) -> torch.Tensor: - return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) + return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1).to( + FTPFN_DTYPE + ) def _encode_for_ftpfn( @@ -147,7 +151,6 @@ def 
_encode_for_ftpfn( device=device, dtype=torch.float64, ) - ids = ids + 1 # We add one to all ids to make room for the test configurations train_fidelities = torch.tensor( [t.config[space.fidelity_name] for t in selected.values()], device=device, @@ -155,7 +158,9 @@ def _encode_for_ftpfn( ) train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) X = _tokenize( - ids=torch.tensor(ids, device=device), budgets=train_budgets, configs=train_configs + ids=torch.tensor(ids, device=device), + budgets=train_budgets, + configs=train_configs, ).to(dtype) # TODO: Document that it's on the user to ensure these are already all bounded @@ -184,12 +189,12 @@ def _keep_highest_budget_evaluation(x: torch.Tensor) -> torch.Tensor: # Does a lexsort, same as if we sorted by (config_id, budget), where # theyre are sorted according to increasing config_id and then increasing budget. # x[i2] -> sorted by config id and budget - i1 = torch.argsort(x[:, 1]) - i2 = i1[torch.argsort(x[i1][:, 0], stable=True)] + i1 = torch.argsort(x[:, BUDGET_COL]) + i2 = i1[torch.argsort(x[i1][:, ID_COL], stable=True)] sorted_x = x[i2] # Now that it's sorted, we essentially want to count the occurence of each id into counts - _, counts = torch.unique_consecutive(sorted_x[:, 0], return_counts=True) + _, counts = torch.unique_consecutive(sorted_x[:, ID_COL], return_counts=True) # Now we can use these counts to get to the last occurence of each id # The -1 is because we want to index from 0 but sum starts at 1. @@ -237,12 +242,14 @@ def __init__( use_priors: bool = False, sample_default_first: bool = False, sample_default_at_target: bool = False, - # arguments for model surrogate_model_args: dict | None = None, - initial_design_size: int | None = None, + initial_design_size: int | Literal["ndim"] = "ndim", n_acquisition_new_configs: int = 1_000, device: torch.device | None = None, - **kwargs: Any, # TODO: Remove this + budget: int | float | None = None, # TODO: Remove + loss_value_on_error: float | None = None, # TODO: Remove + cost_value_on_error: float | None = None, # TODO: Remove + ignore_errors: bool = False, # TODO: Remove ): """Initialise. @@ -272,7 +279,7 @@ def __init__( self.sample_default_first = sample_default_first self.sample_default_at_target = sample_default_at_target self.device = device - self.n_initial_design = initial_design_size + self.n_initial_design: int | Literal["ndim"] = initial_design_size self.n_acquisition_new_configs = n_acquisition_new_configs self.surrogate_model_args = surrogate_model_args or {} @@ -325,9 +332,7 @@ def ask( sampler="sobol" if self._prior is None else self._prior, seed=seed, sample_fidelity="min", - sample_size=( - "ndim" if self.n_initial_design is None else self.n_initial_design - ), + sample_size=self.n_initial_design, ) if new_id < len(self._initial_design): @@ -347,7 +352,7 @@ def ask( device=self.device, ) # PFN uses `0` id for test configurations, we remove this later - x_train[:, 1] = x_train[:, 1] + 1 + x_train[:, ID_COL] = x_train[:, ID_COL] + 1 # Fantasize the result of pending trials is_pending = maximize_ys.isnan() @@ -391,23 +396,27 @@ def ask( # Construct all our samples for acqusition: # 1. Take all non-pending configs - acq_continue_existing = x_train[~is_pending].clone().detach() + acq_existing = x_train[~is_pending].clone().detach() # 2. We only want to include the configuration at their highest # budget evaluated, i.e. 
don't include config_0_0 if config_0_1 is highest - acq_continue_existing = _keep_highest_budget_evaluation(acq_continue_existing) + acq_existing = _keep_highest_budget_evaluation(acq_existing) # 3. Sub select all that are not fully evaluated - acq_continue_existing = acq_continue_existing[acq_continue_existing[:, 1] < 1] + acq_existing = acq_existing[ + acq_existing[:, BUDGET_COL] < self._budget_domain.upper + ] # 4. Add in the new sampled configurations - acq_samples = torch.vstack([acq_continue_existing, acq_new]) + acq_samples = torch.vstack([acq_existing, acq_new]) # 5. Add on the horizon to the budget - unclamped_budgets = acq_samples[:, 1] + horizon + unclamped_budgets = acq_samples[:, BUDGET_COL] + horizon # 6. Clamp to the maximum of the budget domain - acq_samples[:, 1] = torch.clamp(unclamped_budgets, max=self._budget_domain.upper) + acq_samples[:, BUDGET_COL] = torch.clamp( + unclamped_budgets, max=self._budget_domain.upper + ) # Now get the PI of these samples according to MFPI_Random maximize_best_y = maximize_ys.max().item() @@ -425,7 +434,7 @@ def ask( # Extract out the row which had the best PI best_ix = acq_scores.argmax() - best_id = int(acq_samples[best_ix, 0].round().item()) + best_id = int(acq_samples[best_ix, ID_COL].round().item()) best_vector = acq_samples[best_ix, 2:].unsqueeze(0) best_config = self._ftpfn_encoder.unpack(best_vector)[0] From da34471614323748a5e32ed22f8e5d38f2a3d4da Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 15:29:19 +0200 Subject: [PATCH 51/63] refactor: Remove PFNSurrogate --- neps/optimizers/multi_fidelity/mf_bo.py | 99 ------------------------- 1 file changed, 99 deletions(-) diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index e6205d00..8cb31ceb 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -15,7 +15,6 @@ calc_total_resources_spent, update_fidelity, ) -from neps.utils.common import instance_from_map class MFBOBase: @@ -182,101 +181,3 @@ def sample_new_config( ignore_fidelity=True, ) return config - - -class PFNSurrogate: - """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" - - def __init__( - self, - pipeline_space: SearchSpace, - surrogate_model: str = "ftpfn", - surrogate_model_args: dict | None = None, - step_size: int = 1, - ): - self.train_x = None - self.train_y = None - self.observed_configs: MFObservedData | None = None - self.pipeline_space = pipeline_space - self.surrogate_model_name = surrogate_model - self.surrogate_model_args = ( - surrogate_model_args if surrogate_model_args is not None else {} - ) - - # TODO: Lift this into the responsility of the caller of this function. 
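Returning to `_keep_highest_budget_evaluation` defined earlier in this patch series: a small self-contained sketch of the lexsort-then-take-last trick it relies on, with made-up (config_id, budget) rows:

import torch

# Rows are (config_id, budget); we want one row per id, at its highest budget.
x = torch.tensor([
    [1.0, 0.25],
    [1.0, 0.50],
    [2.0, 0.25],
    [2.0, 0.75],
    [3.0, 0.25],
])
ID_COL, BUDGET_COL = 0, 1

# Lexsort: sort by budget first, then stably by id, so budgets ascend within each id.
i1 = torch.argsort(x[:, BUDGET_COL])
i2 = i1[torch.argsort(x[i1][:, ID_COL], stable=True)]
sorted_x = x[i2]

# Count consecutive occurrences of each id; the cumulative sum minus one indexes
# the last (highest-budget) row of each id group.
_, counts = torch.unique_consecutive(sorted_x[:, ID_COL], return_counts=True)
keep = torch.cumsum(counts, dim=0) - 1
print(sorted_x[keep])   # rows (1, 0.50), (2, 0.75), (3, 0.25)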
- self.surrogate_model = FTPFNSurrogate(**surrogate_model_args) - self.step_size = step_size - - def update_model(self) -> None: - # tokenize the observations - idxs, steps, configs, performance = get_training_data_for_freeze_thaw( - self.observed_configs.df.loc[self.observed_configs.completed_runs_index], - self.observed_configs.config_col, - self.observed_configs.perf_col, - self.pipeline_space, - step_size=self.step_size, - maximize=True, # inverts performance since NePS minimizes - ) - df_idxs = torch.Tensor(idxs) - df_x = torch.Tensor(get_tokenized_data(configs)) - df_steps = torch.Tensor(steps) - train_x = torch.hstack( - [ - df_idxs.reshape(df_steps.shape[0], 1), - df_steps.reshape(df_steps.shape[0], 1), - df_x, - ] - ) - train_y = torch.Tensor(performance) - - # fit the model, on only completed runs - self._fit(train_x, train_y) - - # fantasize pending evaluations - if self.observed_configs.pending_condition.any(): - # tokenize the pending observations - _idxs, _steps, _configs, _ = get_training_data_for_freeze_thaw( - self.observed_configs.df.loc[self.observed_configs.pending_runs_index], - self.observed_configs.config_col, - self.observed_configs.perf_col, - self.pipeline_space, - step_size=self.step_size, - maximize=True, # inverts performance since NePS minimizes - ) - _df_x = torch.Tensor(get_tokenized_data(_configs)) - _df_idxs = torch.Tensor(_idxs) - _df_steps = torch.Tensor(_steps) - _test_x = torch.hstack( - [ - _df_idxs.reshape(_df_idxs.shape[0], 1), - _df_steps.reshape(_df_steps.shape[0], 1), - _df_x, - ] - ) - _performances = self._predict(_test_x) # returns maximizing metric - # update the training data - train_x = torch.vstack([train_x, _test_x]) - train_y = torch.hstack([train_y, _performances]) - # refit the model, on completed runs + fantasized pending runs - self._fit(train_x, train_y) - - def _fit(self, train_x: torch.Tensor, train_y: torch.Tensor) -> None: - # no training required,, only preprocessing the training data as context during inference - assert self.surrogate_model is not None, "Surrogate model not set!" - self.surrogate_model.train_x = train_x - self.surrogate_model.train_y = train_y - - def _predict(self, test_x: torch.Tensor) -> torch.Tensor: - assert ( - self.surrogate_model.train_x is not None - and self.surrogate_model.train_y is not None - ), "Model not trained yet!" - return self.surrogate_model.get_mean_performance(test_x) - - def set_state( - self, - pipeline_space, - surrogate_model_args, - **kwargs, # pylint: disable=unused-argument - ): - self.pipeline_space = pipeline_space From 268dfb6feb37393f80d32eebd4574824d57191be Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 15:36:40 +0200 Subject: [PATCH 52/63] fix(ifbo): handle all trials which contain model-able info --- neps/optimizers/multi_fidelity/ifbo.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index cf5f27f5..569e0ccf 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -130,18 +130,12 @@ def _encode_for_ftpfn( Returns: The encoded trials and their corresponding **scores** """ - # TODO: Currently we do not handle error cases, we can't use NaN as that - # is what we use for trials that have no loss yet, i.e. pending trials. 
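To make the `| config_id | budget | hp_1 | ... | hp_n |` row layout described in the `_encode_for_ftpfn` docstring concrete, here is a tiny hypothetical illustration, treating budget normalization as simply fidelity / max_budget for brevity:

import torch

# One trial: config id 3, fidelity 10 of a maximum of 100 (-> normalized budget 0.10),
# and two hyperparameters already encoded into [0, 1].
ids = torch.tensor([3.0])
budgets = torch.tensor([10 / 100])
configs = torch.tensor([[0.25, 0.70]])

row = torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1)
print(row)   # tensor([[3.0000, 0.1000, 0.2500, 0.7000]])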
+ # Select all trials which have something we can actually use for modelling + # The absence of a report signifies pending selected = { trial_id: trial for trial_id, trial in trials.items() - if trial.state - not in ( - Trial.State.FAILED, - Trial.State.CRASHED, - Trial.State.UNKNOWN, - Trial.State.CORRUPTED, - ) + if trial.report is None or trial.report.loss is not None } assert space.fidelity_name is not None assert space.fidelity is not None From 673530ea8f08ffdb8935f6999c1c591547f28607 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 16:24:08 +0200 Subject: [PATCH 53/63] refactor: Remove unused --- .../bayesian_optimization/kernels/__init__.py | 25 - .../kernels/grakel_replace/__init__.py | 8 - .../kernels/grakel_replace/edge_histogram.py | 128 --- .../kernels/grakel_replace/utils.py | 58 -- .../grakel_replace/vertex_histogram.py | 447 ---------- .../grakel_replace/weisfeiler_lehman.py | 766 ------------------ .../bayesian_optimization/kernels/utils.py | 155 ---- .../kernels/weisfilerlehman.py | 121 --- .../bayesian_optimization/models/__init__.py | 9 +- .../bayesian_optimization/models/gp.py | 2 +- .../bayesian_optimization/optimizer.py | 32 +- .../bayesian_optimization.yaml | 18 +- neps/optimizers/default_searchers/pibo.yaml | 19 +- neps/optimizers/multi_fidelity/mf_bo.py | 7 - .../multi_fidelity/sampling_policy.py | 165 ---- neps/search_spaces/architecture/api.py | 2 +- neps_examples/basic_usage/architecture.py | 5 +- .../architecture_and_hyperparameters.py | 5 + ...rs_for_architecture_and_hyperparameters.py | 2 +- .../experimental/hierarchical_architecture.py | 5 + tests/test_examples.py | 7 + .../testing_scripts/default_neps.py | 34 - 22 files changed, 46 insertions(+), 1974 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/__init__.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/utils.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py diff --git a/neps/optimizers/bayesian_optimization/kernels/__init__.py b/neps/optimizers/bayesian_optimization/kernels/__init__.py deleted file mode 100644 index 7c7018d0..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from collections.abc import Callable -from functools import partial - -from .vectorial_kernels import HammingKernel, Matern32Kernel, Matern52Kernel, RBFKernel -from .weisfilerlehman import WeisfilerLehman - -StationaryKernelMapping: dict[str, Callable] = { - "m52": Matern52Kernel, - "m32": Matern32Kernel, - "rbf": RBFKernel, - "hm": HammingKernel, -} - -GraphKernelMapping: dict[str, Callable] = { - "wl": partial( - WeisfilerLehman, - h=2, - oa=False, - ), - "vh": partial( - WeisfilerLehman, - h=0, - oa=False, - ), -} diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py deleted file mode 100644 index ac1c60ad..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py +++ 
/dev/null @@ -1,8 +0,0 @@ -from neps.optimizers.bayesian_optimization.kernels.grakel_replace.vertex_histogram import ( - VertexHistogram, -) -from neps.optimizers.bayesian_optimization.kernels.grakel_replace.weisfeiler_lehman import ( - WeisfeilerLehman, -) - -__all__ = ["VertexHistogram", "WeisfeilerLehman"] diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py deleted file mode 100644 index 12a83a19..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py +++ /dev/null @@ -1,128 +0,0 @@ -"""The Edge Histogram kernel as defined in :cite:`sugiyama2015halting`.""" - -from __future__ import annotations - -from collections import Counter -from collections.abc import Iterable -from warnings import warn - -from grakel.graph import Graph -from numpy import zeros -from scipy.sparse import csr_matrix - -from .vertex_histogram import VertexHistogram - - -class EdgeHistogram(VertexHistogram): - """Edge Histogram kernel as found in :cite:`sugiyama2015halting`. - - Parameters - ---------- - sparse : bool, or 'auto', default='auto' - Defines if the data will be stored in a sparse format. - Sparse format is slower, but less memory consuming and in some cases the only solution. - If 'auto', uses a sparse matrix when the number of zeros is more than the half of the matrix size. - In all cases if the dense matrix doesn't fit system memory, I sparse approach will be tried. - - Attributes: - ---------- - None. - - """ - - def parse_input(self, X: Iterable, **kwargs): - """Parse and check the given input for EH kernel. - - Parameters - ---------- - X : iterable - For the input to pass the test, we must have: - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). - - Returns: - ------- - out : np.array, shape=(len(X), n_labels) - A np array for frequency (cols) histograms for all Graphs (rows). 
- - """ - if not isinstance(X, Iterable): - raise TypeError("input must be an iterable\n") - rows, cols, data = [], [], [] - if self._method_calling in [1, 2]: - labels = {} - self._labels = labels - elif self._method_calling == 3: - labels = dict(self._labels) - ni = 0 - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 3]: - if len(x) == 0: - warn("Ignoring empty element on index: " + str(i)) - continue - # Our element is an iterable of at least 2 elements - L = x[2] - elif isinstance(x, Graph): - # get labels in any existing format - L = x.get_labels(purpose="any", label_type="edge") - else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) - - if L is None: - raise ValueError("Invalid graph entry at location " + str(i) + "!") - # construct the data input for the numpy array - for label, frequency in Counter(L.values()).items(): - # for the row that corresponds to that graph - rows.append(ni) - - # and to the value that this label is indexed - col_idx = labels.get(label, None) - if col_idx is None: - # if not indexed, add the new index (the next) - col_idx = len(labels) - labels[label] = col_idx - - # designate the certain column information - cols.append(col_idx) - - # as well as the frequency value to data - data.append(frequency) - ni += 1 - - # Initialise the feature matrix - if self._method_calling in [1, 2]: - if self.sparse == "auto": - self.sparse_ = len(cols) / float(ni * len(labels)) <= 0.5 - else: - self.sparse_ = bool(self.sparse) - - if self.sparse_: - features = csr_matrix( - (data, (rows, cols)), shape=(ni, len(labels)), copy=False - ) - else: - # Initialise the feature matrix - try: - features = zeros(shape=(ni, len(labels))) - features[rows, cols] = data - except MemoryError: - warn("memory-error: switching to sparse") - self.sparse_, features = ( - True, - csr_matrix((data, (rows, cols)), shape=(ni, len(labels)), copy=False), - ) - - if ni == 0: - raise ValueError("parsed input is empty") - return features diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py deleted file mode 100644 index fe8f8d06..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -import torch - - -def calculate_kernel_matrix_as_tensor( - X, Y=None, oa=False, se_kernel=None, normalize=True -) -> torch.Tensor: - """Same as calculate kernel matrix, but in pytorch framework and uses autodiff to compute the gradient of - the kernel function with respect to the feature vector. - - This function is taken out of the class to facilitate derivative computation. - - One difference is that to prevent the un-differentiable point at the min operation if optimal assignment - kernel is used, we replace the hard-min with a soft-min differentiable approximation that uses the x-norm - approximation. - - Parameters - ---------- - X, Y: the feature vectors (X: train, Y: test). When Y is not supplied, the kernel matrix is computed with - respect to itself. - - oa: bool: whether the optimal assignment kernel should be used. - - se_kernel: Defines any successive embedding kernel to be applied over the inner produce of X and Y. 
If none, - a simple - - normalize: bool: Whether to normalize the GP covariance matrix to the range of [0, 1]. Default is True. - - Returns: - ------- - K: pytorch tensor, shape = [n_targets, n_inputs] - dK_dY: pytorch tensor, of the same shape of K. The derivative of the value of the kernel function with - respect to each of the X. If Y is None, the derivative is instead taken at the *training point* (i.e. X). - """ - if Y is None: - K = se_kernel.forward(X, X) if se_kernel is not None else X @ X.t() - if normalize: - K_diag = torch.sqrt(torch.diag(K)) - K_diag_outer = torch.outer(K_diag, K_diag) - return K / K_diag_outer - else: - assert Y.shape[1] == X.shape[1], ( - "got Y shape " + str(Y.shape[1]) + " but X shape " + str(X.shape[1]) - ) - K = se_kernel.forward(X, Y) if se_kernel is not None else Y @ X.t() - if normalize: - Kxx = calculate_kernel_matrix_as_tensor( - X, X, oa=oa, se_kernel=se_kernel, normalize=False - ) - Kyy = calculate_kernel_matrix_as_tensor( - Y, Y, oa=oa, se_kernel=se_kernel, normalize=False - ) - K_diag_outer = torch.outer( - torch.sqrt(torch.diag(Kyy)), torch.sqrt(torch.diag(Kxx)) - ) - return K / K_diag_outer - return K diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py deleted file mode 100644 index a3a31bdf..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py +++ /dev/null @@ -1,447 +0,0 @@ -"""The vertex kernel as defined in :cite:`sugiyama2015halting`.""" - -from __future__ import annotations - -import logging -from collections import Counter -from collections.abc import Iterable -from typing import TYPE_CHECKING -from warnings import warn - -import numpy as np -import torch -from grakel.graph import Graph -from grakel.kernels import Kernel -from numpy import array, einsum, squeeze, zeros -from scipy.sparse import csr_matrix -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted - -if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( - NumericKernel, - ) - - -class VertexHistogram(Kernel): - """Vertex Histogram kernel as found in :cite:`sugiyama2015halting`. - - Parameters - ---------- - sparse : bool, or 'auto', default='auto' - Defines if the data will be stored in a sparse format. - Sparse format is slower, but less memory consuming and in some cases the only solution. - If 'auto', uses a sparse matrix when the number of zeros is more than the half of the matrix size. - In all cases if the dense matrix doesn't fit system memory, I sparse approach will be tried. - - oa: bool: default=True - Defines whether optimal assignment variant of the kernel should be used. - - se_kernel: default=None - The standard vectorial kernel to be used for successive embedding (i.e. after the transformation from graph - to the vector embedding, whether to use an additional kernel to compute the vector similarity. - - se_kernel_params: dict, default=None - Any parameters to be passed to the se_kernel - - mahalanobis_precision: np.array: - If supplied, the Malahanobis distance with the precision matrix as supplied will be computed in the dot - product, instead of the vanilla dot product. - - Attributes: - ---------- - None. 
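The normalization in `calculate_kernel_matrix_as_tensor` above divides the Gram matrix by the outer product of the square roots of its diagonal, so every diagonal entry becomes exactly 1 (and, for the non-negative histogram features used here, every entry lands in [0, 1]). A minimal sketch with made-up feature vectors:

import torch

X = torch.tensor([[1.0, 0.0], [1.0, 1.0], [0.0, 2.0]])  # made-up feature vectors

K = X @ X.t()                          # raw linear-kernel Gram matrix
K_diag = torch.sqrt(torch.diag(K))     # per-row feature norms
K_norm = K / torch.outer(K_diag, K_diag)

# Each entry is now <x_i, x_j> / (|x_i| * |x_j|), so the diagonal is exactly 1.
print(K_norm)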
- - """ - - def __init__( - self, - n_jobs=None, - normalize=False, - sparse="auto", - oa=False, - mahalanobis_precision=None, - se_kernel: NumericKernel | None = None, - requires_ordered_features: bool = False, - as_tensor: bool = True, - ): - """Initialise a vertex histogram kernel. - - require_ordered_features: bool - Whether the ordering of the features in the feature matrix matters. - If True, the features will be parsed in the same order as the WL - node label. - - Note that if called directly (not from Weisfiler Lehman kernel), turning - this option on could break the code, as the label in general is non-int. - - """ - super().__init__(n_jobs=n_jobs, normalize=normalize) - self.as_tensor = as_tensor - if self.as_tensor: - self.sparse = False - else: - self.sparse = sparse - - self.oa = oa - self.se_kernel = se_kernel - self._initialized.update({"sparse": True}) - self.mahalanobis_precision = mahalanobis_precision - self.require_ordered_features = requires_ordered_features - - self._X_diag = None - self.X_tensor = None - self.Y_tensor = None - - self._labels = None - self.sparse_ = None - self._method_calling = None - self._Y = None - self._is_transformed = None - self.X = None - - def initialize(self): - """Initialize all transformer arguments, needing initialization.""" - if not self._initialized["n_jobs"]: - if self.n_jobs is not None: - warn("no implemented parallelization for VertexHistogram") - self._initialized["n_jobs"] = True - if not self._initialized["sparse"]: - if self.sparse not in ["auto", False, True]: - raise TypeError("sparse could be False, True or auto") - self._initialized["sparse"] = True - - def parse_input(self, X, label_start_idx=0, label_end_idx=None): - """Parse and check the given input for VH kernel. - - Parameters - ---------- - X : iterable - For the input to pass the test, we must have: - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). - - - - Returns: - ------- - out : np.array, shape=(len(X), n_labels) - A np.array for frequency (cols) histograms for all Graphs (rows). - - """ - if self.require_ordered_features: - if label_start_idx is None or label_end_idx is None: - raise ValueError( - "When requires_ordered_features flag is True, you must supply the start and end" - "indices of the feature matrix to have consistent feature dimensions!" - ) - assert ( - label_end_idx > label_start_idx - ), "End index must be larger than the start index!" 
- - if not isinstance(X, Iterable): - raise TypeError("input must be an iterable\n") - rows, cols, data = [], [], [] - if self._method_calling in [0, 1, 2]: - labels = {} - self._labels = labels - elif self._method_calling == 3: - labels = dict(self._labels) - ni = 0 - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warn("Ignoring empty element on index: " + str(i)) - continue - # Our element is an iterable of at least 2 elements - L = x[1] - elif isinstance(x, Graph): - # get labels in any existing format - L = x.get_labels(purpose="any") - else: - raise TypeError( - "each element of X must be either a " - "graph object or a list with at least " - "a graph like object and node labels " - "dict \n" - ) - - # construct the data input for the numpy array - for label, frequency in Counter(L.values()).items(): - # for the row that corresponds to that graph - rows.append(ni) - - # and to the value that this label is indexed - if self.require_ordered_features: - try: - col_idx = int(label) - label_start_idx # Offset - except ValueError: - logging.error( - "Failed to convert label to a valid integer. Check whether all labels are" - "numeric, and whether you called this kernel directly instead of from the" - "Weisfiler-Lehman kernel. Falling back to the default unordered feature" - "matrix." - ) - self.require_ordered_features = False - if not self.require_ordered_features: - col_idx = labels.get(label, None) - if col_idx is None: - # if not indexed, add the new index (the next) - col_idx = len(labels) - labels[label] = col_idx - - # designate the certain column information - cols.append(col_idx) - - # as well as the frequency value to data - data.append(frequency) - ni += 1 - - if self.require_ordered_features: - label_length = max(label_end_idx - label_start_idx, *cols) + 1 - else: - label_length = len(labels) - - if self._method_calling in [0, 1, 2]: - if self.sparse == "auto": - self.sparse_ = len(cols) / float(ni * label_length) <= 0.5 - else: - self.sparse_ = bool(self.sparse) - - if self.sparse_: - features = csr_matrix( - (data, (rows, cols)), shape=(ni, label_length), copy=False - ) - else: - # Initialise the feature matrix - try: - features = zeros(shape=(ni, label_length)) - features[rows, cols] = data - - except MemoryError: - warn("memory-error: switching to sparse") - self.sparse_, features = ( - True, - csr_matrix( - (data, (rows, cols)), shape=(ni, label_length), copy=False - ), - ) - - if ni == 0: - raise ValueError("parsed input is empty") - return features - - def _calculate_kernel_matrix(self, Y=None): - """Calculate the kernel matrix given a target_graph and a kernel. - - Each a matrix is calculated between all elements of Y on the rows and - all elements of X on the columns. - - Parameters - ---------- - Y : np.array, default=None - The array between samples and features. - - Returns: - ------- - K : numpy array, shape = [n_targets, n_inputs] - The kernel matrix: a calculation between all pairs of graphs - between targets and inputs. If Y is None targets and inputs - are the taken from self.X. Otherwise Y corresponds to targets - and self.X to inputs. 
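To illustrate what `parse_input` produces and how the optimal-assignment branch of `_calculate_kernel_matrix` consumes it, here is a hypothetical two-graph example: the features are node-label counts, the plain kernel is their dot product, and the OA variant is the histogram intersection sum(min(., .)):

import numpy as np
from collections import Counter

# Hypothetical node labels of two small graphs.
graphs = [
    {0: "A", 1: "A", 2: "B"},
    {0: "A", 1: "B", 2: "B", 3: "C"},
]

# Map each distinct label to a column, then fill per-graph frequency histograms.
histograms = [Counter(g.values()) for g in graphs]
columns = {label: j for j, label in enumerate(sorted({l for h in histograms for l in h}))}

features = np.zeros((len(graphs), len(columns)))
for i, hist in enumerate(histograms):
    for label, freq in hist.items():
        features[i, columns[label]] = freq
# features == [[2, 1, 0],
#              [1, 2, 1]]   with columns A, B, C

linear_k = features @ features.T                      # plain dot-product kernel
oa_k_12 = np.minimum(features[0], features[1]).sum()  # histogram intersection: 1 + 1 + 0 = 2
print(linear_k, oa_k_12)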
- - """ - if Y is None: - if self.oa: - K = np.zeros((self.X.shape[0], self.X.shape[0])) - for i in range(self.X.shape[0]): - for j in range(i, self.X.shape[0]): - K[i, j] = np.sum(np.minimum(self.X[i, :], self.X[j, :])) - K[j, i] = K[i, j] - elif self.se_kernel is not None: - K = self.se_kernel._forward(self.X, self.X) - else: - K = self.X @ self.X.T - elif self.oa: - K = np.zeros((Y.shape[0], self.X.shape[0])) - for i in range(Y.shape[0]): - for j in range(self.X.shape[0]): - K[i, j] = np.sum(np.minimum(self.X[j, :], Y[i, : self.X.shape[1]])) - elif self.se_kernel is not None: - K = self.se_kernel._forward(self.X, Y) - else: - K = Y[:, : self.X.shape[1]] @ self.X.T - - if self.sparse_: - return K.toarray() - return K - - def diagonal(self, use_tensor=False): - """Calculate the kernel matrix diagonal of the fitted data. - - Parameters - ---------- - None. - - Returns: - ------- - X_diag : np.array - The diagonal of the kernel matrix, of the fitted. This consists - of each element calculated with itself. - - use_tensor: bool: - The flag to use whether return tensor instead of numpy array. All other operations are the same - - """ - # Check is fit had been called - check_is_fitted(self, ["X", "sparse_"]) - try: - check_is_fitted(self, ["_X_diag"]) - except NotFittedError: - # Calculate diagonal of X - if use_tensor: - self._X_diag = torch.einsum("ij,ij->i", [self.X_tensor, self.X_tensor]) - elif self.sparse_: - self._X_diag = squeeze(array(self.X.multiply(self.X).sum(axis=1))) - else: - self._X_diag = einsum("ij,ij->i", self.X, self.X) - try: - check_is_fitted(self, ["_Y"]) - if use_tensor: - Y_diag = torch.einsum("ij, ij->i", [self.Y_tensor, self.Y_tensor]) - return self._X_diag, Y_diag - if self.sparse_: - Y_diag = squeeze(array(self._Y.multiply(self._Y).sum(axis=1))) - else: - Y_diag = einsum("ij,ij->i", self._Y, self._Y) - return self._X_diag, Y_diag - except NotFittedError: - return self._X_diag - - def transform(self, X, return_embedding_only=False, **kwargs): - """Calculate the kernel matrix, between given and fitted dataset. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). If None the kernel matrix is calculated upon fit data. - The test samples. - - return_embedding_only: bool - Whether returns the vector embedding of the kernel only (without actually - computing the kernel function). This is used when computing the derivative - of the kernel w.r.t. 
the test points/ - - Returns: - ------- - K : numpy array, shape = [n_targets, n_input_graphs] - corresponding to the kernel matrix, a calculation between - all pairs of graphs between target an features - - """ - self._method_calling = 3 - # Check is fit had been called - check_is_fitted(self, ["X"]) - - # Input validation and parsing - if X is None: - raise ValueError("`transform` input cannot be None") - Y = self.parse_input(X, **kwargs) - if return_embedding_only: - return Y - - self._Y = Y - self._is_transformed = True - - # Transform - calculate kernel matrix - km = self._calculate_kernel_matrix(Y) - # Self transform must appear before the diagonal call on normilization - if self.normalize: - X_diag, Y_diag = self.diagonal() - km /= np.sqrt(np.outer(Y_diag, X_diag)) - if self.as_tensor: - km = torch.tensor(km) - return km - - def fit_transform(self, X, **kwargs): - """Fit and transform, on the same dataset. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). If None the kernel matrix is calculated upon fit data. - The test samples. - - y : None - There is no need of a target in a transformer, yet the pipeline API - requires this parameter. - - Returns: - ------- - K : numpy array, shape = [n_targets, n_input_graphs] - corresponding to the kernel matrix, a calculation between - all pairs of graphs between target an features - - """ - self._method_calling = 2 - self.fit(X, **kwargs) - - # Transform - calculate kernel matrix - km = self._calculate_kernel_matrix() - - self._X_diag = np.diagonal(km) - if self.normalize: - km = km / np.sqrt(np.outer(self._X_diag, self._X_diag)) - if self.as_tensor: - km = torch.tensor(km) - return km - - def fit(self, X, y=None, **kwargs): - """Fit a dataset, for a transformer. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). The train samples. - - y : None - There is no need of a target in a transformer, yet the pipeline API - requires this parameter. - - Returns: - ------- - self : object - Returns self. 
- - """ - self._is_transformed = False - self._method_calling = 1 - - # Parameter initialization - self.initialize() - - # Input validation and parsing - if X is None: - raise ValueError("`fit` input cannot be None") - self.X = self.parse_input(X, **kwargs) - - # Return the transformer - return self diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py deleted file mode 100644 index be35c02a..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ /dev/null @@ -1,766 +0,0 @@ -"""The weisfeiler lehman kernel :cite:`shervashidze2011weisfeiler`.""" - -from __future__ import annotations - -import collections -import collections.abc -import logging -import warnings -from ast import literal_eval -from collections import OrderedDict -from collections.abc import Iterable -from copy import deepcopy - -import numpy as np -import torch -from grakel.graph import Graph -from grakel.kernels import Kernel -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted - -from .vertex_histogram import VertexHistogram - - -class WeisfeilerLehman(Kernel): - """Compute the Weisfeiler Lehman Kernel. - - See :cite:`shervashidze2011weisfeiler`. - - Parameters - ---------- - h : int, default=5 - The number of iterations. - - base_graph_kernel : `grakel.kernel_operators.Kernel` or tuple, default=None - If tuple it must consist of a valid kernel object and a - dictionary of parameters. General parameters concerning - normalization, concurrency, .. will be ignored, and the - ones of given on `__init__` will be passed in case it is needed. - Default `base_graph_kernel` is `VertexHistogram`. - - node_weights: iterable - If not None, the nodes will be assigned different weights according - to this vector. Must be a dictionary with the following format: - {'node_name1': weight1, 'node_name2': weight2 ... } - Must be of the same length as the number of different node attributes - - Attributes: - ---------- - X : dict - Holds a dictionary of fitted subkernel modules for all levels. - - _nx : number - Holds the number of inputs. - - _h : int - Holds the number, of iterations. - - _base_graph_kernel : function - A void function that initializes a base kernel object. - - _inv_labels : dict - An inverse dictionary, used for relabeling on each iteration. - - """ - - _graph_format = "dictionary" - - def __init__( - self, - n_jobs=None, - normalize: bool = False, - h: int = 5, - base_graph_kernel=VertexHistogram, - node_weights=None, - layer_weights=None, - as_tensor: bool = True, - ): - """Initialise a `weisfeiler_lehman` kernel.""" - super().__init__(n_jobs=n_jobs, normalize=normalize) - - self.h = h - self.base_graph_kernel = base_graph_kernel - self._initialized.update( - {"h": False, "base_graph_kernel": False, "layer_weights": False} - ) - self._base_graph_kernel = None - self.weights = None - self.node_weights = node_weights - self.as_tensor = as_tensor - self.layer_weights = layer_weights # The weights of each layer. 
If None, each WL iteration has same weight - self.feature_dims = [ - 0, - ] # Record the dimensions of the vectors of each WL iteration - self._params = None - self._h = None - self._nx = None - self._inv_labels = None - self._inv_label_node_attr = None - self._label_node_attr = None - self._feature_weight = None - self._method_calling = None - self._is_transformed = None - self.X = None - self._X_diag = None - - self.X_fit = {} - self.K_precomputed = {} - self.base_graph_kernel_precomputed = {} - - def initialize(self): - """Initialize all transformer arguments, needing initialization.""" - super().initialize() - if not self._initialized["base_graph_kernel"]: - base_graph_kernel = self.base_graph_kernel - if base_graph_kernel is None: - base_graph_kernel, params = VertexHistogram, {} - # TODO: make sure we're always passing like this - elif type(base_graph_kernel) is type and issubclass( # pylint: disable=C0123 - base_graph_kernel, Kernel - ): - params = {} - else: - try: - base_graph_kernel, params = base_graph_kernel - except Exception as _error: - raise TypeError( - "Base kernel was not formulated in " - "the correct way. " - "Check documentation." - ) from _error - - if not ( - type(base_graph_kernel) is type # pylint: disable=C0123 - and issubclass(base_graph_kernel, Kernel) - ): - raise TypeError( - "The first argument must be a valid " - "grakel.kernel.kernel Object" - ) - if not isinstance(params, dict): - raise ValueError( - "If the second argument of base " - "kernel exists, it must be a diction" - "ary between parameters names and " - "values" - ) - params.pop("normalize", None) - - params["normalize"] = False - params["n_jobs"] = None - self._base_graph_kernel = base_graph_kernel - self._params = params - self._initialized["base_graph_kernel"] = True - - if not self._initialized["h"]: - if not isinstance(self.h, int) or self.h < 0: - raise TypeError( - "'h' must be a non-negative integer. Got h:" + str(self.h) - ) - self._h = self.h + 1 - self._initialized["h"] = True - - if self.layer_weights is None or self.layer_weights.shape[0] != self._h: - self.layer_weights = np.ones((self._h,)) - if self.as_tensor and not isinstance(self.layer_weights, torch.Tensor): - self.layer_weights = torch.tensor(self.layer_weights) - - self._initialized["h"] = True - self._initialized["layer_weights"] = True - - def change_se_kernel(self, se_kernel): - if self._base_graph_kernel is None: - self.initialize() - self._params["se_kernel"] = se_kernel - logging.info("Base kernel changed") - - def parse_input( - self, X: Iterable, return_embedding_only: bool = False, gp_fit: bool = True - ): - """Parse input for weisfeiler lehman. - - Parameters - ---------- - X : iterable - For the input to pass the test, we must have: - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that correspond to the given - graph format). A valid input also consists of graph type objects. - - return_embedding_only: bool - Whether to return the embedding of the graphs only, instead of computing the kernel all - the way to the end. - - gp_fit: bool - If False use precomputed vals for first N values, else compute them and save them - - Returns: - ------- - base_graph_kernel : object - Returns base_graph_kernel. - - if requires_grad is enabled and we call fit_transform or transform, an additional torch tensor - K_grad is returned as well. 
- - """ - if self._method_calling not in [1, 2]: - raise ValueError( - "method call must be called either from fit " + "or fit-transform" - ) - if hasattr(self, "_X_diag"): - # Clean _X_diag value - delattr(self, "_X_diag") - - # skip kernel computation if we have already computed the corresponding kernel - if self._h in self.K_precomputed and self.X_fit[self._h] == X: - K = self.K_precomputed[self._h] - base_graph_kernel = self.base_graph_kernel_precomputed[self._h] - else: - # Input validation and parsing - if not isinstance(X, collections.abc.Iterable): - raise TypeError("input must be an iterable\n") - nx = 0 - Gs_ed, L, distinct_values, extras = {}, {}, set(), {} - for idx, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and (len(x) == 0 or len(x) >= 2): - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(idx)) - continue - if len(x) > 2: - extra = () - if len(x) > 3: - extra = tuple(x[3:]) - x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) - extra = ( - x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, - ), - *extra, - ) - else: - x = Graph(x[0], x[1], {}, graph_format=self._graph_format) - extra = () - - elif isinstance(x, Graph): - x.desired_format(self._graph_format) - el = x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, - ) - extra = () if el is None else (el,) - - else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") - extras[nx] = extra - distinct_values |= set(L[nx].values()) - nx += 1 - if nx == 0: - raise ValueError("parsed input is empty") - - # Save the number of "fitted" graphs. 
- self._nx = nx - WL_labels_inverse = OrderedDict() - - # assign a number to each label - label_count = 0 - for dv in sorted(distinct_values): - WL_labels_inverse[dv] = label_count - label_count += 1 - - # Initalize an inverse dictionary of labels for all iterations - self._inv_labels = ( - OrderedDict() - ) # Inverse dictionary of labels, in term of the *previous layer* - self._inv_labels[0] = deepcopy(WL_labels_inverse) - self.feature_dims.append( - len(WL_labels_inverse) - ) # Update the zeroth iteration feature dim - - self._inv_label_node_attr = ( - OrderedDict() - ) # Inverse dictionary of labels, in term of the *node attribute* - self._label_node_attr = ( - OrderedDict() - ) # Same as above, but with key and value inverted - self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label( - WL_labels_inverse, 0 - ) - - if self.node_weights is not None: - self._feature_weight = OrderedDict() - # Ensure the order is the same - self._feature_weight[0] = self._compute_feature_weight( - self.node_weights, 0, WL_labels_inverse - )[1] - else: - self._feature_weight = None - - def generate_graphs(label_count: int, WL_labels_inverse): - new_graphs = [] - for j in range(self._nx): - new_labels = {} - for k in L[j]: - new_labels[k] = WL_labels_inverse[L[j][k]] - L[j] = new_labels - # add new labels - new_graphs.append((Gs_ed[j], new_labels) + extras[j]) - yield new_graphs - - for i in range(1, self._h): - label_set, WL_labels_inverse, L_temp = set(), {}, {} - for j in range(nx): - # Find unique labels and sort - # them for both graphs - # Keep for each node the temporary - L_temp[j] = {} - for v in Gs_ed[j]: - credential = ( - str(L[j][v]) - + "," - + str(sorted(L[j][n] for n in Gs_ed[j][v])) - ) - L_temp[j][v] = credential - label_set.add(credential) - - label_list = sorted(label_set) - for dv in label_list: - WL_labels_inverse[dv] = label_count - label_count += 1 - - # Recalculate labels - new_graphs = [] - for j in range(nx): - new_labels = {} - for k in L_temp[j]: - new_labels[k] = WL_labels_inverse[L_temp[j][k]] - L[j] = new_labels - # relabel - new_graphs.append((Gs_ed[j], new_labels) + extras[j]) - self._inv_labels[i] = WL_labels_inverse - # Compute the translated inverse node label - ( - self._label_node_attr[i], - self._inv_label_node_attr[i], - ) = self.translate_label( - WL_labels_inverse, i, self._label_node_attr[i - 1] - ) - self.feature_dims.append( - self.feature_dims[-1] + len(self._label_node_attr[i]) - ) - # Compute the feature weight of the current layer - if self.node_weights is not None: - self._feature_weight[i] = self._compute_feature_weight( - self.node_weights, i, self._inv_label_node_attr[i] - )[1] - # assert len(self._feature_weight[i] == len(WL_labels_inverse)) - yield new_graphs - - # Initialise the base graph kernel. 
- base_graph_kernel = {} - - K = [] - for i, g in enumerate(generate_graphs(label_count, WL_labels_inverse)): - param = self._params - if self._feature_weight is not None: - param.update({"mahalanobis_precision": self._feature_weight[i]}) - base_graph_kernel.update({i: self._base_graph_kernel(**param)}) - if return_embedding_only: - K.append( - base_graph_kernel[i].parse_input( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - ) - elif self._method_calling == 1: - base_graph_kernel[i].fit( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - else: - K.append( - self.layer_weights[i] - * base_graph_kernel[i].fit_transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - ) - - if gp_fit: - self.X_fit[self._h] = X - self.K_precomputed[self._h] = K - self.base_graph_kernel_precomputed[self._h] = base_graph_kernel - - if return_embedding_only: - return K - if self._method_calling == 1: - return base_graph_kernel - if self._method_calling == 2: - if self.as_tensor: - K = torch.stack(K, dim=0).sum(dim=0) - return K, base_graph_kernel - return np.sum(K, axis=0), base_graph_kernel - return None - - def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: disable=unused-argument - """Fit and transform, on the same dataset. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). If None the kernel matrix is calculated upon fit data. - The test samples. - - y : Object, default=None - Ignored argument, added for the pipeline. - - Returns: - ------- - K : numpy array, shape = [n_targets, n_input_graphs] - corresponding to the kernel matrix, a calculation between - all pairs of graphs between target an features - - """ - self._method_calling = 2 - self._is_transformed = False - self.initialize() - self.feature_dims = [ - 0, - ] # Flush the feature dimensions - if X is None: - raise ValueError("transform input cannot be None") - km, self.X = self.parse_input(X, gp_fit=gp_fit) - - return km - - def transform(self, X: Iterable, return_embedding_only: bool = True): - """Calculate the kernel matrix, between given and fitted dataset. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). If None the kernel matrix is calculated upon fit data. - The test samples. - - return_embedding_only: bool - Whether to return the embedding of the graphs only, instead of computing the kernel all - the way to the end. 
- - Returns: - ------- - K : numpy array, shape = [n_targets, n_input_graphs] - corresponding to the kernel matrix, a calculation between - all pairs of graphs between target an features - - """ - self._method_calling = 3 - # Check is fit had been called - check_is_fitted(self, ["X", "_nx", "_inv_labels"]) - - # Input validation and parsing - if X is None: - raise ValueError("transform input cannot be None") - if not isinstance(X, collections.abc.Iterable): - raise ValueError("input must be an iterable\n") - nx = 0 - distinct_values = set() - Gs_ed, L = {}, {} - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(i)) - continue - - if len(x) in [2, 3]: - x = Graph(x[0], x[1], {}, self._graph_format) - elif isinstance(x, Graph): - x.desired_format("dictionary") - else: - raise ValueError( - "each element of X must have at " - + "least one and at most 3 elements\n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") - - # Hold all the distinct values - distinct_values |= {v for v in L[nx].values() if v not in self._inv_labels[0]} - nx += 1 - if nx == 0: - raise ValueError("parsed input is empty") - - nl = len(self._inv_labels[0]) - WL_labels_inverse = { - dv: idx for (idx, dv) in enumerate(sorted(distinct_values), nl) - } - WL_labels_inverse = OrderedDict(WL_labels_inverse) - - def generate_graphs_transform(WL_labels_inverse, nl): - # calculate the kernel matrix for the 0 iteration - new_graphs = [] - for j in range(nx): - new_labels = {} - for k, v in L[j].items(): - if v in self._inv_labels[0]: - new_labels[k] = self._inv_labels[0][v] - else: - new_labels[k] = WL_labels_inverse[v] - L[j] = new_labels - # produce the new graphs - new_graphs.append([Gs_ed[j], new_labels]) - yield new_graphs - - for i in range(1, self._h): - new_graphs = [] - L_temp, label_set = {}, set() - nl += len(self._inv_labels[i]) - for j in range(nx): - # Find unique labels and sort them for both graphs - # Keep for each node the temporary - L_temp[j] = {} - for v in Gs_ed[j]: - credential = ( - str(L[j][v]) + "," + str(sorted(L[j][n] for n in Gs_ed[j][v])) - ) - L_temp[j][v] = credential - if credential not in self._inv_labels[i]: - label_set.add(credential) - - # Calculate the new label_set - WL_labels_inverse = {} - if len(label_set) > 0: - for dv in sorted(label_set): - idx = len(WL_labels_inverse) + nl - WL_labels_inverse[dv] = idx - - # Recalculate labels - new_graphs = [] - for j in range(nx): - new_labels = {} - for k, v in L_temp[j].items(): - if v in self._inv_labels[i]: - new_labels[k] = self._inv_labels[i][v] - else: - new_labels[k] = WL_labels_inverse[v] - L[j] = new_labels - # Create the new graphs with the new labels. 
- new_graphs.append([Gs_ed[j], new_labels]) - yield new_graphs - - if return_embedding_only: - K = [] - for i, g in enumerate(generate_graphs_transform(WL_labels_inverse, nl)): - K.append( - self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - return_embedding_only=True, - ) - ) - return K - - # Calculate the kernel matrix without parallelization - if self.as_tensor: - summand = [ - self.layer_weights[i] - * self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - for i, g in enumerate(generate_graphs_transform(WL_labels_inverse, nl)) - ] - K = torch.stack(summand, dim=0).sum(dim=0) - else: - K = np.sum( - ( - self.layer_weights[i] - * self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - for (i, g) in enumerate( - generate_graphs_transform(WL_labels_inverse, nl) - ) - ), - axis=0, - ) - - self._is_transformed = True - if self.normalize: - X_diag, Y_diag = self.diagonal() - if self.as_tensor: - div_ = torch.sqrt(torch.outer(Y_diag, X_diag)) - K /= div_ - else: - old_settings = np.seterr(divide="ignore") - K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag)))) - np.seterr(**old_settings) - - return K - - def diagonal(self): - """Calculate the kernel matrix diagonal for fitted data. - - A funtion called on transform on a seperate dataset to apply - normalization on the exterior. - - Parameters - ---------- - None. - - Returns: - ------- - X_diag : np.array - The diagonal of the kernel matrix, of the fitted data. - This consists of kernel calculation for each element with itself. - - Y_diag : np.array - The diagonal of the kernel matrix, of the transformed data. - This consists of kernel calculation for each element with itself. - - """ - # Check if fit had been called - check_is_fitted(self, ["X"]) - try: - check_is_fitted(self, ["_X_diag"]) - if self._is_transformed: - Y_diag = self.X[0].diagonal()[1] - for i in range(1, self._h): - Y_diag += self.X[i].diagonal()[1] - except NotFittedError: - # Calculate diagonal of X - if self._is_transformed: - X_diag, Y_diag = self.X[0].diagonal() - # X_diag is considered a mutable and should not affect the kernel matrix itself. - X_diag.flags.writeable = True - for i in range(1, self._h): - x, y = self.X[i].diagonal() - X_diag += x - Y_diag += y - self._X_diag = X_diag - - # case sub kernel is only fitted - X_diag = self.X[0].diagonal() - # X_diag is considered a mutable and should not affect the kernel matrix itself. - X_diag.flags.writeable = True - for i in range(1, self._n_iter): - x = self.X[i].diagonal() - X_diag += x - self._X_diag = X_diag - - if self.as_tensor: - self._X_diag = torch.tensor(self._X_diag) - if Y_diag is not None: - Y_diag = torch.tensor(Y_diag) - if self._is_transformed: - return self._X_diag, Y_diag - return self._X_diag - - @staticmethod - def translate_label(curr_layer: dict, h: int, prev_layer: dict | None = None): - """Translate the label to be in terms of the node attributes - curr_layer: the WL_label_inverse object. A dictionary with element of the format of - {pattern: encoding}. 
- - Return: - label_in_node_attr: in terms of {encoding: pattern}, but pattern is always in term of the node attribute - inv_label_in_node_attr: in terms of {pattern: encoding} - - """ - if h == 0: - return {v: str(k) for k, v in curr_layer.items()}, curr_layer - assert prev_layer is not None - label_in_node_attr, inv_label_in_node_attr = OrderedDict(), OrderedDict() - for pattern, encoding in curr_layer.items(): - # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer - root, leaf = literal_eval(pattern) - root_ = prev_layer[root] - leaf_ = [prev_layer[i] for i in leaf] - label_in_node_attr.update({encoding: "~".join([root_, *leaf_])}) - inv_label_in_node_attr.update({"~".join([root_, *leaf_]): encoding}) - return label_in_node_attr, inv_label_in_node_attr - - @staticmethod - def _compute_feature_weight( - node_weight: OrderedDict, h: int, inv_label_node_attr: OrderedDict - ): - """Compute the feature weight, based on the average weight of the constituent node attributes. - - Return: - feature_weights: a dictionary with h layers, each of which is a dictionary of the format of - {tuple1: weight1; tuplr2, weight2 ...} where tuplex is the tuple representation of the learned graph feature. - - feature_weight_flattened: same as above, but in a flattened np format. - """ - feature_weights = OrderedDict() - feature_weights_flattened = [] - if h == 0: - feature_weight = OrderedDict( - {k: (node_weight[k]) ** 2 for k in inv_label_node_attr} - ) - feature_weights_flattened = np.array(list(feature_weight.values())).flatten() - else: - for k, _ in inv_label_node_attr.items(): - # k is the pattern, v is the encoding - k_sep = k.split("~") - average_weight = np.mean([(node_weight[i]) ** 2 for i in k_sep]) - feature_weights.update({k: average_weight}) - feature_weights_flattened.append(average_weight) - feature_weights_flattened = np.array(feature_weights_flattened).flatten() - assert len(feature_weights_flattened) == len(inv_label_node_attr) - return feature_weights, feature_weights_flattened - - def dK_dX(self, X_test: None): - """Do additional forward and backward pass, compute the kernel derivative wrt the testing location. - If no test locations are provided, the derivatives are evaluated at the training points. - - Returns. - ------- - - """ - - -def efit(obj, data): - """Fit an object on data.""" - obj.fit(data) - - -def efit_transform(obj, data): - """Fit-Transform an object on data.""" - return obj.fit_transform(data) - - -def etransform(obj, data): - """Transform an object on data.""" - return obj.transform(data) diff --git a/neps/optimizers/bayesian_optimization/kernels/utils.py b/neps/optimizers/bayesian_optimization/kernels/utils.py deleted file mode 100644 index 6d94a25d..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/utils.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import networkx as nx -import numpy as np - -if TYPE_CHECKING: - from neps.search_spaces.search_space import SearchSpace - - -def transform_to_undirected(gr: list): - """Transform a list of directed graphs by undirected graphs.""" - undirected_gr = [] - for g in gr: - if not isinstance(g, nx.Graph): - continue - if isinstance(g, nx.DiGraph): - undirected_gr.append(g.to_undirected()) - else: - undirected_gr.append(g) - return undirected_gr - - -def extract_configs(configs: list[SearchSpace]) -> tuple[list, list]: - """Extracts graph & HPs from configs objects. 
- - Args: - configs (list): Object holding graph and/or HPs - - Returns: - Tuple[list, list]: list of graphs, list of HPs - """ - config_hps = [conf.get_normalized_hp_categories() for conf in configs] - graphs = [hps["graphs"] for hps in config_hps] - - # Don't call np.array on structured objects - # https://github.com/numpy/numpy/issues/24546#issuecomment-1693913119 - # _nested_graphs = np.array(graphs, dtype=object) - # if _nested_graphs.ndim == 3 - # graphs = _nested_graphs[:, :, 0].reshape(-1).tolist() - # Long hand way of doing the above - # I guess this is just flattening... - if ( - len(graphs) > 0 - and isinstance(graphs[0], list) - and len(graphs[0]) > 0 - and isinstance(graphs[0][0], list) - ): - graphs = [_list for list_of_list in graphs for _list in list_of_list] - - return graphs, config_hps - - -def graph_metrics(graph, metric=None, directed=True): - G = graph if directed else graph.to_undirected() - - # global metrics - if metric == "avg_path_length": - avg_path_length = nx.average_shortest_path_length(G) - metric_score = avg_path_length - - elif metric == "density": - density = nx.density(G) - metric_score = density - - else: - raise NotImplementedError - - return metric_score - - -def extract_configs_hierarchy( - configs: list, d_graph_features: int, hierarchy_consider=None -) -> tuple[list, list]: - """Extracts graph & graph features from configs objects - Args: - configs (list): Object holding graph and/or graph features - d_graph_features (int): Number of global graph features used; if d_graph_features=0, indicate not using global graph features - hierarchy_consider (list or None): Specify graphs at which earlier hierarchical levels to be considered - Returns: - Tuple[list, list]: list of graphs, list of HPs. - """ - N = len(configs) - - config_hps = [conf.get_normalized_hp_categories() for conf in configs] - combined_graphs = [hps["graphs"] for hps in config_hps] - if N > 0 and hierarchy_consider is not None and combined_graphs[0]: - # graphs = list( - # map( - # list, - # zip( - # *[ - # [g[0][0]] - # + [g[0][1][hierarchy_id] for hierarchy_id in hierarchy_consider] - # for g in combined_graphs - # ] - # ), - # ) - # ) - graphs = list( - map( - list, - zip( - *[ - [g[0][0]] - + [ - g[0][1][hierarchy_id] - if hierarchy_id in g[0][1] - else g[0][1][max(g[0][1].keys())] - for hierarchy_id in hierarchy_consider - ] - for g in combined_graphs - ], - strict=False, - ), - ) - ) - ### full graph, 0th hierarchy (high-level, smallest), 1st hierarchy, 2nd hierarchy, 3rd hierarchy, ... - ### graph gets bigger of hierarchies - ### list shape: (1+4) x N - - # modify the node attribute labels on earlier hierarchy graphs e.g. - # note the node feature for graph in earlier hierarchical level should be more coarse - # e.g. 
{'op_name': '(Cell diamond (OPS id) (OPS avg_pool) (OPS id) (OPS avg_pool))'} -> {'op_name': 'Cell diamond '} - for hg_list in graphs[1:]: - for G in hg_list: - original_node_labels = nx.get_node_attributes(G, "op_name") - new_node_labels = { - k: v.split("(")[1] - for k, v in original_node_labels.items() - if "(" in v and ")" in v - } - nx.set_node_attributes(G, new_node_labels, name="op_name") - else: - # graphs = [g[0][0] for g in combined_graphs] - graphs = combined_graphs - - if N > 0 and d_graph_features > 0: - # graph_features = [c['metafeature'] for c in configs] - # these feature values are normalised between 0 and 1 - # the two graph features used are 'avg_path_length', 'density' - graph_features = [ - [ - graph_metrics(g[0][0], metric="avg_path_length"), - graph_metrics(g[0][0], metric="density"), - ] - for g in combined_graphs - ] - graph_features_array = np.vstack(graph_features) # shape n_archs x 2 (nx(2+d_hp)) - else: - # if not using global graph features of the final architectures, set them to None - graph_features_array = [None] * N - - return graphs, graph_features_array diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py deleted file mode 100644 index 44e8b8e1..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ /dev/null @@ -1,121 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING -from typing_extensions import Self - -import numpy as np -import numpy.typing as npt -import torch -from torch import nn - -from neps.optimizers.bayesian_optimization.kernels.grakel_replace import ( - VertexHistogram, - WeisfeilerLehman as _WL, -) -from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel - -if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( - NumericKernel, - ) - -GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)]) -GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) - - -def normal_prior(param: torch.Tensor, mean: float, std: float) -> torch.Tensor: - return -0.5 * torch.sum(((param - mean) / std) ** 2) - torch.sum( - torch.log(std * torch.sqrt(2 * torch.tensor(np.pi))) - ) - - -def kernel_hp_prior(params: dict[str, nn.Parameter]) -> torch.Tensor: - return normal_prior(params["layer_weights"], mean=0, std=1) - - -class WeisfilerLehman(Kernel[npt.NDArray[np.object_]]): - """Weisfiler Lehman kernel using grakel functions.""" - - def __init__( - self, - *, - h: int = 0, - se_kernel: NumericKernel | None = None, - layer_weights: torch.Tensor | None = None, - oa: bool = False, - node_label: str = "op_name", - ): - """Initializes the Weisfeiler-Lehman kernel. - - Args: - h: The number of Weisfeiler-Lehman iterations - se_kernel: defines a stationary vector kernel to be used for - successive embedding (i.e. the kernel function on which the - vector embedding inner products are computed). - If None, uses the default linear kernel - layer_weights: The weights for each layer of the Weisfeiler-Lehman kernel. - If None, uses uniform 1s - oa: whether the optimal assignment variant of the Weisfiler-Lehman - kernel should be used - node_label: the node_label defining the key node attribute. - """ - super().__init__(hyperparameter_prior=kernel_hp_prior) - if se_kernel is not None and oa: - raise ValueError( - "Only one or none of se (successive embedding) and oa (optimal assignment) may be true!" 
- ) - - self.h = h - self.se_kernel = se_kernel - self.layer_weights = ( - layer_weights if layer_weights is not None else torch.ones(h + 1) - ) - self.oa = oa - self.node_label = node_label - if node_label != "op_name": - raise NotImplementedError("Only 'op_name' is supported for node_label") - - self.wl_kernel_: _WL | None = None - - def as_optimizable(self) -> Self: - return self.clone_with(layer_weights=nn.Parameter(self.layer_weights)) - - def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: - self.wl_kernel_ = _WL( - h=self.h, - base_graph_kernel=( # type: ignore - VertexHistogram, - { - "sparse": False, - "se_kernel": self.se_kernel, - "oa": self.oa, - "requires_ordered_features": True, - }, - ), - layer_weights=self.layer_weights / self.layer_weights.sum(), - normalize=True, - ) - - K = self.wl_kernel_.fit_transform(iter(gr)) - return torch.as_tensor(K, dtype=torch.float64) - - def transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: - assert self.wl_kernel_ is not None - - K = self.wl_kernel_.transform(iter(gr)) - return torch.as_tensor(K, dtype=torch.float64) - - def forward( - self, - x: npt.NDArray[np.object_], - x2: npt.NDArray[np.object_] | None = None, - ) -> torch.Tensor: - if x2 is None: - K = self.fit_transform(x) - self.wl_kernel_ = None - return K - - self.fit_transform(x) - K = self.transform(x2) - self.wl_kernel_ = None - return K diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 49ac7258..034049a3 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,9 +1,4 @@ from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate +from neps.optimizers.bayesian_optimization.models.gp import make_default_single_obj_gp -# TODO: Need the GP back here -# * What actually uses the GP -SurrogateModelMapping = { - "ftpfn": FTPFNSurrogate, -} - -__all__ = ["FTPFNSurrogate", "SurrogateModelMapping"] +__all__ = ["FTPFNSurrogate", "make_default_single_obj_gp"] diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index b3f5c2b2..6cafb1f7 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -50,7 +50,7 @@ def default_categorical_kernel( ) -def default_single_obj_gp( +def make_default_single_obj_gp( x: TensorPack, y: torch.Tensor, *, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 11eca577..ad167a8a 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -19,7 +19,7 @@ pibo_acquisition, ) from neps.optimizers.bayesian_optimization.models.gp import ( - default_single_obj_gp, + make_default_single_obj_gp, optimize_acq, ) from neps.optimizers.intial_design import make_initial_design @@ -28,10 +28,7 @@ from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: - from neps.search_spaces import ( - SearchSpace, - ) - from neps.search_spaces.domain import Domain + from neps.search_spaces import SearchSpace from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.state import BudgetInfo, Trial @@ -144,7 +141,11 @@ def __init__( # noqa: D417 device: torch.device | None = None, encoder: 
TensorEncoder | None = None, seed: int | None = None, - treat_fidelity_as_hyperparameters: bool = False, + budget: Any | None = None, # TODO: remove + surrogate_model: Any | None = None, # TODO: remove + loss_value_on_error: Any | None = None, # TODO: remove + cost_value_on_error: Any | None = None, # TODO: remove + ignore_errors: Any | None = None, # TODO: remove ): """Initialise the BO loop. @@ -167,9 +168,6 @@ def __init__( # noqa: D417 device: Device to use for the optimization. encoder: Encoder to use for encoding the configurations. If None, it will will use the default encoder. - treat_fidelity_as_hyperparameters: Whether to treat fidelities as - hyperparameters. If left as False, fidelities will be ignored - and configurations will always be sampled at the maximum fidelity. Raises: ValueError: if initial_design_size < 1 @@ -183,12 +181,8 @@ def __init__( # noqa: D417 **pipeline_space.numerical, **pipeline_space.categoricals, } - if treat_fidelity_as_hyperparameters: - params.update(pipeline_space.fidelities) - self.encoder = TensorEncoder.default(params) if encoder is None else encoder self.prior = Prior.from_parameters(params) if use_priors is True else None - self.treat_fidelity_as_hyperparameters = treat_fidelity_as_hyperparameters self.seed = seed self.use_cost = use_cost self.device = device @@ -222,9 +216,7 @@ def ask( sample_size=( "ndim" if self.n_initial_design is None else self.n_initial_design ), - sample_fidelity=( - "max" if not self.treat_fidelity_as_hyperparameters else True - ), + sample_fidelity="max", ) if n_trials_sampled < len(self.initial_design_): @@ -233,10 +225,10 @@ def ask( # Now we actually do the BO loop, start by encoding the data # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... - x_configs: list[dict[str, Any]] = [] + x_configs: list[Mapping[str, Any]] = [] ys: list[float] = [] costs: list[float] = [] - pending: list[dict[str, Any]] = [] + pending: list[Mapping[str, Any]] = [] for trial in trials.values(): if trial.state.pending(): pending.append(trial.config) @@ -260,7 +252,7 @@ def ask( y = _missing_y_strategy(y) # Now fit our model - y_model = default_single_obj_gp( + y_model = make_default_single_obj_gp( x, y, # TODO: We should consider applying some heurisitc to see if this should @@ -318,7 +310,7 @@ def ask( cost = torch.tensor(costs, dtype=torch.float64, device=self.device) cost_z_score = _missing_cost_strategy(cost) - cost_model = default_single_obj_gp( + cost_model = make_default_single_obj_gp( x, cost_z_score, y_transform=ChainedOutcomeTransform( diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml index 9b5a3f37..c3525cc4 100644 --- a/neps/optimizers/default_searchers/bayesian_optimization.yaml +++ b/neps/optimizers/default_searchers/bayesian_optimization.yaml @@ -1,16 +1,6 @@ strategy: bayesian_optimization # Arguments that can be modified by the user -surrogate_model: gp -acquisition: EI # or {"LogEI", "AEI"} -log_prior_weighted: false -acquisition_sampler: mutation # or {"random", "evolution"} -random_interleave_prob: 0.0 -disable_priors: true -sample_default_first: false - -# Other arguments: -# surrogate_model_args: None # type: dict -# optimal_assignment: false # type: bool -# domain_se_kernel: None # type: str -# graph_kernels: None # type: list -# hp_kernels: None # type: list +initial_design_size: null # Defaults to depending on number or hyperparameters +use_cost: false # Whether to factor in cost when selecting new 
configurations +sample_default_first: # Whether to sample the default configuration first +device: null # Device to load the gaussian process model on with torch diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml index 0dc7a7db..36bff8b2 100644 --- a/neps/optimizers/default_searchers/pibo.yaml +++ b/neps/optimizers/default_searchers/pibo.yaml @@ -1,17 +1,6 @@ strategy: pibo # Arguments that can be modified by the user -surrogate_model: gp -acquisition: EI # or {"LogEI", "AEI"} -log_prior_weighted: false -acquisition_sampler: mutation # or {"random", "evolution"} -random_interleave_prob: 0.0 -disable_priors: false -prior_confidence: medium # or {"low", "high"} -sample_default_first: false - -# Other arguments: -# surrogate_model_args: None # type: dict -# optimal_assignment: false # type: bool -# domain_se_kernel: None # type: str -# graph_kernels: None # type: list -# hp_kernels: None # type: list +initial_design_size: null # Defaults to depending on number or hyperparameters +use_cost: false # Whether to factor in cost when selecting new configurations +sample_default_first: # Whether to sample the default configuration first +device: null # Device to load the gaussian process model on with torch diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index 8cb31ceb..2a092da1 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -3,14 +3,7 @@ from copy import deepcopy -import torch -from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate -from neps.optimizers.multi_fidelity.utils import ( - MFObservedData, - get_tokenized_data, - get_training_data_for_freeze_thaw, -) from neps.optimizers.multi_fidelity_prior.utils import ( calc_total_resources_spent, update_fidelity, diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index 9208e4c3..58d75387 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -16,7 +16,6 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.multi_fidelity_prior.utils import ( compute_config_dist, custom_crossover, @@ -397,167 +396,3 @@ def sample( # TODO: can generalize s.t. 
sampler works for all types, currently, # random sampler in NePS does not do what is required here # return self.acquisition_sampler.sample(self.acquisition) - - -class BaseDynamicModelPolicy(SamplingPolicy): - def __init__( - self, - pipeline_space: SearchSpace, - observed_configs: Any = None, - surrogate_model: str | Any = "gp", - domain_se_kernel: str | None = None, - hp_kernels: list | None = None, - graph_kernels: list | None = None, - surrogate_model_args: dict | None = None, - acquisition: str | BaseAcquisition = "EI", - use_priors: bool = False, - log_prior_weighted: bool = False, - acquisition_sampler: str | AcquisitionSampler = "random", - patience: int = 100, - logger=None, - ): - super().__init__(pipeline_space=pipeline_space, logger=logger) - - surrogate_model_args = surrogate_model_args or {} - - graph_kernels, hp_kernels = get_default_kernels( - pipeline_space=pipeline_space, - domain_se_kernel=domain_se_kernel, - graph_kernels=graph_kernels, - hp_kernels=hp_kernels, - optimal_assignment=False, - ) - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = graph_kernels - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - if not surrogate_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - if "vectorial_features" not in surrogate_model_args: - surrogate_model_args["vectorial_features"] = ( - pipeline_space.get_vectorial_dim() - ) - - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) - - self.acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - ) - - if use_priors and pipeline_space.has_prior: - self.acquisition = DecayingPriorWeightedAcquisition( - self.acquisition, log=log_prior_weighted - ) - - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": patience, "pipeline_space": pipeline_space}, - ) - - self.sampling_args: dict = {} - - self.observed_configs = observed_configs - - def _fantasize_pending(self, train_x, train_y, pending_x): - if len(pending_x) == 0: - return train_x, train_y - # fit model on finished evaluations - self.surrogate_model.fit(train_x, train_y) - # hallucinating: predict for the pending evaluations - _y, _ = self.surrogate_model.predict(pending_x) - _y = _y.detach().numpy().tolist() - # appending to training data - train_x.extend(pending_x) - train_y.extend(_y) - return train_x, train_y - - def update_model(self, train_x=None, train_y=None, pending_x=None, decay_t=None): - if train_x is None: - train_x = [] - if train_y is None: - train_y = [] - if pending_x is None: - pending_x = [] - - if decay_t is None: - decay_t = len(train_x) - train_x, train_y = self._fantasize_pending(train_x, train_y, pending_x) - self.surrogate_model.fit(train_x, train_y) - self.acquisition.set_state(self.surrogate_model, decay_t=decay_t) - self.acquisition_sampler.set_state(x=train_x, y=train_y) - - @abstractmethod - def sample(self, *args, **kwargs) -> tuple[int, SearchSpace]: - pass - - -class RandomPromotionDynamicPolicy(BaseDynamicModelPolicy): - def __init__(self, *args, **kwargs): - self.num_train_configs = 0 - - super().__init__(*args, **kwargs) - - def _fantasize_pending(self, *args, **kwargs): - pending_configs = [] - - # Select configs that are neither pending nor resulted in error - completed_configs = 
self.observed_configs.completed_runs.copy(deep=True) - - # Get the config, performance values for the maximum budget runs that are completed - max_budget_samples = completed_configs.sort_index().groupby(level=0).last() - max_budget_configs = max_budget_samples[ - self.observed_configs.config_col - ].to_list() - max_budget_perf = max_budget_samples[self.observed_configs.perf_col].to_list() - - pending_condition = self.observed_configs.pending_condition - if pending_condition.any(): - pending_configs = ( - self.observed_configs.df[pending_condition] - .loc[(), self.observed_configs.config_col] - .unique() - .to_list() - ) - return super()._fantasize_pending( - max_budget_configs, max_budget_perf, pending_configs - ) - - def sample(self, rand_promotion_prob=0.5, seed=777, is_promotion=False, **kwargs): - promoted = False - # np.random.seed(seed) - if np.random.random_sample() < rand_promotion_prob: - config_id = ( - self.observed_configs.df[~self.observed_configs.error_condition] - .sample(1) - .index[0][0] - ) - max_budget_id = self.observed_configs.df.loc[(config_id,)].index[-1] - config = self.observed_configs.df.loc[ - (config_id, max_budget_id), self.observed_configs.config_col - ] - promoted = True - - else: - config_id = len(self.observed_configs.df.index.levels[0]) - config = self.acquisition_sampler.sample(self.acquisition) - - if is_promotion and promoted: - return config_id - if is_promotion: - return None - return config - - # def sample(self, **kwargs): - # return self._sample(is_promotion=False, **kwargs) - # - # def retrieve_promotions(self, **kwargs): - # return self._sample(is_promotion=True, **kwargs) diff --git a/neps/search_spaces/architecture/api.py b/neps/search_spaces/architecture/api.py index de19a9ef..ba73f1ca 100644 --- a/neps/search_spaces/architecture/api.py +++ b/neps/search_spaces/architecture/api.py @@ -1,4 +1,4 @@ - +from __future__ import annotations import inspect from typing import TYPE_CHECKING, Callable diff --git a/neps_examples/basic_usage/architecture.py b/neps_examples/basic_usage/architecture.py index 5d43efe7..cc73029a 100644 --- a/neps_examples/basic_usage/architecture.py +++ b/neps_examples/basic_usage/architecture.py @@ -1,4 +1,7 @@ - +raise NotImplementedError( + "Support for graphs was temporarily removed, if you'd like to use a version" + " of NePS that supports graphs, please use version v0.12.2" +) import logging diff --git a/neps_examples/basic_usage/architecture_and_hyperparameters.py b/neps_examples/basic_usage/architecture_and_hyperparameters.py index e0b63fe4..c83f3eac 100644 --- a/neps_examples/basic_usage/architecture_and_hyperparameters.py +++ b/neps_examples/basic_usage/architecture_and_hyperparameters.py @@ -1,3 +1,8 @@ +raise NotImplementedError( + "Support for graphs was temporarily removed, if you'd like to use a version" + " of NePS that supports graphs, please use version v0.12.2" +) + import logging from torch import nn diff --git a/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py b/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py index 77ce9e9f..5aa3e523 100644 --- a/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py +++ b/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py @@ -130,5 +130,5 @@ def run_pipeline(some_architecture, some_float, some_integer, some_cat): pipeline_space=pipeline_space, root_directory="results/user_priors_with_graphs", max_evaluations_total=15, - log_prior_weighted=True, + 
use_priors=True, ) diff --git a/neps_examples/experimental/hierarchical_architecture.py b/neps_examples/experimental/hierarchical_architecture.py index 55ed9144..20a912d0 100644 --- a/neps_examples/experimental/hierarchical_architecture.py +++ b/neps_examples/experimental/hierarchical_architecture.py @@ -1,3 +1,8 @@ +raise NotImplementedError( + "Support for graphs was temporarily removed, if you'd like to use a version" + " of NePS that supports graphs, please use version v0.12.2" +) + import logging from torch import nn diff --git a/tests/test_examples.py b/tests/test_examples.py index abdd10c5..5575eb4d 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -38,6 +38,13 @@ def test_core_examples(example): # Run hyperparameters example to have something to analyse runpy.run_path(str(core_examples_scripts[0]), run_name="__main__") + if example.name in ( + "architecture.py", + "hierarchical_architecture.py", + "expert_priors_for_architecture_and_hyperparameters.py", + ): + pytest.xfail("Architecture were removed temporarily") + runpy.run_path(str(example), run_name="__main__") diff --git a/tests/test_neps_api/testing_scripts/default_neps.py b/tests/test_neps_api/testing_scripts/default_neps.py index 370c6255..c6e1ac12 100644 --- a/tests/test_neps_api/testing_scripts/default_neps.py +++ b/tests/test_neps_api/testing_scripts/default_neps.py @@ -1,7 +1,6 @@ import logging import neps -from neps.optimizers.bayesian_optimization.kernels import GraphKernelMapping pipeline_space_fidelity_priors = dict( val1=neps.FloatParameter(lower=-10, upper=10, default=1), @@ -46,39 +45,6 @@ def run_pipeline(val1, val2): eta=3, ) -# Case 2: Choosing Bayesian optimization - -early_hierarchies_considered = "0_1_2_3" -hierarchy_considered = [int(hl) for hl in early_hierarchies_considered.split("_")] -graph_kernels = ["wl"] * (len(hierarchy_considered) + 1) -wl_h = [2, 1] + [2] * (len(hierarchy_considered) - 1) -graph_kernels = [ - GraphKernelMapping[kernel]( - h=wl_h[j], - oa=False, - se_kernel=None, - ) - for j, kernel in enumerate(graph_kernels) -] -surrogate_model = -surrogate_model_args = { - "graph_kernels": graph_kernels, - "hp_kernels": [], - "verbose": False, - "hierarchy_consider": hierarchy_considered, - "d_graph_features": 0, - "vectorial_features": None, -} -neps.run( - run_pipeline=run_pipeline, - pipeline_space=pipeline_space_not_fidelity, - root_directory="bo_user_decided", - max_evaluations_total=1, - searcher="bayesian_optimization", - surrogate_model=surrogate_model, - surrogate_model_args=surrogate_model_args, -) - # Testing neps decision tree on deciding the searcher and rejecting the # additional arguments. 
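With the graph-kernel stack removed, the `bayesian_optimization` and `pibo` searchers only expose the options kept in the YAML defaults above (`initial_design_size`, `use_cost`, `sample_default_first`, `device`). Below is a minimal sketch of driving the simplified searcher through `neps.run`, assuming those YAML keys can also be passed as keyword overrides; the exact accepted keyword names, and the toy objective body, are assumptions for illustration and are not confirmed by this patch.

import logging

import neps

pipeline_space = dict(
    val1=neps.FloatParameter(lower=-10, upper=10),
    val2=neps.IntegerParameter(lower=1, upper=5),
)


def run_pipeline(val1, val2):
    # Toy objective for the sketch; any float loss is fine here.
    return val1 * val2


logging.basicConfig(level=logging.INFO)

# Hypothetical overrides mirroring bayesian_optimization.yaml; the real
# keyword names accepted by neps.run may differ from these YAML keys.
neps.run(
    run_pipeline=run_pipeline,
    pipeline_space=pipeline_space,
    root_directory="results/bo_simplified",
    max_evaluations_total=15,
    searcher="bayesian_optimization",
    initial_design_size=None,    # default: derived from the number of hyperparameters
    use_cost=False,              # whether to factor cost into acquisition
    sample_default_first=False,  # whether to evaluate the default configuration first
    device=None,                 # torch device to load the Gaussian process on
)
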
From 178ae68d302187de003b299f89468135d6f8c9e3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 30 Sep 2024 16:15:14 +0200 Subject: [PATCH 54/63] refactor(ifbo): Mostly clean --- .../acquisition_functions/cost_cooling.py | 2 - .../acquisition_functions/pibo.py | 4 +- .../weighted_acquisition.py | 15 +- .../bayesian_optimization/models/ftpfn.py | 272 +++++++++++++++++- .../bayesian_optimization/models/gp.py | 248 ++++++++++++++-- .../bayesian_optimization/optimizer.py | 249 ++++------------ neps/optimizers/intial_design.py | 13 +- neps/optimizers/multi_fidelity/ifbo.py | 260 ++++++++--------- .../multi_fidelity/sampling_policy.py | 129 +++++---- neps/sampling/priors.py | 21 +- neps/sampling/samplers.py | 104 ++++++- neps/search_spaces/domain.py | 54 ++-- neps/search_spaces/encoding.py | 121 +------- 13 files changed, 897 insertions(+), 595 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py index a32baebe..cea2aebd 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py @@ -42,7 +42,6 @@ def cost_cooled_acq( acq_fn: AcquisitionFunction, model: GPyTorchModel, used_budget_percentage: float, - X_pending: torch.Tensor | None = None, ) -> WeightedAcquisition: assert 0 <= used_budget_percentage <= 1 return WeightedAcquisition( @@ -52,5 +51,4 @@ def cost_cooled_acq( cost_model=model, alpha=1 - used_budget_percentage, ), - X_pending=X_pending, ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py index db3120e7..c61d9d56 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py @@ -40,7 +40,7 @@ def apply_pibo_acquisition_weight( prior_exponent: float, ): if acq._log: - weighted_log_probs = prior.log_prob(X, frm=x_domain) * prior_exponent + weighted_log_probs = prior.log_prob(X, frm=x_domain) + prior_exponent return acq_values + weighted_log_probs weighted_probs = prior.prob(X, frm=x_domain).pow(prior_exponent) @@ -52,7 +52,6 @@ def pibo_acquisition( prior: Prior, prior_exponent: float, x_domain: Domain | list[Domain], - X_pending: Tensor | None = None, ) -> WeightedAcquisition: return WeightedAcquisition( acq=acq_fn, @@ -62,5 +61,4 @@ def pibo_acquisition( x_domain=x_domain, prior_exponent=prior_exponent, ), - X_pending=X_pending, ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py index f589298b..fd23d331 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -97,7 +97,6 @@ def __init__( self, acq: A, apply_weight: Callable[[Tensor, Tensor, A], Tensor], - X_pending: Tensor | None = None, ) -> None: """Initialize the weighted acquisition function. @@ -109,15 +108,19 @@ def __init__( Please see the module docstring for more information on the dimensions and how to handle them. - X_pending: `n x d` Tensor with `n` `d`-dim design points that have - been submitted for evaluation but have not yet been evaluated. 
""" super().__init__(model=acq.model) # NOTE: We remove the X_pending from the base acquisition function as we will get # it in our own forward with `@concatenate_pending_points` and pass that forward. - # This avoids possible duplicates - acq.set_X_pending(None) - self.set_X_pending(X_pending) + # This avoids possible duplicates. Also important to explicitly set it to None + # even if it does not exist as otherwise the attribute does not exists -_- + if (X_pending := getattr(acq, "X_pending", None)) is not None: + acq.set_X_pending(None) + self.set_X_pending(X_pending) + else: + acq.set_X_pending(None) + self.set_X_pending(None) + self.apply_weight = apply_weight self.acq = acq self._log = acq._log diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 2041396f..6c6464a6 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -1,11 +1,40 @@ from __future__ import annotations +from collections.abc import Callable, Mapping +from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, Literal import torch from ifbo import FTPFN +from neps.sampling.samplers import Sampler +from neps.search_spaces.domain import Domain +from neps.search_spaces.encoding import CategoricalToUnitNorm, ConfigEncoder +from neps.search_spaces.search_space import SearchSpace +from neps.state.trial import Trial + + +def _keep_highest_budget_evaluation( + x: torch.Tensor, + id_col: int = 0, + budget_col: int = 1, +) -> torch.Tensor: + # Does a lexsort, same as if we sorted by (config_id, budget), where + # theyre are sorted according to increasing config_id and then increasing budget. + # x[i2] -> sorted by config id and budget + i1 = torch.argsort(x[:, budget_col]) + i2 = i1[torch.argsort(x[i1][:, id_col], stable=True)] + sorted_x = x[i2] + + # Now that it's sorted, we essentially want to count the occurence of each id into counts + _, counts = torch.unique_consecutive(sorted_x[:, id_col], return_counts=True) + + # Now we can use these counts to get to the last occurence of each id + # The -1 is because we want to index from 0 but sum starts at 1. + ii = counts.cumsum(0) - 1 + return sorted_x[ii] + def _download_workaround_for_ifbo_issue_10(path: Path | None, version: str) -> Path: # TODO: https://github.com/automl/ifBO/issues/10 @@ -68,6 +97,233 @@ def _cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor: raise ValueError(f"Shape not recognized: {x.shape}") +# NOTE: Ifbo was trained using 32 bit +FTPFN_DTYPE = torch.float32 + + +def encode_trials_for_ftpfn( + trials: Mapping[str, Trial], + space: SearchSpace, + budget_domain: Domain, + encoder: ConfigEncoder, + *, + device: torch.device | None = None, + dtype: torch.dtype = FTPFN_DTYPE, + error_value: float = 0.0, +) -> FTPFNData: + """Encode the trials into a format that the FTPFN model can understand. + + !!! warning "Pending trials" + + For trials which do not have a loss reported yet, they are considered pending + and will have `torch.nan` as their score inside the returned y values. + If using + [`acquire_next_from_ftpfn()`][neps.optimizers.bayesian_optimization.models.ftpfn.acquire_next_from_ftpfn], + the result of these configurations will be fantasized. + + !!! warning "Error values" + + The FTPFN model requires that all loss values lie in the interval [0, 1]. 
+ By default, using the value of `error_value=0.0`, we encode crashed configurations as + having an error value of 0. + + Args: + trials: The trials to encode + encoder: The encoder to use + space: The search space + budget_domain: The domain to use for the budgets of the FTPFN + device: The device to use + dtype: The dtype to use + + Returns: + The encoded trials and their corresponding **scores** + """ + # Select all trials which have something we can actually use for modelling + # The absence of a report signifies pending + selected = {trial_id: trial for trial_id, trial in trials.items()} + assert space.fidelity_name is not None + assert space.fidelity is not None + assert 0 <= error_value <= 1 + train_configs = encoder.encode([t.config for t in selected.values()], device=device) + ids = torch.tensor( + [int(config_id.split("_", maxsplit=1)[0]) for config_id in selected.keys()], + device=device, + dtype=dtype, + ) + # PFN uses `0` id for test configurations + ids = ids + 1 + + train_fidelities = torch.tensor( + [t.config[space.fidelity_name] for t in selected.values()], + device=device, + dtype=dtype, + ) + train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) + + # TODO: Document that it's on the user to ensure these are already all bounded + # We could possibly include some bounded transform to assert this. + minimize_ys = torch.tensor( + [ + torch.nan + if trial.report is None + else (error_value if trial.report.loss is None else trial.report.loss) + for trial in trials.values() + ], + device=device, + dtype=dtype, + ) + if minimize_ys.max() > 1 or minimize_ys.min() < 0: + raise RuntimeError( + "ifBO requires that all loss values reported lie in the interval [0, 1]" + " but recieved loss value outside of that range!" + f"\n{minimize_ys}" + ) + maximize_ys = 1 - minimize_ys + return FTPFNData( + ids=ids, + x=train_configs, + y=maximize_ys, + budgets=train_budgets, + pending_mask=minimize_ys.isnan(), + ) + + +@dataclass +class FTPFNData: + """Dataclass to hold the data for the FTPFN model. + + The layout of the data is as follows: + + * `ids`: The configuration ids. These will have +1 added to them as FTPFN uses `0` + for test configurations, but NePS starts ids at `0`. + * `x`: The encoded configurations, includes everything that was encoded by the encoder + passed to + [`encode_trials_for_ftpfn()`][neps.optimizers.bayesian_optimization.models.ftpfn.encode_trials_for_ftpfn] + * `y`: The scores of the configurations, these are inverted such they are to be maximized, where 1 is the maximum + score obtainable and 0 is the minimum. Any configuration which did not have a loss gets a score of `nan`. + * `budgets`: The budgets of the configurations, normalized to the range [0, 1]. + These are normalized such that the lower bound of the fidelity domain maps to `1/max_fid` + while the upper bound maps to `1`. + * `pending_mask`: A mask to indicate which configurations are pending, i.e. have not been evaluated yet. + If there are no pending configurations, this should be `None`. 
+ """ + + ids: torch.Tensor + x: torch.Tensor + y: torch.Tensor + budgets: torch.Tensor + pending_mask: torch.Tensor | None = None + + +def create_border_configs( + ndims: int, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + max_samples: int = 2**9, +) -> torch.Tensor: + n_samples = 2**ndims + _arange = torch.arange(n_samples, device=device, dtype=torch.int32) + # 2**9 is only 512 samples, so we can afford to exhaustively generate them + # We likely won't have this many hyperparameters anywho + if n_samples <= max_samples: + configs = _arange + else: + # Otherwise, we take a random sample of the 2**n possible border configs + rand_uniq_indices = torch.randperm(n_samples, device=device)[:max_samples] + configs = _arange[rand_uniq_indices] + + # https://stackoverflow.com/a/63546308/5332072 + bit_masks = 2 ** _arange[ndims] + return configs.unsqueeze(1).bitwise_and(bit_masks).ne(0).to(dtype) + + +def acquire_next_from_ftpfn( + *, + ftpfn: FTPFNSurrogate, + data: FTPFNData, + encoder: ConfigEncoder, + budget_domain: Domain, + fidelity_domain: Domain, + seed: int | None = None, + acq_strategy: Callable[ + [torch.Tensor, torch.Tensor, torch.Tensor, FTPFNSurrogate], torch.Tensor + ], + dtype: torch.dtype | None = FTPFN_DTYPE, + extra_acq_samples: torch.Tensor | None = None, +) -> tuple[int | None, int | float | None, dict[str, Any]]: + X = torch.cat([data.ids.unsqueeze(1), data.budgets.unsqueeze(1), data.x], dim=1).to( + dtype + ) + ys = data.y.clone().detach() + + # In-fill pending with predicted performance + if data.pending_mask is not None: + not_pending = ~data.pending_mask + pending_ys = ftpfn.get_mean_performance( + train_x=X[not_pending], + train_y=ys[not_pending], + test_x=X[data.pending_mask], + ) + ys[data.pending_mask] = pending_ys + + # We also need to append existing configurations that are in training data, but bump up their + # budget by one step. + # 1. Exclude all configurations which are currently pending + acq_existing = X + if data.pending_mask is not None: + acq_existing = X[~data.pending_mask] + + # 2. Remove duplicate configurations from x train, keeping only the most recent eval + acq_existing = _keep_highest_budget_evaluation(acq_existing, id_col=0, budget_col=1) + + # 3. Remove configs that have been fully evaluated + acq_existing = acq_existing[acq_existing[:, 1] < budget_domain.upper] + + # 4. Include the extra acquisition samples + if extra_acq_samples is None: + samples = [acq_existing] + else: + _shape = (len(extra_acq_samples), 1) + acq_extra = torch.cat( + [ + torch.zeros(_shape, dtype=dtype, device=ftpfn.device), + torch.full(_shape, budget_domain.lower, dtype=dtype, device=ftpfn.device), + extra_acq_samples, + ], + dim=1, + ) + samples = [acq_existing, acq_extra] + + # 5. Now we can fuse them together + acq_samples = torch.cat(samples, dim=0).to(dtype=dtype) + + # We keep a copy of the original budgets incase they get modified + # so we can return the fidelity of the sample that had the best acquisition score + budgets_prior_to_acq = acq_samples[:, 1].clone().detach() + + # Now we offload acquisition to the caller + acq_scores = acq_strategy(X, ys, acq_samples, ftpfn) + + # Extract out the row which had the best PI + best_ix = acq_scores.argmax() + + best_id = int(acq_samples[best_ix, 0].round().item()) + if best_id == 0: # It was a new acq. 
sample + best_real_id = None + best_fid = None + else: # It was a sample to continue, decrement the 1 added earlier + best_real_id = best_id - 1 + best_fid = fidelity_domain.cast_one( + budgets_prior_to_acq[best_ix].item(), frm=budget_domain + ) + + best_vector = acq_samples[best_ix, 2:].unsqueeze(0) + best_config = encoder.decode(best_vector)[0] + + return best_real_id, best_fid, best_config + + _CACHED_FTPFN_MODEL: dict[tuple[str, str], FTPFN] = {} @@ -122,14 +378,10 @@ def get_pi( train_x: torch.Tensor, train_y: torch.Tensor, test_x: torch.Tensor, - # TODO: just calculate from train_y? - y_best: torch.Tensor, + y_best: torch.Tensor | float, ) -> torch.Tensor: logits = self._get_logits(train_x, train_y, test_x) - return self.ftpfn.model.criterion.pi( - logits.squeeze(), - best_f=(1 - y_best).unsqueeze(1), - ) + return self.ftpfn.model.criterion.pi(logits.squeeze(), best_f=y_best) @torch.no_grad() def get_ei( @@ -137,12 +389,10 @@ def get_ei( train_x: torch.Tensor, train_y: torch.Tensor, test_x: torch.Tensor, - y_best: torch.Tensor, + y_best: torch.Tensor | float, ) -> torch.Tensor: logits = self._get_logits(train_x, train_y, test_x) - return self.ftpfn.model.criterion.ei( - logits.squeeze(), best_f=(1 - y_best).unsqueeze(1) - ) + return self.ftpfn.model.criterion.ei(logits.squeeze(), best_f=y_best) @torch.no_grad() def get_lcb( diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 6cafb1f7..96d6b7e0 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -6,25 +6,31 @@ from collections.abc import Mapping from functools import reduce from typing import TYPE_CHECKING, Any, TypeVar +from dataclasses import dataclass +from botorch.fit import fit_gpytorch_mll +from gpytorch import ExactMarginalLogLikelihood import torch import gpytorch.constraints -from botorch.acquisition.analytic import SingleTaskGP -from botorch.models.gp_regression import ( - get_covar_module_with_dim_scaled_prior, -) +from botorch.models import SingleTaskGP +from botorch.models.gp_regression import Log, get_covar_module_with_dim_scaled_prior from botorch.models.gp_regression_mixed import CategoricalKernel, OutcomeTransform -from botorch.models.transforms.outcome import Standardize +from botorch.models.transforms.outcome import ChainedOutcomeTransform, Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import ScaleKernel from botorch.optim import optimize_acqf, optimize_acqf_mixed from itertools import product -from neps.search_spaces.encoding import ( - CategoricalToIntegerTransformer, - TensorEncoder, - TensorPack, +from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( + cost_cooled_acq, +) +from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( + pibo_acquisition, ) +from neps.sampling.priors import Prior +from neps.search_spaces.encoding import CategoricalToIntegerTransformer, ConfigEncoder +from neps.search_spaces.search_space import SearchSpace +from neps.state.trial import Trial if TYPE_CHECKING: from botorch.acquisition import AcquisitionFunction @@ -35,6 +41,16 @@ T = TypeVar("T") +@dataclass +class GPEncodedData: + """Tensor data of finished configurations.""" + + x: torch.Tensor + y: torch.Tensor + cost: torch.Tensor | None = None + x_pending: torch.Tensor | None = None + + def default_categorical_kernel( N: int, active_dims: tuple[int, ...] 
| None = None, @@ -51,8 +67,9 @@ def default_categorical_kernel( def make_default_single_obj_gp( - x: TensorPack, + x: torch.Tensor, y: torch.Tensor, + encoder: ConfigEncoder, *, y_transform: OutcomeTransform | None = None, ) -> SingleTaskGP: @@ -63,7 +80,6 @@ def make_default_single_obj_gp( if y_transform is None: y_transform = Standardize(m=1) - encoder = x.encoder numerics: list[int] = [] categoricals: list[int] = [] for hp_name, transformer in encoder.transformers.items(): @@ -74,12 +90,12 @@ def make_default_single_obj_gp( # Purely vectorial if len(categoricals) == 0: - return SingleTaskGP(train_X=x.tensor, train_Y=y, outcome_transform=y_transform) + return SingleTaskGP(train_X=x, train_Y=y, outcome_transform=y_transform) # Purely categorical if len(numerics) == 0: return SingleTaskGP( - train_X=x.tensor, + train_X=x, train_Y=y, covar_module=default_categorical_kernel(len(categoricals)), outcome_transform=y_transform, @@ -108,16 +124,13 @@ def make_default_single_obj_gp( kernel = numeric_kernel + cat_kernel return SingleTaskGP( - train_X=x.tensor, - train_Y=y, - covar_module=kernel, - outcome_transform=y_transform, + train_X=x, train_Y=y, covar_module=kernel, outcome_transform=y_transform ) def optimize_acq( acq_fn: AcquisitionFunction, - encoder: TensorEncoder, + encoder: ConfigEncoder, *, n_candidates_required: int = 1, num_restarts: int = 20, @@ -200,3 +213,202 @@ def optimize_acq( fixed_features_list=fixed_cats, **acq_options, ) + + +def encode_trials_for_gp( + trials: Mapping[str, Trial], + space: SearchSpace, + *, + encoder: ConfigEncoder | None = None, + device: torch.device | None = None, +) -> tuple[GPEncodedData, ConfigEncoder]: + train_configs: list[Mapping[str, Any]] = [] + train_losses: list[float] = [] + train_costs: list[float] = [] + pending_configs: list[Mapping[str, Any]] = [] + + if encoder is None: + encoder = ConfigEncoder.default({**space.numerical, **space.categoricals}) + + for trial in trials.values(): + if trial.report is None: + pending_configs.append(trial.config) + continue + + train_configs.append(trial.config) + + loss = trial.report.loss + train_losses.append(torch.nan if loss is None else loss) + + cost = trial.report.cost + train_costs.append(torch.nan if cost is None else cost) + + x_train = encoder.encode(train_configs, device=device) + y_train = torch.tensor(train_losses, dtype=torch.float64, device=device) + cost_train = torch.tensor(train_costs, dtype=torch.float64, device=device) + if len(pending_configs) > 0: + x_pending = encoder.encode(pending_configs, device=device) + else: + x_pending = None + + data = GPEncodedData(x=x_train, y=y_train, cost=cost_train, x_pending=x_pending) + return data, encoder + + +def fit_and_acquire_from_gp( + *, + gp: SingleTaskGP, + x_train: torch.Tensor, + y_train: torch.Tensor, + encoder: ConfigEncoder, + fantasize_pending: torch.Tensor | None = None, + acquisition: AcquisitionFunction, + prior: Prior | None = None, + pibo_exp_term: float | None = None, + cost_gp: SingleTaskGP | None = None, + costs: torch.Tensor | None = None, + cost_percentage_used: float | None = None, + costs_on_log_scale: bool = True, + seed: int | None = None, + n_candidates_required: int | None = None, + num_restarts: int = 20, + n_initial_start_points: int | None = None, + maximum_allowed_categorical_combinations: int = 30, + acq_options: Mapping[str, Any] | None = None, +) -> torch.Tensor: + """Acquire the next configuration to evaluate using a GP. 
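+
+    A rough sketch of the intended flow (illustrative only; it mirrors how
+    `BayesianOptimization.ask()` uses this function and is not a verbatim snippet):
+
+        data, encoder = encode_trials_for_gp(trials, space)
+        gp = make_default_single_obj_gp(x=data.x, y=data.y, encoder=encoder)
+        acq = qLogNoisyExpectedImprovement(
+            model=gp,
+            X_baseline=data.x,
+            X_pending=data.x_pending,
+            # botorch maximizes, while NePS losses are to be minimized
+            objective=LinearMCObjective(weights=torch.tensor([-1.0])),
+            prune_baseline=True,
+        )
+        candidates = fit_and_acquire_from_gp(
+            gp=gp, x_train=data.x, y_train=data.y, encoder=encoder, acquisition=acq
+        )
+        config = encoder.decode(candidates)[0]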
+ + Please see the following for: + + * Making a GP to pass in: + [`make_default_single_obj_gp`][neps.optimizers.bayesian_optimization.models.gp.make_default_single_obj_gp] + * Encoding configurations: + [`encode_trials_for_gp`][neps.optimizers.bayesian_optimization.models.gp.encode_trials_for_gp] + + Args: + gp: The GP model to use. + x_train: The encoded configurations that have already been evaluated. + y_train: The loss of the evaluated configurations. + + !!! note "NaNs" + + Any y values encoded with NaNs will automatically be filled with + the mean loss value. This is to ensure a smoother acquisition + function optimization landscape. While this is a poorer + approximation of the landscape, this does not matter as we are + aiming to ensure that the GP models good areas as being better + than any other area, regardless of whether they are average or garbage. + + encoder: The encoder used for encoding the configurations + fantasize_pending: The pending configurations to fantasize over. Please be aware + that there are more efficient strategies as some acquisition functions can + handle this explicitly. + acquisition: The acquisition function to use. + + A good default is `qLogNoisyExpectedImprovement` which can + handle pending configurations gracefully without fantasization. + + prior: The prior to use over configurations. If this is provided, the + acquisition function will be further weighted using the piBO acquisition. + pibo_exp_term: The exponential term for the piBO acquisition. If `None` is + provided, one will be estimated. + costs: The costs of evaluating the configurations. If this is provided, + then a secondary GP will be used to estimate the cost of a given + configuration and factor it into the weighting during the acquisition of a new + configuration. + cost_percentage_used: The percentage of the budget used so far. This is used to determine + the strength of the cost cooling. Should be between 0 and 1. + Must be provided if costs is provided. + costs_on_log_scale: Whether the costs are on a log scale. + seed: The seed to use. + n_candidates_required: The number of candidates to return. If left + as `None`, only the best candidate will be returned. Otherwise + a list of candidates will be returned. + num_restarts: The number of restarts to use during optimization. + n_initial_start_points: The number of initial start points to use during optimization. + maximum_allowed_categorical_combinations: The maximum number of categorical + combinations to allow. If the number of combinations exceeds this, an error + will be raised. + acq_options: Additional options to pass to the botorch `optimize_acqf` function. + + Returns: + The encoded next configuration(s) to evaluate. Use the encoder you provided + to decode the configuration. + """ + fit_gpytorch_mll(ExactMarginalLogLikelihood(likelihood=gp.likelihood, model=gp)) + + if fantasize_pending is not None: + y_train = torch.cat([y_train, gp.posterior(fantasize_pending).mean], dim=0) + x_train = torch.cat([x_train, fantasize_pending], dim=0) + + if prior: + if pibo_exp_term is None: + raise ValueError( + "If providing a prior, you must provide the `pibo_exp_term`." + ) + + acquisition = pibo_acquisition( + acquisition, + prior=prior, + prior_exponent=pibo_exp_term, + x_domain=encoder.domains, + ) + + if costs is not None: + if cost_percentage_used is None: + raise ValueError( + "If providing costs, you must provide `cost_percentage_used`."
+ ) + + # We simply ignore missing costs when training the cost GP. + missing_costs = torch.isnan(costs) + if missing_costs.all(): + raise ValueError( + "Must have at least some configurations reported with a cost if using costs" + " with a GP." + ) + + if missing_costs.any(): + not_missing_mask = ~missing_costs + x_train_cost = x_train[not_missing_mask] + y_train_cost = costs[not_missing_mask] + else: + x_train_cost = x_train + y_train_cost = costs + + if costs_on_log_scale: + transform = ChainedOutcomeTransform( + log=Log(), + standardize=Standardize(m=1), + ) + else: + transform = Standardize(m=1) + + cost_gp = make_default_single_obj_gp( + x_train_cost, + y_train_cost, + encoder=encoder, + y_transform=transform, + ) + fit_gpytorch_mll( + ExactMarginalLogLikelihood(likelihood=cost_gp.likelihood, model=cost_gp) + ) + acquisition = cost_cooled_acq( + acq_fn=acquisition, + model=cost_gp, + used_budget_percentage=cost_percentage_used, + ) + + _n = n_candidates_required if n_candidates_required is not None else 1 + + candidates, _scores = optimize_acq( + acquisition, + encoder, + n_candidates_required=_n, + num_restarts=num_restarts, + n_intial_start_points=n_initial_start_points, + acq_options=acq_options, + maximum_allowed_categorical_combinations=maximum_allowed_categorical_combinations, + ) + return candidates diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index ad167a8a..27411158 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -2,29 +2,21 @@ import math from collections.abc import Mapping -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any import torch from botorch.acquisition import LinearMCObjective from botorch.acquisition.logei import qLogNoisyExpectedImprovement -from botorch.fit import fit_gpytorch_mll -from botorch.models.transforms.outcome import ChainedOutcomeTransform, Log, Standardize -from gpytorch import ExactMarginalLogLikelihood from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( - cost_cooled_acq, -) -from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( - pibo_acquisition, -) from neps.optimizers.bayesian_optimization.models.gp import ( + fit_and_acquire_from_gp, + encode_trials_for_gp, make_default_single_obj_gp, - optimize_acq, ) from neps.optimizers.intial_design import make_initial_design from neps.sampling import Prior -from neps.search_spaces.encoding import TensorEncoder +from neps.search_spaces.encoding import ConfigEncoder from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: @@ -34,62 +26,6 @@ from neps.state import BudgetInfo, Trial -def _missing_fill_strategy( - y: torch.Tensor, - strategy: Literal["mean", "worst", "3std", "nan"], - *, - lower_is_better: bool, -) -> torch.Tensor: - # Assumes minimization - if y.ndim != 1: - raise ValueError("Only supports single objective optimization for now!") - - match strategy: - case "nan": - return y - case "mean": - return torch.nan_to_num(y, nan=y.mean().item()) - case "worst": - worst = y.min() if lower_is_better else y.max() - return torch.nan_to_num(y, nan=worst.item()) - case "3std": - sign = 1 if lower_is_better else -1 - std = y.std() - return torch.nan_to_num(y, nan=y.mean().item() + sign * 3 * std.item()) - case _: - raise ValueError(f"Unknown strategy: 
{strategy}") - - -def _missing_y_strategy(y: torch.Tensor) -> torch.Tensor: - # TODO: Figure out what to do if there's no reported loss value. - # Some strategies: - # 1. Replace with NaN, in which case GPYtorch ignores it - # * Good if crash is random crash, in which case we do not wish to model - # a performance because of it. - # 2. Replace with worst value seen so far - # * Good if crash is systematic, in which case we wish to model it as - # basically, "don't go here" while remaining in the range of possible - # values for the GP. - # 3. Replace with mean - # * Same as above but keeps the optimization of the GP landscape - # smoother. Good if we have a mix of non-systematic and systematic - # crashed. Likely the safest option as GP will likely be unconfident in - # unsystematic crash cases, especially if it seems like a rare-event. - # Will also unlikely be a candidate region if systematic and we observe - # a few crashes there. However would take longer to learn of systematic - # crash regions. - return _missing_fill_strategy(y, strategy="mean", lower_is_better=True) - - -def _missing_cost_strategy(cost: torch.Tensor) -> torch.Tensor: - # TODO: Figure out what to do if there's no reported cost value - # Likely best to just fill in worst cost seen so far as this crash - # cost us a lot of time and we do not want to waste time on this - # region again. However if the crash was random, we might enter some - # issues. - return _missing_fill_strategy(cost, strategy="3std", lower_is_better=True) - - def _pibo_exp_term( n_sampled_already: int, ndims: int, @@ -120,13 +56,6 @@ def _pibo_exp_term( return math.exp(-n_bo_samples / ndims) -def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float: - if budget_info.max_cost_budget is not None: - return budget_info.used_cost_budget / budget_info.max_cost_budget - - raise ValueError("No cost budget provided!") - - class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" @@ -137,9 +66,10 @@ def __init__( # noqa: D417 initial_design_size: int | None = None, use_priors: bool = False, use_cost: bool = False, + cost_on_log_scale: bool = True, sample_default_first: bool = False, device: torch.device | None = None, - encoder: TensorEncoder | None = None, + encoder: ConfigEncoder | None = None, seed: int | None = None, budget: Any | None = None, # TODO: remove surrogate_model: Any | None = None, # TODO: remove @@ -163,6 +93,7 @@ def __init__( # noqa: D417 If using `cost`, cost must be provided in the reports of the trials. + cost_on_log_scale: Whether to use the log of the cost when using cost. sample_default_first: Whether to sample the default configuration first. seed: Seed to use for the random number generator of samplers. device: Device to use for the optimization. 
@@ -181,14 +112,16 @@ def __init__( # noqa: D417 **pipeline_space.numerical, **pipeline_space.categoricals, } - self.encoder = TensorEncoder.default(params) if encoder is None else encoder + self.encoder = encoder or ConfigEncoder.default(params) self.prior = Prior.from_parameters(params) if use_priors is True else None self.seed = seed self.use_cost = use_cost + self.use_priors = use_priors + self.cost_on_log_scale = cost_on_log_scale self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - self.initial_design_: list[dict[str, Any]] | None = None + self.init_design: list[dict[str, Any]] | None = None def ask( self, @@ -202,13 +135,14 @@ def ask( "Seed is not yet implemented for BayesianOptimization" ) - n_trials_sampled = len(trials) - config_id = str(n_trials_sampled + 1) + n_sampled = len(trials) + config_id = str(n_sampled + 1) + space = self.pipeline_space # If we havn't passed the intial design phase - if self.initial_design_ is None: - self.initial_design_ = make_initial_design( - space=self.pipeline_space, + if self.init_design is None: + self.init_design = make_initial_design( + space=space, encoder=self.encoder, sample_default_first=self.sample_default_first, sampler=self.prior if self.prior is not None else "sobol", @@ -219,124 +153,53 @@ def ask( sample_fidelity="max", ) - if n_trials_sampled < len(self.initial_design_): - config = self.initial_design_[n_trials_sampled] - return SampledConfig(id=config_id, config=config) + if n_sampled < len(self.init_design): + return SampledConfig(id=config_id, config=self.init_design[n_sampled]) - # Now we actually do the BO loop, start by encoding the data - # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... - x_configs: list[Mapping[str, Any]] = [] - ys: list[float] = [] - costs: list[float] = [] - pending: list[Mapping[str, Any]] = [] - for trial in trials.values(): - if trial.state.pending(): - pending.append(trial.config) - else: - assert trial.report is not None - x_configs.append(trial.config) - ys.append( - trial.report.loss if trial.report.loss is not None else torch.nan - ) - if self.use_cost: - cost_z_score = trial.report.cost - costs.append(cost_z_score if cost_z_score is not None else torch.nan) - - x = self.encoder.pack(x_configs, device=self.device) - maybe_x_pending_tensor = None - if len(pending) > 0: - x_pending = self.encoder.pack(pending, device=self.device) - maybe_x_pending_tensor = x_pending.tensor - - y = torch.tensor(ys, dtype=torch.float64, device=self.device) - y = _missing_y_strategy(y) - - # Now fit our model - y_model = make_default_single_obj_gp( - x, - y, - # TODO: We should consider applying some heurisitc to see if this should - # also include a log transform, similar as we do to cost if using `use_cost`. - y_transform=Standardize(m=1), + # Otherwise, we encode trials and setup to fit and acquire from a GP + data, encoder = encode_trials_for_gp( + trials, space, device=self.device, encoder=self.encoder ) - y_likelihood = y_model.likelihood - fit_gpytorch_mll( - ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) - ) - - # NOTE: We use: - # * q - allows accounting for pending points, normally used to get a batch - # of points. - # * log - More numerically stable - # * Noisy - In Deep-Learning, we shouldn't take f.min() incase it was a noise - # spike. This accounts for noise in objective. - # * ExpectedImprovement - Cause ya know, the default. 
- acq = qLogNoisyExpectedImprovement( - y_model, - X_baseline=x.tensor, - X_pending=maybe_x_pending_tensor, - # Unfortunatly, there's no option to indicate that we minimize - # the AcqFunction so we need to do some kind of transformation. - # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 - objective=LinearMCObjective(weights=torch.tensor([-1.0])), - ) + cost_percent = None + if self.use_cost: + if budget_info.max_cost_budget is None: + raise ValueError("Cost budget must be set if using cost") + cost_percent = budget_info.used_cost_budget / budget_info.max_cost_budget # If we should use the prior, weight the acquisition function by # the probability of it being sampled from the prior. + pibo_exp_term = None + prior = None if self.prior: pibo_exp_term = _pibo_exp_term( - n_trials_sampled, - self.encoder.ncols, - len(self.initial_design_), + n_sampled, encoder.ncols, len(self.init_design) ) + # If the exp term is insignificant, skip prior acq. weighting + prior = None if pibo_exp_term < 1e-4 else self.prior + + gp = make_default_single_obj_gp(x=data.x, y=data.y, encoder=encoder) + candidate = fit_and_acquire_from_gp( + gp=gp, + x_train=data.x, + y_train=data.y, + encoder=encoder, + acquisition=qLogNoisyExpectedImprovement( + model=gp, + X_baseline=data.x, + # Unfortunatly, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 + objective=LinearMCObjective(weights=torch.tensor([-1.0])), + X_pending=data.x_pending, + prune_baseline=True, + ), + prior=prior, + pibo_exp_term=pibo_exp_term, + costs=data.cost if self.use_cost else None, + cost_percentage_used=cost_percent, + costs_on_log_scale=self.cost_on_log_scale, + ) - # If the amount of weight derived from the pibo exponent becomes - # insignificant, we don't use it as it as it adds extra computational - # burden and introduces more chance of numerical instability. - significant_lower_bound = 1e-4 - if pibo_exp_term > significant_lower_bound: - acq = pibo_acquisition( - acq, - prior=self.prior, - prior_exponent=pibo_exp_term, - x_domain=self.encoder.domains, - X_pending=maybe_x_pending_tensor, - ) - - # If we should use cost, weight the acquisition function by the cost - # of the configurations. - if self.use_cost: - cost = torch.tensor(costs, dtype=torch.float64, device=self.device) - cost_z_score = _missing_cost_strategy(cost) - - cost_model = make_default_single_obj_gp( - x, - cost_z_score, - y_transform=ChainedOutcomeTransform( - # TODO: Maybe some way for a user to specify their cost - # is on a log scale? - log=Log(), - standardize=Standardize(m=1), - ), - ) - cost_likelihood = cost_model.likelihood - - # Optimize the cost model - fit_gpytorch_mll( - ExactMarginalLogLikelihood(likelihood=cost_likelihood, model=cost_model) - ) - acq = cost_cooled_acq( - acq_fn=acq, - model=cost_model, - used_budget_percentage=_cost_used_budget_percentage(budget_info), - X_pending=maybe_x_pending_tensor, - ) - - # Finally, optimize the acquisition function to get a configuration - candidates, _eis = optimize_acq(acq_fn=acq, encoder=self.encoder, acq_options={}) - - assert len(candidates) == 1, "Expected only one candidate!" 
- config = self.encoder.unpack(candidates)[0] - + config = encoder.decode(candidate)[0] return SampledConfig(id=config_id, config=config) diff --git a/neps/optimizers/intial_design.py b/neps/optimizers/intial_design.py index f2109f00..5993f68b 100644 --- a/neps/optimizers/intial_design.py +++ b/neps/optimizers/intial_design.py @@ -1,25 +1,24 @@ -from collections.abc import Sequence -from dataclasses import dataclass, field +from __future__ import annotations from typing import Literal, Any, Mapping from neps.sampling import Sampler from neps.sampling.priors import Prior -from neps.search_spaces.encoding import TensorEncoder +from neps.search_spaces.encoding import ConfigEncoder from neps.search_spaces.search_space import SearchSpace import torch def make_initial_design( space: SearchSpace, - encoder: TensorEncoder, + encoder: ConfigEncoder, sampler: Literal["sobol", "prior", "uniform"] | Sampler, sample_size: int | Literal["ndim"] | None = "ndim", sample_default_first: bool = True, sample_fidelity: ( Literal["min", "max", True] | int | float | dict[str, int | float] ) = True, - seed: int | None = None, + seed: torch.Generator | None = None, ) -> list[dict[str, Any]]: """Generate the initial design of the optimization process. @@ -50,7 +49,7 @@ def make_initial_design( When specified as a dictionary, the keys should be the names of the fidelity parameters and the values should be the target fidelities. If set to `True`, the configuration will have its fidelity randomly sampled. - seed: The seed to use for the random number generator of samplers. + seed: The seed to use for the random number generation. """ configs: list[dict[str, Any]] = [] @@ -124,7 +123,7 @@ def make_initial_design( seed=seed, ) uniq_x = torch.unique(encoded_configs, dim=0) - sample_configs = encoder.unpack(uniq_x[:sample_size]) + sample_configs = encoder.decode(uniq_x[:sample_size]) configs.extend([{**config, **fids} for config in sample_configs]) return configs diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 569e0ccf..59de10c1 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,15 +1,21 @@ +from functools import partial from typing import Any, Mapping, Literal import numpy as np import torch +import warnings from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate +from neps.optimizers.bayesian_optimization.models.ftpfn import ( + FTPFNSurrogate, + acquire_next_from_ftpfn, + encode_trials_for_ftpfn, +) from neps.optimizers.intial_design import make_initial_design from neps.sampling.priors import Prior from neps.sampling.samplers import Sampler from neps.search_spaces.domain import Domain -from neps.search_spaces.encoding import CategoricalToUnitNorm, TensorEncoder +from neps.search_spaces.encoding import CategoricalToUnitNorm, ConfigEncoder from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace from neps.state.trial import Trial from neps.state.optimizer import BudgetInfo @@ -87,7 +93,7 @@ def _tokenize( def _encode_for_ftpfn( trials: Mapping[str, Trial], - encoder: TensorEncoder, + encoder: ConfigEncoder, space: SearchSpace, budget_domain: Domain, device: torch.device | None = None, @@ -196,36 +202,6 @@ def _keep_highest_budget_evaluation(x: torch.Tensor) -> torch.Tensor: return sorted_x[ii] -def _acquire_pfn( - train_x: torch.Tensor, - train_y: torch.Tensor, - test_x: torch.Tensor, - ftpfn: 
FTPFNSurrogate, - y_to_beat: float, - how: Literal["pi", "ei", "ucb", "lcb"], -) -> torch.Tensor: - match how: - case "pi": - y_best = torch.full( - size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE - ) - return ftpfn.get_pi(train_x, train_y, test_x, y_best=y_best) - case "ei": - y_best = torch.full( - size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE - ) - return ftpfn.get_ei(train_x, train_y, test_x, y_best=y_best) - case "ucb": - y_best = torch.full( - size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE - ) - return ftpfn.get_ucb(train_x, train_y, test_x) - case "lcb": - return ftpfn.get_lcb(train_x, train_y, test_x) - case _: - raise ValueError(f"Unknown acquisition function {how}") - - class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" @@ -284,7 +260,7 @@ def __init__( params = {**space.numerical, **space.categoricals} self._prior = Prior.from_parameters(params) if use_priors else None - self._ftpfn_encoder: TensorEncoder = TensorEncoder.default( + self._config_encoder: ConfigEncoder = ConfigEncoder.default( params, # FTPFN doesn't support categoricals and we were recomenned to just evenly distribute # in the unit norm @@ -293,6 +269,8 @@ def __init__( for cat_name, cat in space.categoricals.items() }, ) + self._border_sampler = Sampler.borders(len(params)) + self._cached_border_configs: torch.Tensor | None = None # Domain of fidelity values, i.e. what is given in the configs that we # give to the user to evaluate at. @@ -309,7 +287,7 @@ def ask( trials: Mapping[str, Trial], budget_info: BudgetInfo, optimizer_state: dict[str, Any], - seed: int | None = None, + seed: torch.Generator | None = None, ) -> SampledConfig: if seed is not None: raise NotImplementedError("Seed is not yet implemented for IFBO") @@ -321,7 +299,7 @@ def ask( if self._initial_design is None: self._initial_design = make_initial_design( space=self.pipeline_space, - encoder=self._ftpfn_encoder, + encoder=self._config_encoder, sample_default_first=self.sample_default_first, sampler="sobol" if self._prior is None else self._prior, seed=seed, @@ -333,128 +311,116 @@ def ask( return SampledConfig(id=f"{new_id}_0", config=self._initial_design[new_id]) # Otherwise, we proceed to surrogate phase - ftpfn = FTPFNSurrogate( - target_path=self.surrogate_model_args.get("target_path", None), - version=self.surrogate_model_args.get("version", "0.0.1"), - device=self.device, - ) - x_train, maximize_ys = _encode_for_ftpfn( + data = encode_trials_for_ftpfn( trials=trials, - encoder=self._ftpfn_encoder, space=self.pipeline_space, + encoder=self._config_encoder, budget_domain=self._budget_domain, device=self.device, ) - # PFN uses `0` id for test configurations, we remove this later - x_train[:, ID_COL] = x_train[:, ID_COL] + 1 - - # Fantasize the result of pending trials - is_pending = maximize_ys.isnan() - maximize_ys[is_pending] = ftpfn.get_mean_performance( - train_x=x_train[~is_pending], - train_y=maximize_ys[~is_pending], - test_x=x_train[is_pending], - ) - # We then sample a horizon, minimum one budget index increment and cast - # to the budget domain expected by the ftpfn model - rng = np.random.RandomState(seed) - lower_index = self._budget_ix_domain.lower - upper_index = self._budget_ix_domain.upper - horizon = self._budget_domain.cast_one( - rng.randint(lower_index, upper_index) + 1, - frm=self._budget_ix_domain, - ) + # TODO: Very little chance mfpi_random is best but for now it's stable + def _mfpi_random( + _X: torch.Tensor, + _y: 
torch.Tensor, + _acq_samples: torch.Tensor, + _ftpfn: FTPFNSurrogate, + how: Literal["pi", "ei"], + ) -> torch.Tensor: + rng = np.random.RandomState(None if seed is None else seed + len(trials)) + _low = self._budget_ix_domain.lower + _high = self._budget_ix_domain.upper + horizon_index = rng.randint(_low, _high) + 1 + horizon = self._budget_domain.cast_one( + horizon_index, frm=self._budget_ix_domain + ) + f_best = _y.max().item() + r = rng.uniform(-4, -1) + threshold = f_best + (10**r) * (1 - f_best) + + # NOTE: If converting f_inc to be seperate per acq sample, you + # need to add an extra batch dimension to y_best, i.e. (n, 1) + # Budget column is between 0 and 1, but we want to add the horizon + BUDGET_COL = 1 + _acq_samples[:, BUDGET_COL] += horizon + _acq_samples[:, BUDGET_COL] = torch.clamp( + _acq_samples[:, BUDGET_COL], max=self._budget_domain.upper + ) - # Now we sample some new configurations into the domain expected by the FTPFN - if self._prior is not None: - acq_sampler = self._prior - else: - acq_sampler = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) + match how: + case "pi": + return _ftpfn.get_pi(_X, _y, _acq_samples, y_best=threshold) + case "ei": + return _ftpfn.get_ei(_X, _y, _acq_samples, y_best=threshold) + case _: + raise ValueError(f"Unknown acquisition strategy: {how=}") + + ndims = self._config_encoder.ncols - new_acq_configs = acq_sampler.sample( + # Sample some configurations at uniform for acq. + uniform_sampler = Sampler.uniform(ndim=ndims) + uniform_configs = uniform_sampler.sample( self.n_acquisition_new_configs, - to=self._ftpfn_encoder.domains, + to=self._config_encoder.domains, + seed=seed, device=self.device, - seed=None, # TODO - ) - acq_new = _tokenize( - ids=torch.zeros(self.n_acquisition_new_configs, device=self.device), - budgets=torch.full( - size=(self.n_acquisition_new_configs,), - fill_value=self._budget_domain.lower, - device=self.device, - ), - configs=new_acq_configs, + dtype=FTPFN_DTYPE, ) - # Construct all our samples for acqusition: - # 1. Take all non-pending configs - acq_existing = x_train[~is_pending].clone().detach() - - # 2. We only want to include the configuration at their highest - # budget evaluated, i.e. don't include config_0_0 if config_0_1 is highest - acq_existing = _keep_highest_budget_evaluation(acq_existing) - - # 3. Sub select all that are not fully evaluated - acq_existing = acq_existing[ - acq_existing[:, BUDGET_COL] < self._budget_domain.upper - ] - - # 4. Add in the new sampled configurations - acq_samples = torch.vstack([acq_existing, acq_new]) - - # 5. Add on the horizon to the budget - unclamped_budgets = acq_samples[:, BUDGET_COL] + horizon - - # 6. Clamp to the maximum of the budget domain - acq_samples[:, BUDGET_COL] = torch.clamp( - unclamped_budgets, max=self._budget_domain.upper - ) + # Also sample some border configurations for acq. 
+ # OPTIM: If we are below the amount possible, there is no randomness and we can cache them + border_sampler = Sampler.borders(ndim=ndims) + N_border = 2**9 # 512, if we go over, we subselect 512 border configs + if N_border <= border_sampler.n_possible: + if self._cached_border_configs is not None: + border_configs = self._cached_border_configs + else: + self._cached_border_configs = border_sampler.sample( + n=N_border, + to=self._config_encoder.domains, + seed=seed, + device=self.device, + dtype=FTPFN_DTYPE, + ) + border_configs = self._cached_border_configs + else: + border_configs = border_sampler.sample( + n=N_border, + to=self._config_encoder.domains, + seed=seed, + device=self.device, + dtype=FTPFN_DTYPE, + ) - # Now get the PI of these samples according to MFPI_Random - maximize_best_y = maximize_ys.max().item() - lu = 10 ** rng.uniform(-4, -1) - f_inc = maximize_best_y * (1 - lu) - - acq_scores = _acquire_pfn( - train_x=x_train, - train_y=maximize_ys[~is_pending], - test_x=acq_samples, - ftpfn=ftpfn, - y_to_beat=f_inc, - how="pi", + id, current_fid, config = acquire_next_from_ftpfn( + ftpfn=FTPFNSurrogate( + target_path=self.surrogate_model_args.get("target_path", None), + version=self.surrogate_model_args.get("version", "0.0.1"), + device=self.device, + ), + data=data, + seed=seed, + encoder=self._config_encoder, + budget_domain=self._budget_domain, + fidelity_domain=self._fid_domain, + extra_acq_samples=torch.cat([uniform_configs, border_configs], dim=0), + acq_strategy=partial(_mfpi_random, how="ei"), ) - - # Extract out the row which had the best PI - best_ix = acq_scores.argmax() - best_id = int(acq_samples[best_ix, ID_COL].round().item()) - best_vector = acq_samples[best_ix, 2:].unsqueeze(0) - best_config = self._ftpfn_encoder.unpack(best_vector)[0] - - if best_id == 0: - # A newly sampled configuration was deemed more promising - config_id = f"{new_id}_0" - best_config[self._fidelity_name] = self._min_budget - previous_config_id = None - return SampledConfig(config_id, best_config, previous_config_id) - - # To get to the next fidelity value to provide, - # 1. Get the budget before we added the horizon - budget = float(unclamped_budgets[best_ix] - horizon) - - # 2. Cast to budget index domain - budget_ix = self._budget_ix_domain.cast_one(budget, frm=self._budget_domain) - - # 3. Increment it to the next budget index - budget_ix += 1 - - # 4. 
And finally convert back into the fidelity domain - fid_value = self._fid_domain.cast_one(budget_ix, frm=self._budget_ix_domain) - - real_best_id = best_id - 1 # NOTE: Remove the +1 we added to all ids earlier - best_config[self._fidelity_name] = fid_value - - config_id = f"{real_best_id}_{budget_ix}" - previous_config_id = f"{real_best_id}_{budget_ix - 1}" - return SampledConfig(config_id, best_config, previous_config_id) + if current_fid is None: + assert id is None + config[self._fidelity_name] = self._fid_domain.lower + return SampledConfig(id=f"{new_id}_0", config=config) + else: + current_budget_ix = self._budget_ix_domain.cast_one( + current_fid, frm=self._fid_domain + ) + next_budget_ix = current_budget_ix + 1 + next_fid = self._fid_domain.cast_one( + next_budget_ix, frm=self._budget_ix_domain + ) + config[self._fidelity_name] = next_fid + return SampledConfig( + id=f"{id}_{next_budget_ix}", + config=config, + previous_config_id=f"{id}_{current_budget_ix}", + ) diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index 58d75387..bc35f300 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -3,25 +3,39 @@ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Mapping +from botorch.acquisition import ( + AcquisitionFunction, + LinearMCObjective, + qLogNoisyExpectedImprovement, +) +from botorch.acquisition.analytic import SingleTaskGP +from botorch.fit import fit_gpytorch_mll +from gpytorch import ExactMarginalLogLikelihood import numpy as np import pandas as pd import torch from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping +from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( + pibo_acquisition, +) from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) +from neps.optimizers.bayesian_optimization.models.gp import make_default_single_obj_gp from neps.optimizers.multi_fidelity_prior.utils import ( compute_config_dist, custom_crossover, local_mutation, update_fidelity, ) +from neps.sampling.priors import Prior +from neps.search_spaces.encoding import ConfigEncoder from neps.utils.common import instance_from_map if TYPE_CHECKING: @@ -273,67 +287,68 @@ class ModelPolicy(SamplingPolicy): def __init__( self, pipeline_space: SearchSpace, - surrogate_model: str | Any = "gp", - surrogate_model_args: dict | None = None, - acquisition: str | BaseAcquisition = "EI", - log_prior_weighted: bool = False, - acquisition_sampler: str | AcquisitionSampler = "random", - patience: int = 100, - logger=None, + prior: Prior | None = None, + use_cost: bool = False, + device: torch.device | None = None, ): - super().__init__(pipeline_space=pipeline_space, logger=logger) - - surrogate_model_args = surrogate_model_args or {} - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, + if prior: + raise NotImplementedError("Priors are not implemented yet.") + if use_cost: + raise NotImplementedError("Cost is not implemented yet.") + + super().__init__(pipeline_space=pipeline_space) + self.device = device + self.prior = prior + self._encoder = ConfigEncoder.default( + 
{**pipeline_space.numerical, **pipeline_space.categoricals} ) + self._model: SingleTaskGP | None = None + self._acq: AcquisitionFunction | None = None - self.acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - ) + def update_model( + self, + train_x: list[SearchSpace], + train_y: list[float], + pending_x: list[SearchSpace], + decay_t: float | None = None, + ): + x_train = self._encoder.encode([config.hp_values() for config in train_x]) + x_pending = self._encoder.encode([config.hp_values() for config in pending_x]) + y_train = torch.tensor(train_y, dtype=torch.float64, device=self.device) + + y_model = make_default_single_obj_gp(x_train, y_train, encoder=self._encoder) - # TODO: Enable only when a flag exists to toggle prior-based decaying of AF - # if pipeline_space.has_prior: - # self.acquisition = DecayingPriorWeightedAcquisition( - # self.acquisition, log=log_prior_weighted - # ) - - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": patience, "pipeline_space": pipeline_space}, + fit_gpytorch_mll( + ExactMarginalLogLikelihood(likelihood=y_model.likelihood, model=y_model), + ) + acq = qLogNoisyExpectedImprovement( + y_model, + X_baseline=x_train, + X_pending=x_pending, + # Unfortunately, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 + objective=LinearMCObjective(weights=torch.tensor([-1.0])), ) - self.sampling_args: dict = {} - - def _fantasize_pending(self, train_x, train_y, pending_x): - if len(pending_x) == 0: - return train_x, train_y - # fit model on finished evaluations - self.surrogate_model.fit(train_x, train_y) - # hallucinating: predict for the pending evaluations - _y, _ = self.surrogate_model.predict(pending_x) - _y = _y.detach().numpy().tolist() - # appending to training data - train_x.extend(pending_x) - train_y.extend(_y) - return train_x, train_y - - def update_model(self, train_x, train_y, pending_x, decay_t=None): - if decay_t is None: - decay_t = len(train_x) - train_x, train_y = self._fantasize_pending(train_x, train_y, pending_x) - self.surrogate_model.fit(train_x, train_y) - self.acquisition.set_state(self.surrogate_model, decay_t=decay_t) - # TODO: set_state should generalize to all options - # no needed to set state of sampler when using `random` - # self.acquisition_sampler.set_state(x=train_x, y=train_y) + # If we have a prior, wrap the above acquisition with a prior weighting + if self.prior is not None: + assert decay_t is not None + # TODO: Ideally we have something based on budget and dimensions, not an arbitrary term + # This 10 is extracted from the old DecayingWeightedPrior + pibo_exp_term = 10 / decay_t + significant_lower_bound = 1e-4 # Below this, the prior weighting has no significant impact + if pibo_exp_term > significant_lower_bound: + acq = pibo_acquisition( + acq, + prior=self.prior, + prior_exponent=pibo_exp_term, + x_domain=self._encoder.domains, + x_pending=x_pending, + ) + + self._y_model = y_model + self._acq = acq def sample( self, @@ -354,8 +369,6 @@ def sample( variable set to the same value. This value is same as that of the fidelity value of the configs in the training data. 
""" - self.logger.info("Acquiring...") - # sampling random configurations samples = [ self.pipeline_space.sample(user_priors=False, ignore_fidelity=True) diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 62c81ed8..fc27bb6b 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -375,8 +375,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: if seed is not None: raise NotImplementedError("Seeding is not yet implemented.") @@ -388,11 +389,11 @@ def sample( ) _n = torch.Size((n,)) if isinstance(n, int) else n - out = torch.empty(_out_shape, device=device, dtype=torch.float64) + out = torch.empty(_out_shape, device=device, dtype=dtype) for i, dist in enumerate(self.distributions): out[..., i] = dist.distribution.sample(_n) - return Domain.translate(out, frm=self._distribution_domains, to=to) + return Domain.translate(out, frm=self._distribution_domains, to=to, dtype=dtype) @dataclass @@ -422,8 +423,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: if seed is not None: raise NotImplementedError("Seeding is not yet implemented.") @@ -433,8 +435,8 @@ def sample( if isinstance(n, int) else torch.Size((*n, self.ndims)) ) - samples = torch.rand(_n, device=device, dtype=torch.float64) - return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to) + samples = torch.rand(_n, device=device, dtype=dtype) + return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to, dtype=dtype) @dataclass @@ -485,7 +487,10 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: - return self._weighted_sampler.sample(n, to=to, seed=seed, device=device) + return self._weighted_sampler.sample( + n, to=to, seed=seed, device=device, dtype=dtype + ) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index 64105534..c5c76b8e 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -34,8 +34,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: """Sample `n` points and convert them to the given domain. @@ -47,7 +48,7 @@ def sample( to: If a single domain, `.ncols` columns will be produced form that one domain. If a list of domains, then it must have the same length as the number of columns, with each column being in the corresponding domain. - seed: The seed for the random number generator. + seed: The seed generator device: The device to cast the samples to. Returns: @@ -82,6 +83,18 @@ def uniform(cls, ndim: int) -> UniformPrior: return UniformPrior(ndims=ndim) + @classmethod + def borders(cls, ndim: int) -> BorderSampler: + """Create a border sampler. + + Args: + ndim: The number of dimensions to sample. + + Returns: + A border sampler. 
+ """ + return BorderSampler(ndim=ndim) + # Technically this could be a prior with a uniform distribution @dataclass @@ -112,8 +125,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: if seed is not None: raise NotImplementedError("Setting the seed is not supported yet") @@ -123,14 +137,15 @@ def sample( # and reshape the output tensor to the desired shape, if needed. _n = n if isinstance(n, int) else reduce(lambda x, y: x * y, n) + _seed = ( + None if seed is None else torch.randint(0, 2**31, (1,), generator=seed).item() + ) sobol = torch.quasirandom.SobolEngine( - dimension=self.ndim, - scramble=self.scramble, - seed=seed, + dimension=self.ndim, scramble=self.scramble, seed=_seed ) - out = torch.empty(_n, self.ncols, dtype=torch.float64, device=device) - x = sobol.draw(_n, dtype=torch.float64, out=out) + out = torch.empty(_n, self.ncols, dtype=dtype, device=device) + x = sobol.draw(_n, dtype=dtype, out=out) # If we got extra dimensions, such as batch dimensions, we need to # reshape the tensor to the desired shape. @@ -185,8 +200,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: if seed is not None: raise NotImplementedError("Seeding is not yet implemented.") @@ -205,12 +221,15 @@ def sample( self.probabilities, total_samples, replacement=True, + generator=seed, out=chosen_samplers, ) # Create an empty tensor to hold all samples output_samples = torch.empty( - (total_samples, self.ncols), device=device, dtype=torch.float64 + (total_samples, self.ncols), + device=device, + dtype=dtype, ) # Loop through each sampler and its associated indices @@ -221,10 +240,73 @@ def sample( if len(indices) > 0: # Sample from the sampler for the required number of indices - samples_from_sampler = sampler.sample(len(indices), to=to, device=device) + samples_from_sampler = sampler.sample( + len(indices), + to=to, + seed=seed, + device=device, + dtype=dtype, + ) output_samples[indices] = samples_from_sampler # Reshape to the output shape including ncols dimension output_samples = output_samples.view(output_shape) return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) + + +@dataclass +class BorderSampler(Sampler): + """A sampler that samples from the border of a hypercube.""" + + ndim: int + + @property + @override + def ncols(self) -> int: + return self.ndim + + @property + def n_possible(self) -> int: + """The amount of possible border configurations.""" + return 2**self.ndim + + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: torch.Generator | None = None, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ) -> torch.Tensor: + _arange = torch.arange(self.n_possible, device=device, dtype=torch.int32) + # Calculate the total number of samples required + if isinstance(n, int): + total_samples = min(n, self.n_possible) + output_shape = (total_samples, self.ncols) + else: + total_samples = reduce(lambda x, y: x * y, n) + if total_samples > self.n_possible: + raise ValueError( + f"The shape of samples requested (={n}) is more than the number of " + f"possible border configurations (={self.n_possible})." 
+ ) + output_shape = (*n, self.ncols) + + if self.n_possible <= total_samples: + configs = _arange + else: + # Otherwise, we take a random sample of the 2**n possible border configs + rand_ix = torch.randperm(self.n_possible, generator=seed, device=device)[ + :total_samples + ] + configs = _arange[rand_ix] + + # https://stackoverflow.com/a/63546308/5332072 + bit_masks = 2 ** _arange[: self.ndim] + configs = configs.unsqueeze(1).bitwise_and(bit_masks).ne(0).to(dtype) + # Reshape to the output shape including ncols dimension + configs = configs.view(output_shape) + return Domain.translate(configs, frm=UNIT_FLOAT_DOMAIN, to=to) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index a5151aab..3ef203f1 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -89,19 +89,18 @@ class Domain(Generic[V]): value. """ - dtype: torch.dtype = field(init=False, repr=False) is_unit_float: bool = field(init=False, repr=False) midpoint: V = field(init=False, repr=False) is_log: bool = field(init=False, repr=False) length: V = field(init=False, repr=False) cardinality: int | None = field(init=False, repr=False) bounds: tuple[V, V] = field(init=False, repr=False) + preffered_dtype: torch.dtype = field(init=False, repr=False) def __post_init__(self): assert isinstance(self.lower, type(self.upper)) is_int = isinstance(self.lower, int) object.__setattr__(self, "is_log", self.log_bounds is not None) - object.__setattr__(self, "dtype", torch.int64 if is_int else torch.float64) object.__setattr__( self, "is_unit_float", @@ -116,10 +115,14 @@ def __post_init__(self): else: cardinality = None - object.__setattr__(self, "cardinality", cardinality) + preferred_dtype = torch.int64 if is_int else torch.float64 + object.__setattr__(self, "preffered_dtype", preferred_dtype) + mid = self.from_unit(torch.tensor(0.5)).item() - if self.dtype == torch.int64: + if is_int: mid = int(round(mid)) + + object.__setattr__(self, "cardinality", cardinality) object.__setattr__(self, "midpoint", mid) object.__setattr__(self, "bounds", (self.lower, self.upper)) @@ -203,17 +206,23 @@ def indices(cls, n: int) -> Domain[int]: """ return Domain.int(0, n - 1) - def to_unit(self, x: Tensor) -> Tensor: + def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """Transform a tensor of values from this domain to the unit interval [0, 1]. Args: x: Tensor of values in this domain to convert. + dtype: The dtype to convert to Returns: Same shape tensor with the values normalized to the unit interval [0, 1]. """ + if dtype is None: + dtype = torch.float64 + else: + assert dtype.is_floating_point, "Unit interval is only for floats." + if self.is_unit_float: - return x + return x.to(dtype) if self.log_bounds is not None: x = torch.log(x) @@ -221,19 +230,22 @@ def to_unit(self, x: Tensor) -> Tensor: else: lower, upper = self.lower, self.upper - return (x - lower) / (upper - lower) + x = (x - lower) / (upper - lower) + return x.type(dtype) - def from_unit(self, x: Tensor) -> Tensor: + def from_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """Transform a tensor of values from the unit interval [0, 1] to this domain. Args: x: A tensor of values in the unit interval [0, 1] to convert. + dtype: The dtype to convert to Returns: Same shape tensor with the values lifted into this domain. 
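
        Example:
            A small illustration (values chosen to be exact; `Domain.int` is the
            integer constructor used by `Domain.indices` above):

                d = Domain.int(2, 10)
                d.from_unit(torch.tensor([0.0, 0.5, 1.0]))
                # -> tensor([ 2,  6, 10])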
""" + dtype = dtype or self.preffered_dtype if self.is_unit_float: - return x + return x.to(dtype) bins = self.bins if bins is not None: @@ -252,9 +264,9 @@ def from_unit(self, x: Tensor) -> Tensor: if self.round: x = torch.round(x) - return x.type(self.dtype) + return x.type(dtype) - def cast(self, x: Tensor, frm: Domain) -> Tensor: + def cast(self, x: Tensor, frm: Domain, *, dtype: torch.dtype | None = None) -> Tensor: """Cast a tensor of values frm the domain `frm` to this domain. If you need to cast a tensor of mixed domains, use @@ -263,10 +275,12 @@ def cast(self, x: Tensor, frm: Domain) -> Tensor: Args: x: Tensor of values in the `frm` domain to cast to this domain. frm: The domain to cast from. + dtype: The dtype to convert to Returns: Same shape tensor with the values cast to this domain. """ + dtype = dtype or self.preffered_dtype # NOTE: In general, we should always be able to go through the unit interval # [0, 1] to be able to transform between domains. However sometimes we can # bypass some steps, dependant on the domains, hence the ugliness... @@ -281,12 +295,12 @@ def cast(self, x: Tensor, frm: Domain) -> Tensor: if same_bounds and same_log_bounds and (self.bins is None or same_bins): if self.round: x = torch.round(x) - return x.type(self.dtype) if x.dtype != self.dtype else x + return x.type(dtype) # Shortcut 2. (From normalized) # The domain we are coming from is already normalized, we only need to lift if frm.is_unit_float: - return self.from_unit(x) # type: ignore + return self.from_unit(x, dtype=dtype) # type: ignore # Shortcut 3. (Log lift) # We can also shortcut out if the only diffrence is that we are coming frm the @@ -296,11 +310,10 @@ def cast(self, x: Tensor, frm: Domain) -> Tensor: x = torch.exp(x) if self.round: x = torch.round(x) - return x.type(self.dtype) + return x.type(dtype) # Otherwise, through the unit interval we go - norm = frm.to_unit(x) - lift = self.from_unit(norm) + lift = self.from_unit(frm.to_unit(x), dtype=dtype) return lift # noqa: RET504 @classmethod @@ -314,6 +327,8 @@ def translate( x: Tensor, frm: Domain | Iterable[Domain], to: Domain | Iterable[Domain], + *, + dtype: torch.dtype | None = None, ) -> Tensor: """Cast a tensor of mixed domains to a new set of mixed domains. @@ -326,6 +341,7 @@ def translate( to: List of domains to cast to. If list, must be length as `n_dims`, otherwise we assume the single domain provided is the one to be used across all dimensions. + dtype: The dtype of the converted tensor Returns: Tensor of the same shape as `x` with the last dimension casted @@ -341,7 +357,7 @@ def translate( # If both are not a list, we can just cast the whole tensor if isinstance(frm, Domain) and isinstance(to, Domain): - return to.cast(x, frm=frm) + return to.cast(x, frm=frm, dtype=dtype) frm = [frm] * ndims if isinstance(frm, Domain) else list(frm) to = [to] * ndims if isinstance(to, Domain) else list(to) @@ -360,9 +376,9 @@ def translate( f" Expected {ndims} from last dimension of {x.shape=}, got {len(to)}." 
) - out = torch.empty_like(x) + out = torch.empty_like(x, dtype=dtype) for i, (f, t) in enumerate(zip(frm, to, strict=False)): - out[..., i] = t.cast(x[..., i], frm=f) + out[..., i] = t.cast(x[..., i], frm=f, dtype=dtype) return out diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index eef1b25b..d47c5363 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -11,10 +11,7 @@ ) from typing_extensions import Protocol, override -import numpy as np -import numpy.typing as npt import torch -from grakel.utils import graph_from_networkx from neps.search_spaces.domain import ( UNIT_FLOAT_DOMAIN, @@ -25,10 +22,7 @@ from neps.search_spaces.hyperparameters.integer import IntegerParameter if TYPE_CHECKING: - import networkx as nx - from neps.search_spaces.parameter import Parameter - from neps.search_spaces.search_space import SearchSpace WLInput: TypeAlias = tuple[dict, dict | None, dict | None] V = TypeVar("V", int, float) @@ -183,50 +177,7 @@ def decode(self, x: torch.Tensor) -> list[V]: @dataclass -class WLInputTransformer(Transformer[WLInput]): - hp: str - - def encode(self, x: Sequence[nx.Graph]) -> list[WLInput]: - return [graph_from_networkx(g) for g in x] # type: ignore - - def decode(self, x: Mapping[str, Sequence[WLInput]]) -> dict[str, list[Any]]: - raise NotImplementedError("Cannot decode WLInput to values.") - - -@dataclass -class GraphEncoder: - transformers: dict[str, WLInputTransformer] - column_lookup: dict[str, int] = field(init=False) - - def __post_init__(self): - transformers = sorted(self.transformers.items(), key=lambda t: t[0]) - self.transformers = dict(transformers) - self.column_lookup: dict[str, int] = { - name: i for i, (name, _) in enumerate(self.transformers.items()) - } - - def select( - self, x: npt.NDArray[np.object_], hp: str | Sequence[str] - ) -> npt.NDArray[np.object_]: - # Kind of a redundant function but made to be compatible with TensorPack - if isinstance(hp, str): - return x[:, self.column_lookup[hp]] - - return x[:, [self.column_lookup[h] for h in hp]] - - def encode(self, x: Sequence[Any]) -> npt.NDArray[np.object_]: - buffer = np.empty((len(x), len(self.transformers)), dtype=np.object_) - for hp, transformer in self.transformers.items(): - values = [conf[hp] for conf in x] - buffer[:, self.column_lookup[hp]] = transformer.encode(values) # type: ignore - return buffer - - def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: - raise NotImplementedError("Cannot decode graph embeddings.") - - -@dataclass -class TensorEncoder: +class ConfigEncoder: transformers: dict[str, TensorTransformer] index_of: dict[str, int] = field(init=False) domain_of: dict[str, Domain] = field(init=False) @@ -290,15 +241,7 @@ def encode( return buffer - def pack( - self, - x: Sequence[Mapping[str, Any]], - *, - device: torch.device | None = None, - ) -> TensorPack: - return TensorPack(self.encode(x, device=device), self) - - def unpack(self, x: torch.Tensor) -> list[dict[str, Any]]: + def decode(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): lookup = self.index_of[hp_name] @@ -317,7 +260,7 @@ def default( parameters: Mapping[str, Parameter], *, custom_transformers: dict[str, TensorTransformer] | None = None, - ) -> TensorEncoder: + ) -> ConfigEncoder: custom = custom_transformers or {} sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} @@ -334,59 +277,13 @@ def default( case 
_: raise ValueError(f"Unsupported parameter type: {type(hp)}") - return TensorEncoder(transformers) + return ConfigEncoder(transformers) @dataclass -class TensorPack: - tensor: torch.Tensor - encoder: TensorEncoder - - def __len__(self) -> int: - return len(self.tensor) - - @property - def n_numerical(self) -> int: - return self.encoder.n_numerical - - @property - def n_categorical(self) -> int: - return self.encoder.n_categorical - - @property - def ncols(self) -> int: - return self.encoder.ncols - - @property - def domains(self) -> dict[str, Domain]: - return self.encoder.domains +class EncodedPending: + """Tensor data of pending configurations.""" - def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.object_]: - return self.encoder.select(self.tensor, hp) - - def names(self) -> list[str]: - return self.encoder.names() - - def to_dicts(self) -> list[dict[str, Any]]: - return self.encoder.unpack(self.tensor) - - def split(self, index: int) -> tuple[TensorPack, TensorPack]: - left = TensorPack(self.encoder, tensor=self.tensor[:index]) - right = TensorPack(self.encoder, tensor=self.tensor[index:]) - return left, right - - def join(self, *other: TensorPack) -> TensorPack: - assert all(o.encoder == self.encoder for o in other) - - numerical = torch.cat([self.tensor, *[o.tensor for o in other]], dim=0) - return TensorPack(self.encoder, tensor=numerical) - - @classmethod - def default_encoding( - cls, - x: Sequence[Mapping[str, Any]], - space: SearchSpace, - ) -> TensorPack: - default_encoder = TensorEncoder.default(space) - tensor = default_encoder.encode(x) - return TensorPack(default_encoder, tensor) + ids: torch.Tensor + x: torch.Tensor + fid: torch.Tensor | None From 967e679bb8d17b6db91a301f7a330e020ffc52a3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 2 Oct 2024 17:13:42 +0200 Subject: [PATCH 55/63] refactor(ifbo): Better acq function optimization --- .../bayesian_optimization/models/ftpfn.py | 246 +++++++-------- .../bayesian_optimization/optimizer.py | 4 +- neps/optimizers/intial_design.py | 8 +- neps/optimizers/multi_fidelity/ifbo.py | 297 +++++------------- neps/sampling/priors.py | 120 +++---- neps/sampling/samplers.py | 17 +- neps/search_spaces/domain.py | 63 +++- neps/search_spaces/encoding.py | 41 +-- 8 files changed, 320 insertions(+), 476 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 6c6464a6..4df2dfb8 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -1,16 +1,14 @@ from __future__ import annotations from collections.abc import Callable, Mapping -from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal - +from typing import Any import torch from ifbo import FTPFN from neps.sampling.samplers import Sampler from neps.search_spaces.domain import Domain -from neps.search_spaces.encoding import CategoricalToUnitNorm, ConfigEncoder +from neps.search_spaces.encoding import ConfigEncoder from neps.search_spaces.search_space import SearchSpace from neps.state.trial import Trial @@ -101,7 +99,7 @@ def _cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor: FTPFN_DTYPE = torch.float32 -def encode_trials_for_ftpfn( +def encode_ftpfn( trials: Mapping[str, Trial], space: SearchSpace, budget_domain: Domain, @@ -110,16 +108,14 @@ def encode_trials_for_ftpfn( device: torch.device | None = None, dtype: torch.dtype = FTPFN_DTYPE, error_value: float = 0.0, -) -> 
FTPFNData: + pending_value: float = torch.nan, +) -> tuple[torch.Tensor, torch.Tensor]: """Encode the trials into a format that the FTPFN model can understand. !!! warning "Pending trials" - For trials which do not have a loss reported yet, they are considered pending - and will have `torch.nan` as their score inside the returned y values. - If using - [`acquire_next_from_ftpfn()`][neps.optimizers.bayesian_optimization.models.ftpfn.acquire_next_from_ftpfn], - the result of these configurations will be fantasized. + For trials which do not have a loss reported yet, they are considered pending. + By default this is torch.nan and we recommend fantasizing these values. !!! warning "Error values" @@ -144,7 +140,9 @@ def encode_trials_for_ftpfn( assert space.fidelity_name is not None assert space.fidelity is not None assert 0 <= error_value <= 1 - train_configs = encoder.encode([t.config for t in selected.values()], device=device) + train_configs = encoder.encode( + [t.config for t in selected.values()], device=device, dtype=dtype + ) ids = torch.tensor( [int(config_id.split("_", maxsplit=1)[0]) for config_id in selected.keys()], device=device, @@ -158,13 +156,15 @@ def encode_trials_for_ftpfn( device=device, dtype=dtype, ) - train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) + train_budgets = budget_domain.cast( + train_fidelities, frm=space.fidelity.domain, dtype=dtype + ) # TODO: Document that it's on the user to ensure these are already all bounded # We could possibly include some bounded transform to assert this. minimize_ys = torch.tensor( [ - torch.nan + pending_value if trial.report is None else (error_value if trial.report.loss is None else trial.report.loss) for trial in trials.values() @@ -179,149 +179,119 @@ def encode_trials_for_ftpfn( f"\n{minimize_ys}" ) maximize_ys = 1 - minimize_ys - return FTPFNData( - ids=ids, - x=train_configs, - y=maximize_ys, - budgets=train_budgets, - pending_mask=minimize_ys.isnan(), + x_train = torch.cat( + [ids.unsqueeze(1), train_budgets.unsqueeze(1), train_configs], dim=1 ) + return x_train, maximize_ys -@dataclass -class FTPFNData: - """Dataclass to hold the data for the FTPFN model. - - The layout of the data is as follows: - - * `ids`: The configuration ids. These will have +1 added to them as FTPFN uses `0` - for test configurations, but NePS starts ids at `0`. - * `x`: The encoded configurations, includes everything that was encoded by the encoder - passed to - [`encode_trials_for_ftpfn()`][neps.optimizers.bayesian_optimization.models.ftpfn.encode_trials_for_ftpfn] - * `y`: The scores of the configurations, these are inverted such they are to be maximized, where 1 is the maximum - score obtainable and 0 is the minimum. Any configuration which did not have a loss gets a score of `nan`. - * `budgets`: The budgets of the configurations, normalized to the range [0, 1]. - These are normalized such that the lower bound of the fidelity domain maps to `1/max_fid` - while the upper bound maps to `1`. - * `pending_mask`: A mask to indicate which configurations are pending, i.e. have not been evaluated yet. - If there are no pending configurations, this should be `None`. 
- """ - - ids: torch.Tensor - x: torch.Tensor - y: torch.Tensor - budgets: torch.Tensor - pending_mask: torch.Tensor | None = None - - -def create_border_configs( - ndims: int, - *, - dtype: torch.dtype | None = None, - device: torch.device | None = None, - max_samples: int = 2**9, -) -> torch.Tensor: - n_samples = 2**ndims - _arange = torch.arange(n_samples, device=device, dtype=torch.int32) - # 2**9 is only 512 samples, so we can afford to exhaustively generate them - # We likely won't have this many hyperparameters anywho - if n_samples <= max_samples: - configs = _arange - else: - # Otherwise, we take a random sample of the 2**n possible border configs - rand_uniq_indices = torch.randperm(n_samples, device=device)[:max_samples] - configs = _arange[rand_uniq_indices] +def decode_ftpfn_data( + x: torch.Tensor, + encoder: ConfigEncoder, + budget_domain: Domain, + fidelity_domain: Domain, +) -> list[tuple[int | None, int | float, dict[str, Any]]]: + if x.ndim == 1: + x = x.unsqueeze(0) - # https://stackoverflow.com/a/63546308/5332072 - bit_masks = 2 ** _arange[ndims] - return configs.unsqueeze(1).bitwise_and(bit_masks).ne(0).to(dtype) + _raw_ids = x[:, 0].tolist() + # Here, we subtract 1 to get the real id, otherwise if it was a test ID, we say it had None + real_ids = [None if _id == 0 else int(_id) - 1 for _id in _raw_ids] + fidelities = fidelity_domain.cast(x[:, 1], frm=budget_domain).tolist() + configs = encoder.decode(x[:, 2:]) + return list(zip(real_ids, fidelities, configs)) def acquire_next_from_ftpfn( *, ftpfn: FTPFNSurrogate, - data: FTPFNData, + continuation_samples: torch.Tensor, encoder: ConfigEncoder, budget_domain: Domain, fidelity_domain: Domain, - seed: int | None = None, - acq_strategy: Callable[ - [torch.Tensor, torch.Tensor, torch.Tensor, FTPFNSurrogate], torch.Tensor - ], + initial_samplers: list[tuple[Sampler, int]], + local_search_sample_size: int = 128, + local_search_confidence: float = 0.95, # [0, 1] + acq_function: Callable[[torch.Tensor], torch.Tensor], + seed: torch.Generator | None = None, dtype: torch.dtype | None = FTPFN_DTYPE, - extra_acq_samples: torch.Tensor | None = None, -) -> tuple[int | None, int | float | None, dict[str, Any]]: - X = torch.cat([data.ids.unsqueeze(1), data.budgets.unsqueeze(1), data.x], dim=1).to( - dtype +) -> torch.Tensor: + # 1. Remove duplicate configurations from continuation_samples, keeping only the most recent eval + acq_existing = _keep_highest_budget_evaluation( + continuation_samples, id_col=0, budget_col=1 ) - ys = data.y.clone().detach() - - # In-fill pending with predicted performance - if data.pending_mask is not None: - not_pending = ~data.pending_mask - pending_ys = ftpfn.get_mean_performance( - train_x=X[not_pending], - train_y=ys[not_pending], - test_x=X[data.pending_mask], - ) - ys[data.pending_mask] = pending_ys - - # We also need to append existing configurations that are in training data, but bump up their - # budget by one step. - # 1. Exclude all configurations which are currently pending - acq_existing = X - if data.pending_mask is not None: - acq_existing = X[~data.pending_mask] - - # 2. Remove duplicate configurations from x train, keeping only the most recent eval - acq_existing = _keep_highest_budget_evaluation(acq_existing, id_col=0, budget_col=1) - # 3. Remove configs that have been fully evaluated + # 2. Remove configs that have been fully evaluated acq_existing = acq_existing[acq_existing[:, 1] < budget_domain.upper] - - # 4. 
Include the extra acquisition samples - if extra_acq_samples is None: - samples = [acq_existing] + if len(acq_existing) != 0: + # We keep a copy of the original budgets incase they get modified + # so we can return the fidelity of the sample that had the best acquisition score + budgets_prior_to_acq = acq_existing[:, 1].clone().detach() + + # Get the best configuration for continuation + acq_scores = acq_function(acq_existing) + best_ix = acq_scores.argmax() + + best_score = acq_scores[best_ix].item() + best_row = acq_existing[best_ix].clone().detach() + del acq_existing + del acq_scores else: - _shape = (len(extra_acq_samples), 1) - acq_extra = torch.cat( - [ - torch.zeros(_shape, dtype=dtype, device=ftpfn.device), - torch.full(_shape, budget_domain.lower, dtype=dtype, device=ftpfn.device), - extra_acq_samples, - ], - dim=1, - ) - samples = [acq_existing, acq_extra] - - # 5. Now we can fuse them together - acq_samples = torch.cat(samples, dim=0).to(dtype=dtype) - - # We keep a copy of the original budgets incase they get modified - # so we can return the fidelity of the sample that had the best acquisition score - budgets_prior_to_acq = acq_samples[:, 1].clone().detach() - - # Now we offload acquisition to the caller - acq_scores = acq_strategy(X, ys, acq_samples, ftpfn) - - # Extract out the row which had the best PI - best_ix = acq_scores.argmax() + best_score = -float("inf") + best_row = torch.tensor([]) + + # We'll be re-using 0 id and min budget alot, just create them once and re-use + _N = max(max(s[1] for s in initial_samplers), local_search_sample_size) + ids = torch.zeros((_N, 1), dtype=dtype, device=ftpfn.device) + min_budget = torch.full( + size=(_N, 1), fill_value=budget_domain.lower, dtype=dtype, device=ftpfn.device + ) - best_id = int(acq_samples[best_ix, 0].round().item()) - if best_id == 0: # It was a new acq. sample - best_real_id = None - best_fid = None - else: # It was a sample to continue, decrement the 1 added earlier - best_real_id = best_id - 1 - best_fid = fidelity_domain.cast_one( - budgets_prior_to_acq[best_ix].item(), frm=budget_domain + # Now begin acquisition maximization by sampling from given samplers and performing an additional + # round of local sampling around the best point + local_sample_confidence = [local_search_confidence] * len(encoder.domains) + for sampler, size in initial_samplers: + # 1. Use provided sampler and eval samples with acq + samples = sampler.sample( + size, to=encoder.domains, seed=seed, device=ftpfn.device, dtype=dtype ) - - best_vector = acq_samples[best_ix, 2:].unsqueeze(0) - best_config = encoder.decode(best_vector)[0] - - return best_real_id, best_fid, best_config + _N = len(samples) + X_test = torch.cat([ids[:_N], min_budget[:_N], samples], dim=1) + acq_scores = acq_function(X_test) + + # ... update best if needed + sample_best_ix = acq_scores.argmax() + sample_best_score = acq_scores[sample_best_ix] + sample_best_row = X_test[sample_best_ix].clone().detach() + if sample_best_score > best_score: + best_score = sample_best_score + best_row = sample_best_row + + # 2. Sample around best point from above samples and eval acq. 
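+ # The local search below re-centers sampling on the best candidate found in
+ # this batch: columns 0 and 1 of a row are the id and budget, so `[2:]` is the
+ # encoded configuration. A prior centered on that point (with sharpness set by
+ # `local_search_confidence`) proposes `local_search_sample_size` fresh points
+ # at the minimum budget, which are scored with the same acquisition function.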
+ _mode = sample_best_row[2:] + local_sampler = Sampler.centered( + centers=list(zip(_mode.tolist(), local_sample_confidence)), + domains=encoder.domains, + ) + samples = local_sampler.sample( + local_search_sample_size, + to=encoder.domains, + seed=seed, + device=ftpfn.device, + dtype=dtype, + ) + _N = len(samples) + X_test = torch.cat([ids[:_N], min_budget[:_N], samples], dim=1) + acq_scores = acq_function(X_test) + + local_best_ix = acq_scores.argmax() + local_best_score = acq_scores[local_best_ix].clone().detach() + if local_best_score > best_score: + best_score = local_best_score + best_row = X_test[local_best_ix].clone().detach() + + # Finally, if the best + return best_row _CACHED_FTPFN_MODEL: dict[tuple[str, str], FTPFN] = {} diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 27411158..a0137171 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -113,7 +113,9 @@ def __init__( # noqa: D417 **pipeline_space.categoricals, } self.encoder = encoder or ConfigEncoder.default(params) - self.prior = Prior.from_parameters(params) if use_priors is True else None + self.prior = ( + Prior.from_parameters(params.values()) if use_priors is True else None + ) self.seed = seed self.use_cost = use_cost self.use_priors = use_priors diff --git a/neps/optimizers/intial_design.py b/neps/optimizers/intial_design.py index 5993f68b..a2159eb0 100644 --- a/neps/optimizers/intial_design.py +++ b/neps/optimizers/intial_design.py @@ -113,15 +113,11 @@ def make_initial_design( case "uniform": sampler = Sampler.uniform(ndim=len(params)) case "prior": - sampler = Prior.from_parameters(params) + sampler = Prior.from_parameters(params.values()) case _: sampler = sampler - encoded_configs = sampler.sample( - sample_size * 2, - to=encoder.domains, - seed=seed, - ) + encoded_configs = sampler.sample(sample_size * 2, to=encoder.domains, seed=seed) uniq_x = torch.unique(encoded_configs, dim=0) sample_configs = encoder.decode(uniq_x[:sample_size]) configs.extend([{**config, **fids} for config in sample_configs]) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 59de10c1..9c72ae46 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,15 +1,15 @@ -from functools import partial +from __future__ import annotations from typing import Any, Mapping, Literal import numpy as np import torch -import warnings from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.models.ftpfn import ( FTPFNSurrogate, acquire_next_from_ftpfn, - encode_trials_for_ftpfn, + decode_ftpfn_data, + encode_ftpfn, ) from neps.optimizers.intial_design import make_initial_design from neps.sampling.priors import Prior @@ -23,8 +23,6 @@ # NOTE: Ifbo was trained using 32 bit FTPFN_DTYPE = torch.float32 -ID_COL = 0 -BUDGET_COL = 1 def _adjust_pipeline_space_to_match_stepsize( @@ -81,127 +79,6 @@ def _adjust_pipeline_space_to_match_stepsize( ) -def _tokenize( - ids: torch.Tensor, - budgets: torch.Tensor, - configs: torch.Tensor, -) -> torch.Tensor: - return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1).to( - FTPFN_DTYPE - ) - - -def _encode_for_ftpfn( - trials: Mapping[str, Trial], - encoder: ConfigEncoder, - space: SearchSpace, - budget_domain: Domain, - device: torch.device | None = None, - dtype: torch.dtype = FTPFN_DTYPE, -) -> 
tuple[torch.Tensor, torch.Tensor]: - """Encode the trials into a format that the FTPFN model can understand. - - !!! warning "loss values reported" - - The `ys` are a single dimension but consist of the losses inverted to scores. - As result, we have to assert that the loss values provided in the trials are - in the range [0, 1]. - - !!! note "X layout" - - The layout of the X is: - - ``` - | config_id | budget (normalized from fidelity) | hp_1 | hp_2 | ... | hp_n | - ``` - - Here the `budget` is normalized to the range [0, 1] while the hp parameters - are encoded according to the provided encoder, which should map the parameter - values from the original domain to some domain in [0, 1]. - - !!! warning "Pending and Error trials" - - We currently do not handle error cases, **and they are ignored**. - For trials which do not have a loss reported yet, they are considered pending - and will have `torch.nan` as their score inside the returned y values. - - Args: - trials: The trials to encode - encoder: The encoder to use - space: The search space - budget_domain: The domain to use for the budgets of the FTPFN - device: The device to use - dtype: The dtype to use - - Returns: - The encoded trials and their corresponding **scores** - """ - # Select all trials which have something we can actually use for modelling - # The absence of a report signifies pending - selected = { - trial_id: trial - for trial_id, trial in trials.items() - if trial.report is None or trial.report.loss is not None - } - assert space.fidelity_name is not None - assert space.fidelity is not None - train_configs = encoder.encode([t.config for t in selected.values()], device=device) - ids = torch.tensor( - [int(config_id.split("_", maxsplit=1)[0]) for config_id in selected.keys()], - device=device, - dtype=torch.float64, - ) - train_fidelities = torch.tensor( - [t.config[space.fidelity_name] for t in selected.values()], - device=device, - dtype=torch.float64, - ) - train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) - X = _tokenize( - ids=torch.tensor(ids, device=device), - budgets=train_budgets, - configs=train_configs, - ).to(dtype) - - # TODO: Document that it's on the user to ensure these are already all bounded - # We could possibly include some bounded transform to assert this. - minimize_ys = torch.tensor( - [ - trial.report.loss - if trial.report is not None and trial.report.loss is not None - else np.nan - for trial in trials.values() - ], - device=device, - dtype=FTPFN_DTYPE, - ) - if minimize_ys.max() > 1 or minimize_ys.min() < 0: - raise RuntimeError( - "ifBO requires that all loss values reported lie in the interval [0, 1]" - " but recieved loss value outside of that range!" - f"\n{minimize_ys}" - ) - maximize_ys = 1 - minimize_ys - return X, maximize_ys - - -def _keep_highest_budget_evaluation(x: torch.Tensor) -> torch.Tensor: - # Does a lexsort, same as if we sorted by (config_id, budget), where - # theyre are sorted according to increasing config_id and then increasing budget. - # x[i2] -> sorted by config id and budget - i1 = torch.argsort(x[:, BUDGET_COL]) - i2 = i1[torch.argsort(x[i1][:, ID_COL], stable=True)] - sorted_x = x[i2] - - # Now that it's sorted, we essentially want to count the occurence of each id into counts - _, counts = torch.unique_consecutive(sorted_x[:, ID_COL], return_counts=True) - - # Now we can use these counts to get to the last occurence of each id - # The -1 is because we want to index from 0 but sum starts at 1. 
- ii = counts.cumsum(0) - 1 - return sorted_x[ii] - - class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" @@ -259,7 +136,7 @@ def __init__( self._initial_design: list[dict[str, Any]] | None = None params = {**space.numerical, **space.categoricals} - self._prior = Prior.from_parameters(params) if use_priors else None + self._prior = Prior.from_parameters(params.values()) if use_priors else None self._config_encoder: ConfigEncoder = ConfigEncoder.default( params, # FTPFN doesn't support categoricals and we were recomenned to just evenly distribute @@ -311,116 +188,98 @@ def ask( return SampledConfig(id=f"{new_id}_0", config=self._initial_design[new_id]) # Otherwise, we proceed to surrogate phase - data = encode_trials_for_ftpfn( + ftpfn = FTPFNSurrogate( + target_path=self.surrogate_model_args.get("target_path", None), + version=self.surrogate_model_args.get("version", "0.0.1"), + device=self.device, + ) + X, y = encode_ftpfn( trials=trials, space=self.pipeline_space, encoder=self._config_encoder, budget_domain=self._budget_domain, device=self.device, + pending_value=torch.nan, ) - # TODO: Very little chance mfpi_random is best but for now it's stable - def _mfpi_random( - _X: torch.Tensor, - _y: torch.Tensor, - _acq_samples: torch.Tensor, - _ftpfn: FTPFNSurrogate, - how: Literal["pi", "ei"], - ) -> torch.Tensor: - rng = np.random.RandomState(None if seed is None else seed + len(trials)) - _low = self._budget_ix_domain.lower - _high = self._budget_ix_domain.upper - horizon_index = rng.randint(_low, _high) + 1 - horizon = self._budget_domain.cast_one( - horizon_index, frm=self._budget_ix_domain - ) - f_best = _y.max().item() - r = rng.uniform(-4, -1) - threshold = f_best + (10**r) * (1 - f_best) - - # NOTE: If converting f_inc to be seperate per acq sample, you - # need to add an extra batch dimension to y_best, i.e. (n, 1) - # Budget column is between 0 and 1, but we want to add the horizon - BUDGET_COL = 1 - _acq_samples[:, BUDGET_COL] += horizon - _acq_samples[:, BUDGET_COL] = torch.clamp( - _acq_samples[:, BUDGET_COL], max=self._budget_domain.upper + # Fantasize if needed + pending_mask = torch.isnan(y) + if pending_mask.any(): + not_pending_mask = ~pending_mask + not_pending_X = X[not_pending_mask] + y[pending_mask] = ftpfn.get_mean_performance( + train_x=not_pending_X, + train_y=y[not_pending_mask], + test_x=X[pending_mask], ) - - match how: - case "pi": - return _ftpfn.get_pi(_X, _y, _acq_samples, y_best=threshold) - case "ei": - return _ftpfn.get_ei(_X, _y, _acq_samples, y_best=threshold) - case _: - raise ValueError(f"Unknown acquisition strategy: {how=}") - - ndims = self._config_encoder.ncols - - # Sample some configurations at uniform for acq. - uniform_sampler = Sampler.uniform(ndim=ndims) - uniform_configs = uniform_sampler.sample( - self.n_acquisition_new_configs, - to=self._config_encoder.domains, - seed=seed, - device=self.device, - dtype=FTPFN_DTYPE, + else: + not_pending_X = X + + # NOTE: Can't really abstract this, requires knowledge that: + # 1. The encoding is such that the loss is 1 - loss + # 2. The budget is the second column + # 3. 
The budget is encoded between 1/max_fid and 1 + rng = np.random.RandomState(None if seed is None else seed + len(trials)) + # Cast the a random budget index into the ftpfn budget domain + horizon_increment = self._budget_domain.cast_one( + rng.randint(*self._budget_ix_domain.bounds) + 1, + frm=self._budget_ix_domain, ) + f_best = y.max().item() + threshold = f_best + (10 ** rng.uniform(-4, -1)) * (1 - f_best) - # Also sample some border configurations for acq. - # OPTIM: If we are below the amount possible, there is no randomness and we can cache them - border_sampler = Sampler.borders(ndim=ndims) - N_border = 2**9 # 512, if we go over, we subselect 512 border configs - if N_border <= border_sampler.n_possible: - if self._cached_border_configs is not None: - border_configs = self._cached_border_configs - else: - self._cached_border_configs = border_sampler.sample( - n=N_border, - to=self._config_encoder.domains, - seed=seed, - device=self.device, - dtype=FTPFN_DTYPE, - ) - border_configs = self._cached_border_configs - else: - border_configs = border_sampler.sample( - n=N_border, - to=self._config_encoder.domains, - seed=seed, - device=self.device, - dtype=FTPFN_DTYPE, - ) + def _mfpi_random(samples: torch.Tensor) -> torch.Tensor: + # HACK: Because we are modifying the samples inplace, we do, and then undo the addition + original_budget_column = samples[..., 1].clone() + samples[..., 1].add_(horizon_increment).clamp_max_(self._budget_domain.upper) - id, current_fid, config = acquire_next_from_ftpfn( - ftpfn=FTPFNSurrogate( - target_path=self.surrogate_model_args.get("target_path", None), - version=self.surrogate_model_args.get("version", "0.0.1"), - device=self.device, - ), - data=data, - seed=seed, + scores = ftpfn.get_pi(X, y, samples, y_best=threshold) + + samples[..., 1] = original_budget_column + return scores + + # Do acquisition on ftpfn + sample_dims = self._config_encoder.ncols + best_row = acquire_next_from_ftpfn( + ftpfn=ftpfn, + # How to encode encoder=self._config_encoder, budget_domain=self._budget_domain, fidelity_domain=self._fid_domain, - extra_acq_samples=torch.cat([uniform_configs, border_configs], dim=0), - acq_strategy=partial(_mfpi_random, how="ei"), + # Acquisition function + acq_function=_mfpi_random, + # Which acquisition samples to consider for continuation + continuation_samples=not_pending_X, + # How to generate some initial samples + initial_samplers=[ + (Sampler.sobol(ndim=sample_dims), 512), + (Sampler.uniform(ndim=sample_dims), 512), + (Sampler.borders(ndim=sample_dims), 256), + ], + seed=seed, + # A next step local sampling around best point found by initial_samplers + local_search_sample_size=256, + local_search_confidence=0.95, ) - if current_fid is None: - assert id is None - config[self._fidelity_name] = self._fid_domain.lower + _id, fid, config = decode_ftpfn_data( + best_row, + self._config_encoder, + budget_domain=self._budget_domain, + fidelity_domain=self._fid_domain, + )[0] + + if _id is None: + config[self._fidelity_name] = fid return SampledConfig(id=f"{new_id}_0", config=config) else: - current_budget_ix = self._budget_ix_domain.cast_one( - current_fid, frm=self._fid_domain - ) - next_budget_ix = current_budget_ix + 1 - next_fid = self._fid_domain.cast_one( - next_budget_ix, frm=self._budget_ix_domain - ) + # Convert fidelity to budget index, bump by 1 and convert back + budget_ix = self._budget_ix_domain.cast_one(fid, frm=self._fid_domain) + next_ix = budget_ix + 1 + next_fid = self._fid_domain.cast_one(next_ix, frm=self._budget_ix_domain) + 
config[self._fidelity_name] = next_fid return SampledConfig( - id=f"{id}_{next_budget_ix}", + id=f"{_id}_{next_ix}", config=config, - previous_config_id=f"{id}_{current_budget_ix}", + previous_config_id=f"{_id}_{budget_ix}", ) diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index fc27bb6b..4a77ca79 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -9,7 +9,7 @@ from __future__ import annotations -from collections.abc import Container, Iterable, Mapping, Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Protocol from typing_extensions import override @@ -111,7 +111,7 @@ def uniform(cls, ncols: int) -> UniformPrior: @classmethod def from_parameters( cls, - parameters: dict[str, CategoricalParameter | FloatParameter | IntegerParameter], + parameters: Iterable[CategoricalParameter | FloatParameter | IntegerParameter], ) -> Prior: """Please refer to [`make_centered()`][neps.priors.Prior.make_centered] for more details. This is a shortcut method. @@ -124,16 +124,13 @@ def from_parameters( # accordingly in a `CenteredPrior` _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} - domains: dict[str, Domain] = {} - centers: dict[str, tuple[Any, float]] = {} - categoricals: set[str] = set() - for name, hp in parameters.items(): - domains[name] = hp.domain # type: ignore - - if isinstance(hp, CategoricalParameter): - categoricals.add(name) + domains: list[Domain] = [] + centers: list[tuple[Any, float] | None] = [] + for hp in parameters: + domains.append(hp.domain) if hp.default is None: + centers.append(None) continue confidence_str = hp.default_confidence_choice @@ -141,31 +138,25 @@ def from_parameters( center = ( hp._default_index if isinstance(hp, CategoricalParameter) else hp.default ) + centers.append((center, confidence_score)) - centers[name] = (center, confidence_score) - - # Uses truncnorms for numerical and weighted choices categoricals - return Prior.make_centered( - domains=domains, - centers=centers, - categoricals=categoricals, - ) + return Prior.make_centered(domains=domains, centers=centers) @classmethod def make_centered( cls, - domains: Mapping[str, Domain], - centers: Mapping[str, tuple[Any, float]], + domains: Iterable[Domain], + centers: Iterable[None | tuple[int | float, float]], *, - categoricals: Container[str] = (), device: torch.device | None = None, ) -> CenteredPrior: """Create a prior for a given list of domains. Will use a `TruncatedNormal` distribution for all parameters, - except those contained within `categoricals`, which will - use a `Categorical` instead. If no center is given for a domain, - a uniform prior will be used. + except those who have a domain marked with `is_categorical=True`, + using a `Categorical` distribution instead. + If the center for a given domain is `None`, a uniform prior + will be used instead. For non-categoricals, this will be interpreted as the mean and std `(1 - confidence)` for a truncnorm. For categorical values, @@ -180,68 +171,57 @@ def make_centered( Args: domains: domains over which to have a centered prior. - centers: centers for the priors. Should be a mapping - from the domain name to the center value and confidence level. - If no center is given, a uniform prior will be used. + centers: centers for the priors, i.e. the mode of the prior for that + domain, along with the confidence of that mode, which get's + re-interpreted as the std of the truncnorm or the probability + mass for the categorical. 
+ + If `None`, a uniform prior will be used. !!! warning The values contained in centers should be contained within the domain. All confidence levels should be within the `[0, 1]` range. - categoricals: The names of the domains that are categorical and which - a `Categorical` distribution will be used, rather than a - `TruncatedNormal`. - - !!! warning - - Categoricals require that the corresponding domain has a - `.cardinality`, i.e. it is not a float/continuous domain. - - device: Device to place the tensors on. - + confidence: The confidence level for the center. Entries containing `None` + should match with `centers` that are `None`. If not, this is considered an + error. + device: Device to place the tensors on for distributions. Returns: A prior for the search space. """ - for name, (_, confidence) in centers.items(): - if not 0 <= confidence <= 1: - raise ValueError( - f"Confidence level for {name} must be in the range [0, 1]." - f" Got {confidence}." - ) + domains = list(domains) distributions: list[TorchDistributionWithDomain] = [] - for name, domain in domains.items(): - center_confidence = centers.get(name) - if center_confidence is None: + for domain, center_conf in zip(domains, centers, strict=True): + # If the center is None, we use a uniform distribution. We try to match + # the distributions to all be unit uniform as it can speed up sampling when + # consistentaly the same. This still works for categoricals + if center_conf is None: distributions.append(UNIT_UNIFORM_DIST) continue - center, confidence = center_confidence - if name in categoricals: - if domain.cardinality is None: - raise ValueError( - f"{name} is not a finite domain and cannot be used as a" - " categorical. Please remove it from the categoricals list." - ) - - if not isinstance(center, int): - raise ValueError( - f"{name} is a categorical domain and should have an integer" - f" center. Got {center} of type {type(center)}." 
- ) - - remaining_weight = 1 - confidence - distributed_weight = remaining_weight / (domain.cardinality - 1) + center, conf = center_conf + assert 0 <= conf <= 1 + + # If categorical, treat it as a weighted distribution over integers + if domain.is_categorical: + domain_as_ints = domain.as_integer_domain() + assert domain_as_ints.cardinality is not None + + weight_for_choice = conf + remaining_weight = 1 - weight_for_choice + + distributed_weight = remaining_weight / (domain_as_ints.cardinality - 1) weights = torch.full( - (domain.cardinality,), + (domain_as_ints.cardinality,), distributed_weight, device=device, dtype=torch.float64, ) - - weights[center] = confidence + center_index = domain_as_ints.cast_one(center, frm=domain) + weights[int(center_index)] = conf dist = TorchDistributionWithDomain( distribution=torch.distributions.Categorical( @@ -252,11 +232,9 @@ def make_centered( distributions.append(dist) continue - # We place a truncnorm over a unitnorm - unit_center = domain.to_unit( - torch.tensor(center, device=device, dtype=torch.float64) - ) - scale = torch.tensor(1 - confidence, device=device, dtype=torch.float64) + # Otherwise, we use a continuous truncnorm + unit_center = domain.to_unit_one(center) + scale = torch.tensor(1 - conf, device=device, dtype=torch.float64) a = torch.tensor(0.0, device=device, dtype=torch.float64) b = torch.tensor(1.0, device=device, dtype=torch.float64) dist = TorchDistributionWithDomain( diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index c5c76b8e..dcf369a9 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -6,7 +6,7 @@ from __future__ import annotations -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field from functools import reduce from typing import TYPE_CHECKING, Protocol @@ -18,7 +18,7 @@ from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain if TYPE_CHECKING: - from neps.sampling.priors import UniformPrior + from neps.sampling.priors import CenteredPrior, UniformPrior class Sampler(Protocol): @@ -95,6 +95,19 @@ def borders(cls, ndim: int) -> BorderSampler: """ return BorderSampler(ndim=ndim) + @classmethod + def centered( + cls, + domains: list[Domain], + centers: Iterable[None | tuple[int | float, float]], + *, + device: torch.device | None = None, + ) -> CenteredPrior: + """See [`Prior.make_centered`][neps.sampling.priors.Prior.make_centered].""" + from neps.sampling.priors import Prior + + return Prior.make_centered(domains=domains, centers=centers, device=device) + # Technically this could be a prior with a uniform distribution @dataclass diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 3ef203f1..cc360a1c 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -89,9 +89,19 @@ class Domain(Generic[V]): value. """ + is_categorical: bool = False + """Whether the domain is representing a categorical. + + The domain does not use this information directly, but it can be useful for external + classes that consume Domain objects. This can only be set to `True` if the + `cardinality` of the domain is finite, i.e. `bins` is not `None` OR `round` + is `True` or the boundaries are both integers. 
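+ For example, `Domain.indices(3, is_categorical=True)` is a valid categorical
+ domain: its three choices are encoded as the integers `0`, `1` and `2`, so its
+ cardinality is finite.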
+ """ + is_unit_float: bool = field(init=False, repr=False) - midpoint: V = field(init=False, repr=False) + is_int: bool = field(init=False, repr=False) is_log: bool = field(init=False, repr=False) + midpoint: V = field(init=False, repr=False) length: V = field(init=False, repr=False) cardinality: int | None = field(init=False, repr=False) bounds: tuple[V, V] = field(init=False, repr=False) @@ -100,6 +110,7 @@ class Domain(Generic[V]): def __post_init__(self): assert isinstance(self.lower, type(self.upper)) is_int = isinstance(self.lower, int) + object.__setattr__(self, "is_int", is_int) object.__setattr__(self, "is_log", self.log_bounds is not None) object.__setattr__( self, @@ -114,6 +125,12 @@ def __post_init__(self): cardinality = int(self.upper - self.lower + 1) else: cardinality = None + if self.is_categorical: + raise ValueError( + "Categorical domain must have finite cardinality but" + " `bins` is `None` and `round` is `False` and" + " boundaries are not integers." + ) preferred_dtype = torch.int64 if is_int else torch.float64 object.__setattr__(self, "preffered_dtype", preferred_dtype) @@ -134,6 +151,7 @@ def float( *, log: bool = False, bins: int | None = None, + is_categorical: bool = False, ) -> Domain[float]: """Create a domain for a range of float values. @@ -142,6 +160,7 @@ def float( upper: The upper bound of the domain. log: Whether the domain is in log space. bins: The number of discrete bins to split the domain into. + is_categorical: Whether the domain is representing a categorical. Returns: A domain for a range of float values. @@ -152,6 +171,7 @@ def float( log_bounds=(math.log(lower), math.log(upper)) if log else None, bins=bins, round=False, + is_categorical=is_categorical, ) @classmethod @@ -162,6 +182,7 @@ def int( *, log: bool = False, bins: int | None = None, + is_categorical: bool = False, ) -> Domain[int]: """Create a domain for a range of integer values. @@ -170,6 +191,7 @@ def int( upper: The upper bound of the domain. log: Whether the domain is in log space. bins: The number of discrete bins to split the domain into. + is_categorical: Whether the domain is representing a categorical. Returns: A domain for a range of integer values. @@ -180,19 +202,11 @@ def int( log_bounds=(math.log(lower), math.log(upper)) if log else None, round=True, bins=bins, + is_categorical=is_categorical, ) - def next_value(self, x: Tensor) -> Tensor: - """Get the next value for a tensor of values.""" - if self.cardinality is None: - raise ValueError("Domain is non-finite, cannot get next value.") - cardinality_domain = Domain.indices(self.cardinality) - current_step = cardinality_domain.cast(x, frm=self) - bounded_next_step = (current_step + 1).clamp_max(self.cardinality - 1) - return self.cast(bounded_next_step, frm=cardinality_domain) - @classmethod - def indices(cls, n: int) -> Domain[int]: + def indices(cls, n: int, *, is_categorical: bool = False) -> Domain[int]: """Create a domain for a range of indices. Like range based functions this domain is inclusive of the lower bound @@ -200,11 +214,21 @@ def indices(cls, n: int) -> Domain[int]: Args: n: The number of indices. + is_categorical: Whether the domain is representing a categorical. Returns: A domain for a range of indices. 
""" - return Domain.int(0, n - 1) + return Domain.int(0, n - 1, is_categorical=is_categorical) + + def next_value(self, x: Tensor) -> Tensor: + """Get the next value for a tensor of values.""" + if self.cardinality is None: + raise ValueError("Domain is non-finite, cannot get next value.") + cardinality_domain = Domain.indices(self.cardinality) + current_step = cardinality_domain.cast(x, frm=self) + bounded_next_step = (current_step + 1).clamp_max(self.cardinality - 1) + return self.cast(bounded_next_step, frm=cardinality_domain) def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """Transform a tensor of values from this domain to the unit interval [0, 1]. @@ -416,5 +440,20 @@ def to_unit_one(self, x: float | int) -> float: """ return self.to_unit(torch.tensor(x)).item() + def as_integer_domain(self) -> Domain: + """Get the integer version of this domain. + + !!! warning + + This is only possible if this domain has a finite cardinality + """ + if self.cardinality is None: + raise ValueError( + "Cannot get integer representation of this domain as its" + " cardinality is non-finite." + ) + + return Domain.indices(self.cardinality, is_categorical=self.is_categorical) + UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index d47c5363..01c37720 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -2,21 +2,12 @@ from collections.abc import Mapping, Sequence from dataclasses import dataclass, field -from typing import ( - TYPE_CHECKING, - Any, - Generic, - TypeAlias, - TypeVar, -) +from typing import TYPE_CHECKING, Any, Generic, TypeAlias, TypeVar from typing_extensions import Protocol, override import torch -from neps.search_spaces.domain import ( - UNIT_FLOAT_DOMAIN, - Domain, -) +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain from neps.search_spaces.hyperparameters.categorical import CategoricalParameter from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter @@ -58,7 +49,7 @@ class CategoricalToIntegerTransformer(TensorTransformer): def __post_init__(self): assert len(self.choices) > 0 - self.domain = Domain.indices(len(self.choices)) + self.domain = Domain.indices(len(self.choices), categorical=True) self._lookup = None if len(self.choices) > 3: try: @@ -104,6 +95,7 @@ class CategoricalToUnitNorm(TensorTransformer): _integer_transformer: CategoricalToIntegerTransformer = field(init=False) def __post_init__(self): + self._domain = Domain.float(0.0, 1.0, bins=len(self.choices), categorical=True) self._integer_transformer = CategoricalToIntegerTransformer(self.choices) @override @@ -119,12 +111,14 @@ def encode( x, dtype=dtype if dtype is not None else torch.float64, device=device, - out=out, + ) + binned_floats = self.domain.cast( + integers, frm=self._integer_transformer.domain, dtype=dtype ) if out is not None: - return integers.div_(len(self.choices) - 1) + return out.copy_(binned_floats) - return integers / (len(self.choices) - 1) + return binned_floats @override def decode(self, x: torch.Tensor) -> list[Any]: @@ -191,7 +185,7 @@ def __post_init__(self): n_numerical = 0 n_categorical = 0 for _, transformer in transformers: - if isinstance(transformer, CategoricalToIntegerTransformer): + if transformer.domain.is_categorical: n_categorical += 1 else: n_numerical += 1 @@ -223,9 +217,11 @@ def encode( x: Sequence[Mapping[str, Any]], *, device: torch.device | None = 
None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: + dtype = torch.float64 if dtype is None else dtype width = len(self.transformers) - buffer = torch.empty((len(x), width), dtype=torch.float64, device=device) + buffer = torch.empty((len(x), width), dtype=dtype, device=device) for hp_name, transformer in self.transformers.items(): values = [conf[hp_name] for conf in x] @@ -235,7 +231,7 @@ def encode( transformer.encode( values, out=buffer[:, lookup], - dtype=torch.float64, + dtype=dtype, device=device, ) @@ -278,12 +274,3 @@ def default( raise ValueError(f"Unsupported parameter type: {type(hp)}") return ConfigEncoder(transformers) - - -@dataclass -class EncodedPending: - """Tensor data of pending configurations.""" - - ids: torch.Tensor - x: torch.Tensor - fid: torch.Tensor | None From 07eb8f2c347f3bc645804fc45ebf9e2e2608f66a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 2 Oct 2024 19:12:21 +0200 Subject: [PATCH 56/63] test: Fixups --- .../doc_yamls/customizing_neps_optimizer.yaml | 5 - docs/doc_yamls/loading_own_optimizer.yaml | 2 - docs/doc_yamls/set_up_optimizer.yaml | 8 +- neps/api.py | 8 +- .../bayesian_optimization/models/ftpfn.py | 5 - .../bayesian_optimization/models/gp.py | 22 +- .../bayesian_optimization/optimizer.py | 11 +- .../bayesian_optimization.yaml | 3 +- neps/optimizers/default_searchers/pibo.yaml | 3 +- neps/optimizers/intial_design.py | 1 - neps/optimizers/multi_fidelity/hyperband.py | 22 +- neps/optimizers/multi_fidelity/ifbo.py | 7 +- .../multi_fidelity/sampling_policy.py | 24 +- .../multi_fidelity_prior/async_priorband.py | 8 +- .../multi_fidelity_prior/priorband.py | 14 +- neps/sampling/distributions.py | 32 +- neps/sampling/priors.py | 8 +- neps/sampling/samplers.py | 5 +- neps/search_spaces/domain.py | 10 +- neps/search_spaces/encoding.py | 167 +++++++++-- neps/search_spaces/hyperparameters/float.py | 2 +- neps/search_spaces/hyperparameters/integer.py | 2 +- .../hyperparameters/numerical.py | 3 +- neps/search_spaces/neighborhoods.py | 281 ------------------ neps/state/optimizer.py | 1 + neps/utils/common.py | 5 - neps/utils/run_args.py | 6 +- neps_examples/basic_usage/hyperparameters.py | 40 +-- .../solution_yamls/bo_neps_decided.yaml | 11 +- .../solution_yamls/pibo_neps_decided.yaml | 14 +- .../solution_yamls/user_yaml_bo.yaml | 10 +- tests/test_neps_api/test_api.py | 107 ++----- .../testing_yaml/optimizer_test.yaml | 10 +- .../run_args_optimizer_outside.yaml | 4 - tests/test_settings/test_settings.py | 40 ++- tests/test_state/test_neps_state.py | 4 + .../run_args_optional_loading_format.yaml | 1 - .../customizing_neps_optimizer.yaml | 9 +- .../loading_own_optimizer.yaml | 2 - .../set_up_optimizer.yaml | 8 +- .../optimizer_yamls/select_bo_run_args.yaml | 11 +- 41 files changed, 309 insertions(+), 627 deletions(-) delete mode 100644 neps/search_spaces/neighborhoods.py diff --git a/docs/doc_yamls/customizing_neps_optimizer.yaml b/docs/doc_yamls/customizing_neps_optimizer.yaml index a176dc74..93596fc8 100644 --- a/docs/doc_yamls/customizing_neps_optimizer.yaml +++ b/docs/doc_yamls/customizing_neps_optimizer.yaml @@ -19,8 +19,3 @@ searcher: name: "my_bayesian" # optional; changing the searcher_name for better recognition # Specific arguments depending on the searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI - acquisition_sampler: random - random_interleave_prob: 0.1 - diff --git a/docs/doc_yamls/loading_own_optimizer.yaml b/docs/doc_yamls/loading_own_optimizer.yaml index b23cd082..7a26a123 100644 --- 
a/docs/doc_yamls/loading_own_optimizer.yaml +++ b/docs/doc_yamls/loading_own_optimizer.yaml @@ -19,5 +19,3 @@ searcher: name: CustomOptimizer # class name within the file # Specific arguments depending on your searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI diff --git a/docs/doc_yamls/set_up_optimizer.yaml b/docs/doc_yamls/set_up_optimizer.yaml index f65af743..90b52671 100644 --- a/docs/doc_yamls/set_up_optimizer.yaml +++ b/docs/doc_yamls/set_up_optimizer.yaml @@ -1,11 +1,5 @@ strategy: bayesian_optimization # Specific arguments depending on the searcher initial_design_size: 7 -surrogate_model: gp -acquisition: EI -log_prior_weighted: false -acquisition_sampler: random -random_interleave_prob: 0.1 -disable_priors: false -prior_confidence: high +use_priors: true sample_default_first: false diff --git a/neps/api.py b/neps/api.py index 4f81b0cf..d1a1bd2a 100644 --- a/neps/api.py +++ b/neps/api.py @@ -1,7 +1,5 @@ """API for the neps package.""" - - import inspect import logging import warnings @@ -31,11 +29,7 @@ def run( run_pipeline: Callable | None = Default(None), root_directory: str | Path | None = Default(None), pipeline_space: ( - dict[str, Parameter | CS.ConfigurationSpace] - | str - | Path - | CS.ConfigurationSpace - | None + dict[str, Parameter] | str | Path | CS.ConfigurationSpace | None ) = Default(None), run_args: str | Path | None = Default(None), overwrite_working_directory: bool = Default(False), diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 4df2dfb8..a6fe8b1e 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -208,7 +208,6 @@ def acquire_next_from_ftpfn( continuation_samples: torch.Tensor, encoder: ConfigEncoder, budget_domain: Domain, - fidelity_domain: Domain, initial_samplers: list[tuple[Sampler, int]], local_search_sample_size: int = 128, local_search_confidence: float = 0.95, # [0, 1] @@ -224,10 +223,6 @@ def acquire_next_from_ftpfn( # 2. 
Remove configs that have been fully evaluated acq_existing = acq_existing[acq_existing[:, 1] < budget_domain.upper] if len(acq_existing) != 0: - # We keep a copy of the original budgets incase they get modified - # so we can return the fidelity of the sample that had the best acquisition score - budgets_prior_to_acq = acq_existing[:, 1].clone().detach() - # Get the best configuration for continuation acq_scores = acq_function(acq_existing) best_ix = acq_scores.argmax() diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 96d6b7e0..8b22a513 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -134,7 +134,7 @@ def optimize_acq( *, n_candidates_required: int = 1, num_restarts: int = 20, - n_intial_start_points: int | None = None, + n_intial_start_points: int = 256, acq_options: Mapping[str, Any] | None = None, maximum_allowed_categorical_combinations: int = 30, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -146,9 +146,7 @@ def optimize_acq( bounds = torch.tensor([lower, upper], dtype=torch.float64) cat_transformers = { - name: t - for name, t in encoder.transformers.items() - if isinstance(t, CategoricalToIntegerTransformer) + name: t for name, t in encoder.transformers.items() if t.domain.is_categorical } if not any(cat_transformers): # Small heuristic to increase the number of candidates as our dimensionality @@ -172,7 +170,9 @@ def optimize_acq( # We need to generate the product of all possible combinations of categoricals, # first we do a sanity check n_combos = reduce( - lambda x, y: x * y, [len(t.choices) for t in cat_transformers.values()] + lambda x, y: x * y, # type: ignore + [t.domain.cardinality for t in cat_transformers.values()], + 1, ) if n_combos > maximum_allowed_categorical_combinations: raise ValueError( @@ -187,7 +187,10 @@ def optimize_acq( # First, just collect the possible values per cat column # NOTE: Botorchs optim requires them to be as floats cats: dict[int, list[float]] = { - encoder.index_of[name]: [float(i) for i in range(len(transformer.choices))] + encoder.index_of[name]: [ + float(i) + for i in range(len(transformer.domain.cardinality)) # type: ignore + ] for name, transformer in cat_transformers.items() } @@ -228,7 +231,10 @@ def encode_trials_for_gp( pending_configs: list[Mapping[str, Any]] = [] if encoder is None: - encoder = ConfigEncoder.default({**space.numerical, **space.categoricals}) + encoder = ConfigEncoder.default( + {**space.numerical, **space.categoricals}, + constants=space.constants, + ) for trial in trials.values(): if trial.report is None: @@ -272,7 +278,7 @@ def fit_and_acquire_from_gp( seed: int | None = None, n_candidates_required: int | None = None, num_restarts: int = 20, - n_initial_start_points: int | None = None, + n_initial_start_points: int = 256, maximum_allowed_categorical_combinations: int = 30, acq_options: Mapping[str, Any] | None = None, ) -> torch.Tensor: diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index a0137171..f0d2addd 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -106,13 +106,22 @@ def __init__( # noqa: D417 """ if any(pipeline_space.graphs): raise NotImplementedError("Only supports flat search spaces for now!") + if any(pipeline_space.fidelities): + raise ValueError( + "Fidelities are not supported for BayesianOptimization." 
+ " Please consider setting the fidelity to a constant value." + f" Got: {pipeline_space.fidelities}" + ) + super().__init__(pipeline_space=pipeline_space) params: dict[str, CategoricalParameter | FloatParameter | IntegerParameter] = { **pipeline_space.numerical, **pipeline_space.categoricals, } - self.encoder = encoder or ConfigEncoder.default(params) + self.encoder = encoder or ConfigEncoder.default( + params, constants=pipeline_space.constants + ) self.prior = ( Prior.from_parameters(params.values()) if use_priors is True else None ) diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml index c3525cc4..2c34a3a3 100644 --- a/neps/optimizers/default_searchers/bayesian_optimization.yaml +++ b/neps/optimizers/default_searchers/bayesian_optimization.yaml @@ -2,5 +2,6 @@ strategy: bayesian_optimization # Arguments that can be modified by the user initial_design_size: null # Defaults to depending on number or hyperparameters use_cost: false # Whether to factor in cost when selecting new configurations -sample_default_first: # Whether to sample the default configuration first +use_priors: false # Whether to use user set priors in optimization +sample_default_first: false # Whether to sample the default configuration first device: null # Device to load the gaussian process model on with torch diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml index 36bff8b2..cac0e8f8 100644 --- a/neps/optimizers/default_searchers/pibo.yaml +++ b/neps/optimizers/default_searchers/pibo.yaml @@ -2,5 +2,6 @@ strategy: pibo # Arguments that can be modified by the user initial_design_size: null # Defaults to depending on number or hyperparameters use_cost: false # Whether to factor in cost when selecting new configurations -sample_default_first: # Whether to sample the default configuration first +use_priors: true # Whether to use user set priors in optimization +sample_default_first: true # Whether to sample the default configuration first device: null # Device to load the gaussian process model on with torch diff --git a/neps/optimizers/intial_design.py b/neps/optimizers/intial_design.py index a2159eb0..dcfdbee3 100644 --- a/neps/optimizers/intial_design.py +++ b/neps/optimizers/intial_design.py @@ -105,7 +105,6 @@ def make_initial_design( "The sample size should be a positive integer if passing an int." 
) - print("sample", sample_size, ndims) if sample_size is not None: match sampler: case "sobol": diff --git a/neps/optimizers/multi_fidelity/hyperband.py b/neps/optimizers/multi_fidelity/hyperband.py index f6c445ac..9319c50c 100644 --- a/neps/optimizers/multi_fidelity/hyperband.py +++ b/neps/optimizers/multi_fidelity/hyperband.py @@ -23,6 +23,7 @@ SuccessiveHalving, SuccessiveHalvingBase, ) +from neps.sampling.priors import Prior if typing.TYPE_CHECKING: from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( @@ -514,15 +515,6 @@ def __init__( self.pipeline_space.has_prior = self.use_priors - bo_args = { - "surrogate_model": surrogate_model, - "domain_se_kernel": domain_se_kernel, - "hp_kernels": hp_kernels, - "surrogate_model_args": surrogate_model_args, - "acquisition": acquisition, - "log_prior_weighted": log_prior_weighted, - "acquisition_sampler": acquisition_sampler, - } # counting non-fidelity dimensions in search space ndims = sum( 1 @@ -531,7 +523,17 @@ def __init__( ) n_min = ndims + 1 self.init_size = n_min + 1 # in BOHB: init_design >= N_min + 2 - self.model_policy = model_policy(pipeline_space, **bo_args) + + if self.use_priors: + parameters = { + **self.pipeline_space.numerical, + **self.pipeline_space.categoricals, + } + prior = Prior.from_parameters(parameters.values()) + else: + prior = None + + self.model_policy = model_policy(pipeline_space, prior=prior) for _, sh in self.sh_brackets.items(): sh.model_policy = self.model_policy diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 9c72ae46..bd0014a2 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -139,6 +139,7 @@ def __init__( self._prior = Prior.from_parameters(params.values()) if use_priors else None self._config_encoder: ConfigEncoder = ConfigEncoder.default( params, + constants=self.pipeline_space.constants, # FTPFN doesn't support categoricals and we were recomenned to just evenly distribute # in the unit norm custom_transformers={ @@ -154,7 +155,7 @@ def __init__( self._fid_domain = space.fidelity.domain # Domain in which we should pass budgets to ifbo model - self._budget_domain = Domain.float(1 / self._max_budget, 1) + self._budget_domain = Domain.floating(1 / self._max_budget, 1) # Domain from which we assign an index to each budget self._budget_ix_domain = Domain.indices(fid_bins) @@ -185,7 +186,9 @@ def ask( ) if new_id < len(self._initial_design): - return SampledConfig(id=f"{new_id}_0", config=self._initial_design[new_id]) + config = self._initial_design[new_id] + config[self._fidelity_name] = self._min_budget + return SampledConfig(id=f"{new_id}_0", config=config) # Otherwise, we proceed to surrogate phase ftpfn = FTPFNSurrogate( diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index bc35f300..784e067b 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -3,7 +3,7 @@ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Mapping +from typing import TYPE_CHECKING from botorch.acquisition import ( AcquisitionFunction, @@ -17,16 +17,9 @@ import pandas as pd import torch -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( pibo_acquisition, ) -from 
neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( - DecayingPriorWeightedAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, -) from neps.optimizers.bayesian_optimization.models.gp import make_default_single_obj_gp from neps.optimizers.multi_fidelity_prior.utils import ( compute_config_dist, @@ -36,15 +29,8 @@ ) from neps.sampling.priors import Prior from neps.search_spaces.encoding import ConfigEncoder -from neps.utils.common import instance_from_map if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) from neps.search_spaces.search_space import SearchSpace TOLERANCE = 1e-2 # 1% @@ -291,8 +277,6 @@ def __init__( use_cost: bool = False, device: torch.device | None = None, ): - if prior: - raise NotImplementedError("Priors are not implemented yet.") if use_cost: raise NotImplementedError("Cost is not implemented yet.") @@ -300,7 +284,8 @@ def __init__( self.device = device self.prior = prior self._encoder = ConfigEncoder.default( - {**pipeline_space.numerical, **pipeline_space.categoricals} + {**pipeline_space.numerical, **pipeline_space.categoricals}, + constants=pipeline_space.constants, ) self._model: SingleTaskGP | None = None self._acq: AcquisitionFunction | None = None @@ -316,6 +301,8 @@ def update_model( x_pending = self._encoder.encode([config.hp_values() for config in pending_x]) y_train = torch.tensor(train_y, dtype=torch.float64, device=self.device) + # TODO: Most of this just copies BO and the duplication can be replaced + # once we don't have the two stage `update_model()` and `sample()` y_model = make_default_single_obj_gp(x_train, y_train, encoder=self._encoder) fit_gpytorch_mll( @@ -344,7 +331,6 @@ def update_model( prior=self.prior, prior_exponent=pibo_exp_term, x_domain=self._encoder.domains, - x_pending=x_pending, ) self._y_model = y_model diff --git a/neps/optimizers/multi_fidelity_prior/async_priorband.py b/neps/optimizers/multi_fidelity_prior/async_priorband.py index 0a859dec..a4963dce 100644 --- a/neps/optimizers/multi_fidelity_prior/async_priorband.py +++ b/neps/optimizers/multi_fidelity_prior/async_priorband.py @@ -13,6 +13,7 @@ AsynchronousSuccessiveHalvingWithPriors, ) from neps.optimizers.multi_fidelity_prior.priorband import PriorBandBase +from neps.sampling.priors import Prior if typing.TYPE_CHECKING: from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( @@ -121,7 +122,12 @@ def __init__( self.init_size = n_min + 1 # in BOHB: init_design >= N_dim + 2 if self.modelling_type == "joint" and self.initial_design_size is not None: self.init_size = self.initial_design_size - self.model_policy = model_policy(pipeline_space, **bo_args) + + parameters = {**self.pipeline_space.numerical, **self.pipeline_space.categoricals} + self.model_policy = model_policy( + pipeline_space, + prior=Prior.from_parameters(parameters.values()), + ) def get_config_and_ids( self, diff --git a/neps/optimizers/multi_fidelity_prior/priorband.py b/neps/optimizers/multi_fidelity_prior/priorband.py index f4bc067b..6b2f84be 100644 --- a/neps/optimizers/multi_fidelity_prior/priorband.py +++ b/neps/optimizers/multi_fidelity_prior/priorband.py @@ -15,6 +15,7 @@ compute_scores, get_prior_weight_for_decay, ) +from neps.sampling.priors import Prior if typing.TYPE_CHECKING: 
from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( @@ -335,13 +336,6 @@ def __init__( }, } - bo_args = { - "surrogate_model": surrogate_model, - "surrogate_model_args": surrogate_model_args, - "acquisition": acquisition, - "log_prior_weighted": log_prior_weighted, - "acquisition_sampler": acquisition_sampler, - } self.model_based = model_based self.modelling_type = modelling_type self.initial_design_size = initial_design_size @@ -355,7 +349,11 @@ def __init__( self.init_size = n_min + 1 # in BOHB: init_design >= N_min + 2 if self.modelling_type == "joint" and self.initial_design_size is not None: self.init_size = self.initial_design_size - self.model_policy = model_policy(pipeline_space, **bo_args) + parameters = {**self.pipeline_space.numerical, **self.pipeline_space.categoricals} + self.model_policy = model_policy( + pipeline_space, + prior=Prior.from_parameters(parameters.values()), + ) for _, sh in self.sh_brackets.items(): sh.sampling_policy = self.sampling_policy diff --git a/neps/sampling/distributions.py b/neps/sampling/distributions.py index f865d173..946bca77 100644 --- a/neps/sampling/distributions.py +++ b/neps/sampling/distributions.py @@ -98,22 +98,22 @@ def __init__( ) self._entropy = CONST_LOG_SQRT_2PI_E + self._log_Z - 0.5 * self._lpbb_m_lpaa_d_Z - @constraints.dependent_property - @override + @constraints.dependent_property # type: ignore + @override # type: ignore def support(self) -> constraints._Interval: return constraints.interval(self.a, self.b) @property - @override + @override # type: ignore def mean(self) -> torch.Tensor: return self._mean @property - @override + @override # type: ignore def variance(self) -> torch.Tensor: return self._variance - @override + @override # type: ignore def entropy(self) -> torch.Tensor: return self._entropy @@ -129,25 +129,25 @@ def _big_phi(self, x: torch.Tensor) -> torch.Tensor: def _inv_big_phi(x: torch.Tensor) -> torch.Tensor: return CONST_SQRT_2 * (2 * x - 1).erfinv() - @override + @override # type: ignore def cdf(self, value: torch.Tensor) -> torch.Tensor: if self._validate_args: self._validate_sample(value) return ((self._big_phi(value) - self._big_phi_a) / self._Z).clamp(0, 1) - @override + @override # type: ignore def icdf(self, value: torch.Tensor) -> torch.Tensor: y = self._big_phi_a + value * self._Z y = y.clamp(self.eps, 1 - self.eps) return self._inv_big_phi(y) - @override + @override # type: ignore def log_prob(self, value: torch.Tensor) -> torch.Tensor: if self._validate_args: self._validate_sample(value) return CONST_LOG_INV_SQRT_2PI - self._log_Z - (value**2) * 0.5 - @override + @override # type: ignore def rsample(self, sample_shape: torch.Size | None = None) -> torch.Tensor: if sample_shape is None: sample_shape = torch.Size([]) @@ -199,18 +199,18 @@ def __init__( self._variance = self._variance * self.scale**2 self._entropy += self._log_scale - def _to_std_rv(self, value): + def _to_std_rv(self, value: torch.Tensor) -> torch.Tensor: return (value - self.loc) / self.scale - def _from_std_rv(self, value): + def _from_std_rv(self, value: torch.Tensor) -> torch.Tensor: return value * self.scale + self.loc @override - def cdf(self, value): + def cdf(self, value: torch.Tensor) -> torch.Tensor: return super().cdf(self._to_std_rv(value)) @override - def icdf(self, value): + def icdf(self, value: torch.Tensor) -> torch.Tensor: sample = self._from_std_rv(super().icdf(value)) # clamp data but keep gradients @@ -224,7 +224,7 @@ def icdf(self, value): return sample @override - def 
log_prob(self, value): + def log_prob(self, value: torch.Tensor) -> torch.Tensor: value = self._to_std_rv(value) return super().log_prob(value) - self._log_scale @@ -240,7 +240,7 @@ class UniformWithUpperBound(Uniform): # OPTIM: This could probably be optimized a lot but I'm not sure how it effects # gradients. Could probably do a different path depending on if `value` requires # gradients or not. - @override + @override # type: ignore def log_prob(self, value: torch.Tensor) -> torch.Tensor: if self._validate_args: self._validate_sample(value) @@ -252,6 +252,8 @@ def log_prob(self, value: torch.Tensor) -> torch.Tensor: @dataclass class TorchDistributionWithDomain: + """A torch distribution with an associated domain it samples over.""" + distribution: Distribution domain: Domain diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 4a77ca79..e17db04b 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -289,7 +289,7 @@ class CenteredPrior(Prior): _meaningful_doms: list[Domain] = field(init=False) _meaningful_dists: list[Distribution] = field(init=False) - def __post_init__(self): + def __post_init__(self) -> None: self._distribution_domains = [dist.domain for dist in self.distributions] rest: list[tuple[int, Domain, Distribution]] = [] @@ -303,8 +303,8 @@ def __post_init__(self): self._meaningful_dists = [] return - self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip( - *rest, strict=False + self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip( # type: ignore + *rest, strict=True ) @property @@ -429,7 +429,7 @@ class WeightedPrior(Prior): _weighted_sampler: WeightedSampler = field(init=False, repr=False) - def __post_init__(self): + def __post_init__(self) -> None: from neps.sampling.samplers import WeightedSampler self._weighted_sampler = WeightedSampler( diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index dcf369a9..cf1c1e7a 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -49,6 +49,7 @@ def sample( domain. If a list of domains, then it must have the same length as the number of columns, with each column being in the corresponding domain. seed: The seed generator + dtype: The dtype of the output tensor. device: The device to cast the samples to. Returns: @@ -120,7 +121,7 @@ class Sobol(Sampler): scramble: bool = True """Whether to scramble the Sobol sequence.""" - def __post_init__(self): + def __post_init__(self) -> None: if self.ndim < 1: raise ValueError( "The number of dimensions must be at least 1." @@ -181,7 +182,7 @@ class WeightedSampler(Sampler): probabilities: torch.Tensor = field(init=False, repr=False) """The probabilities for each sampler. Normalized weights.""" - def __post_init__(self): + def __post_init__(self) -> None: if len(self.samplers) < 2: raise ValueError( f"At least two samplers must be given. 
Got {len(self.samplers)}" diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index cc360a1c..6c6c1b75 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -107,7 +107,7 @@ class Domain(Generic[V]): bounds: tuple[V, V] = field(init=False, repr=False) preffered_dtype: torch.dtype = field(init=False, repr=False) - def __post_init__(self): + def __post_init__(self) -> None: assert isinstance(self.lower, type(self.upper)) is_int = isinstance(self.lower, int) object.__setattr__(self, "is_int", is_int) @@ -144,7 +144,7 @@ def __post_init__(self): object.__setattr__(self, "bounds", (self.lower, self.upper)) @classmethod - def float( + def floating( cls, lower: Number, upper: Number, @@ -175,7 +175,7 @@ def float( ) @classmethod - def int( + def integer( cls, lower: Number, upper: Number, @@ -219,7 +219,7 @@ def indices(cls, n: int, *, is_categorical: bool = False) -> Domain[int]: Returns: A domain for a range of indices. """ - return Domain.int(0, n - 1, is_categorical=is_categorical) + return Domain.integer(0, n - 1, is_categorical=is_categorical) def next_value(self, x: Tensor) -> Tensor: """Get the next value for a tensor of values.""" @@ -456,4 +456,4 @@ def as_integer_domain(self) -> Domain: return Domain.indices(self.cardinality, is_categorical=self.is_categorical) -UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) +UNIT_FLOAT_DOMAIN = Domain.floating(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 01c37720..c20f60b3 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,3 +1,11 @@ +"""Encoding of hyperparameter configurations into tensors. + +For the most part, you can just use +[`ConfigEncoder.default()`][neps.search_spaces.encoding.ConfigEncoder.default] +to create an encoder over a list of hyperparameters, along with any constants you +want to include when decoding configurations. +""" + from __future__ import annotations from collections.abc import Mapping, Sequence @@ -17,16 +25,11 @@ WLInput: TypeAlias = tuple[dict, dict | None, dict | None] V = TypeVar("V", int, float) -T = TypeVar("T") - -class Transformer(Protocol[T]): - def encode(self, x: Sequence[Any]) -> T: ... - def decode(self, x: T) -> list[Any]: ... +class TensorTransformer(Protocol): + """A protocol for encoding and decoding hyperparameter values into tensors.""" - -class TensorTransformer(Transformer[torch.Tensor], Protocol): domain: Domain def encode( @@ -36,20 +39,45 @@ def encode( out: torch.Tensor | None = None, dtype: torch.dtype | None = None, device: torch.device | None = None, - ) -> torch.Tensor: ... + ) -> torch.Tensor: + """Encode a sequence of hyperparameter values into a tensor. + + Args: + x: A sequence of hyperparameter values. + out: An optional tensor to write the encoded values to. + dtype: The dtype of the tensor. + device: The device of the tensor. + + Returns: + The encoded tensor. + """ + ... + + def decode(self, x: torch.Tensor) -> list[Any]: + """Decode a tensor of hyperparameter values into a sequence of values. + + Args: + x: A tensor of hyperparameter values. + + Returns: + A sequence of hyperparameter values. + """ + ... 
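To make the `TensorTransformer` protocol above concrete, here is a minimal sketch of a custom transformer of the kind that could be handed to `ConfigEncoder.default(...)` through its `custom_transformers=` argument. It is illustrative only: the `BoolTransformer` name, the `"flag"` parameter used below, and the 0/1 encoding are assumptions made for the example, not part of this patch.

from collections.abc import Sequence
from dataclasses import dataclass, field
from typing import Any

import torch

from neps.search_spaces.domain import Domain


@dataclass
class BoolTransformer:  # hypothetical example transformer, not included in this patch
    """Encode booleans as 0/1 tensors, matching the TensorTransformer protocol."""

    domain: Domain = field(default_factory=lambda: Domain.integer(0, 1))

    def encode(
        self,
        x: Sequence[Any],
        *,
        out: torch.Tensor | None = None,
        dtype: torch.dtype | None = None,
        device: torch.device | None = None,
    ) -> torch.Tensor:
        # Map truthy/falsy values to 1.0/0.0 in the requested dtype and device.
        values = torch.tensor(
            [1.0 if v else 0.0 for v in x],
            dtype=dtype if dtype is not None else torch.float64,
            device=device,
        )
        if out is not None:
            out.copy_(values)  # write into the caller-provided buffer
            return out
        return values

    def decode(self, x: torch.Tensor) -> list[Any]:
        # Round back to the nearest of {0, 1} and convert to Python bools.
        return [bool(round(float(v))) for v in x]

Used this way, `ConfigEncoder.default(params, custom_transformers={"flag": BoolTransformer()})` would override the default transformer for a single parameter, mirroring how the `ifbo.py` hunk above passes its own `custom_transformers=` mapping when building its encoder.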
@dataclass class CategoricalToIntegerTransformer(TensorTransformer): + """A transformer that encodes categorical values into integers.""" + choices: Sequence[Any] domain: Domain = field(init=False) _lookup: dict[Any, int] | None = field(init=False) - def __post_init__(self): + def __post_init__(self) -> None: assert len(self.choices) > 0 - self.domain = Domain.indices(len(self.choices), categorical=True) + self.domain = Domain.indices(len(self.choices), is_categorical=True) self._lookup = None if len(self.choices) > 3: try: @@ -89,13 +117,23 @@ def decode(self, x: torch.Tensor) -> list[Any]: @dataclass class CategoricalToUnitNorm(TensorTransformer): + """A transformer that encodes categorical values into a unit normalized tensor. + + If there are `n` choices, the tensor will have `n` bins between `0` and `1`. + """ + choices: Sequence[Any] domain: Domain = field(init=False) _integer_transformer: CategoricalToIntegerTransformer = field(init=False) - def __post_init__(self): - self._domain = Domain.float(0.0, 1.0, bins=len(self.choices), categorical=True) + def __post_init__(self) -> None: + self.domain = Domain.floating( + 0.0, + 1.0, + bins=len(self.choices), + is_categorical=True, + ) self._integer_transformer = CategoricalToIntegerTransformer(self.choices) @override @@ -130,21 +168,23 @@ def decode(self, x: torch.Tensor) -> list[Any]: # and `-0.5` as lower bound with `0.5` as upper bound. @dataclass class MinMaxNormalizer(TensorTransformer, Generic[V]): + """A transformer that normalizes values to the unit interval.""" + original_domain: Domain[V] bins: int | None = None domain: Domain[float] = field(init=False) - def __post_init__(self): + def __post_init__(self) -> None: if self.bins is None: self.domain = UNIT_FLOAT_DOMAIN else: - self.domain = Domain.float(0.0, 1.0, bins=self.bins) + self.domain = Domain.floating(0.0, 1.0, bins=self.bins) @override def encode( self, - x: list[V], + x: Sequence[V], *, out: torch.Tensor | None = None, dtype: torch.dtype | None = None, @@ -172,13 +212,41 @@ def decode(self, x: torch.Tensor) -> list[V]: @dataclass class ConfigEncoder: + """An encoder for hyperparameter configurations. + + This class is used to encode and decode hyperparameter configurations into tensors + and back. Its main uses currently are to support surrogate models that require + tensors. + + The primary methods/properties to be aware of are: + * [`default()`][neps.search_spaces.encoding.ConfigEncoder.default]: Create a default + encoder over a list of hyperparameters. Please see the method docs for more + details on how it encodes different types of hyperparameters. + * [`encode()`][neps.search_spaces.encoding.ConfigEncoder.encode]: Encode a list of + configurations into a single tensor using the transforms of the encoder. + * [`decode()`][neps.search_spaces.encoding.ConfigEncoder.decode]: Decode a 2d tensor + with `N` rows into a list of `N` configurations. + * [`domains`][neps.search_spaces.encoding.ConfigEncoder.domains]: The + [`Domain`][neps.search_spaces.domain.Domain] that each hyperparameter is encoded + into. This is useful in combination with classes like + [`Sampler`][neps.sampling.samplers.Sampler], + [`Prior`][neps.sampling.priors.Prior], and + [`TorchDistributionWithDomain`][neps.sampling.distributions.TorchDistributionWithDomain], + which require knowledge of the + domains of each column for the tensor, for example, to sample values directly + into the encoded space, or to get log probabilities of the encoded values. 
+ * [`ncols`][neps.search_spaces.encoding.ConfigEncoder.ncols]: The number of columns + in the encoded tensor, useful for initializing some `Sampler`s. + """ + transformers: dict[str, TensorTransformer] index_of: dict[str, int] = field(init=False) domain_of: dict[str, Domain] = field(init=False) + constants: Mapping[str, Any] = field(default_factory=dict) n_numerical: int = field(init=False) n_categorical: int = field(init=False) - def __post_init__(self): + def __post_init__(self) -> None: transformers = sorted(self.transformers.items(), key=lambda t: t[0]) self.transformers = dict(transformers) @@ -197,21 +265,14 @@ def __post_init__(self): @property def ncols(self) -> int: + """The number of columns in the encoded tensor.""" return len(self.transformers) @property def domains(self) -> list[Domain]: + """The domains of the encoded hyperparameters.""" return list(self.domain_of.values()) - def names(self) -> list[str]: - return list(self.transformers.keys()) - - def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: - if isinstance(hp, str): - return x[:, self.index_of[hp]] - - return x[:, [self.index_of[h] for h in hp]] - def encode( self, x: Sequence[Mapping[str, Any]], @@ -219,6 +280,26 @@ def encode( device: torch.device | None = None, dtype: torch.dtype | None = None, ) -> torch.Tensor: + """Encode a list of hyperparameter configurations into a tensor. + + !!! warning "Constants" + + Constants included in configurations will not be encoded into the tensor, + but are included when decoding. + + !!! warning "Parameters with no transformers" + + Any parameters in the configurations, whose key is not in + `self.transformers`, will be ignored. + + Args: + x: A list of hyperparameter configurations. + device: The device of the tensor. + dtype: The dtype of the tensor. + + Returns: + A tensor of shape `(len(x), ncols)` containing the encoded configurations. + """ dtype = torch.float64 if dtype is None else dtype width = len(self.transformers) buffer = torch.empty((len(x), width), dtype=dtype, device=device) @@ -238,12 +319,25 @@ def encode( return buffer def decode(self, x: torch.Tensor) -> list[dict[str, Any]]: + """Decode a tensor of hyperparameter configurations into a list of configurations. + + Args: + x: A tensor of shape `(N, ncols)` containing the encoded configurations. + + Returns: + A list of `N` configurations, including any constants that were included + when creating the encoder. + """ values: dict[str, list[Any]] = {} + N = len(x) for hp_name, transformer in self.transformers.items(): lookup = self.index_of[hp_name] tensor = x[:, lookup] values[hp_name] = transformer.decode(tensor) + constants = {name: [v] * N for name, v in self.constants.items()} + values.update(constants) + keys = list(values.keys()) return [ dict(zip(keys, vals, strict=False)) @@ -254,9 +348,28 @@ def decode(self, x: torch.Tensor) -> list[dict[str, Any]]: def default( cls, parameters: Mapping[str, Parameter], + constants: Mapping[str, Any] | None = None, *, custom_transformers: dict[str, TensorTransformer] | None = None, ) -> ConfigEncoder: + """Create a default encoder over a list of hyperparameters. + + This method creates a default encoder over a list of hyperparameters. It + automatically creates transformers for each hyperparameter based on its type. + The transformers are as follows: + + * `FloatParameter` and `IntegerParameter` are normalized to the unit interval. + * `CategoricalParameter` is transformed into an integer. 
+ + Args: + parameters: A mapping of hyperparameter names to hyperparameters. + constants: A mapping of constant hyperparameters to include when decoding. + custom_transformers: A mapping of hyperparameter names to custom transformers. + + Returns: + A `ConfigEncoder` instance + """ + constants = constants or {} custom = custom_transformers or {} sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} @@ -267,10 +380,10 @@ def default( match hp: case FloatParameter() | IntegerParameter(): - transformers[name] = MinMaxNormalizer(hp.domain) + transformers[name] = MinMaxNormalizer(hp.domain) # type: ignore case CategoricalParameter(): transformers[name] = CategoricalToIntegerTransformer(hp.choices) case _: raise ValueError(f"Unsupported parameter type: {type(hp)}") - return ConfigEncoder(transformers) + return ConfigEncoder(transformers, constants=constants) diff --git a/neps/search_spaces/hyperparameters/float.py b/neps/search_spaces/hyperparameters/float.py index a0ab83e8..bf7768c4 100644 --- a/neps/search_spaces/hyperparameters/float.py +++ b/neps/search_spaces/hyperparameters/float.py @@ -72,7 +72,7 @@ def __init__( default=float(default) if default is not None else None, default_confidence=default_confidence, is_fidelity=is_fidelity, - domain=Domain.float(lower, upper, log=log), + domain=Domain.floating(lower, upper, log=log), ) @override diff --git a/neps/search_spaces/hyperparameters/integer.py b/neps/search_spaces/hyperparameters/integer.py index 2f534323..fdebcce7 100644 --- a/neps/search_spaces/hyperparameters/integer.py +++ b/neps/search_spaces/hyperparameters/integer.py @@ -77,7 +77,7 @@ def __init__( is_fidelity=is_fidelity, default=int(np.rint(default)) if default is not None else None, default_confidence=default_confidence, - domain=Domain.int(lower, upper, log=log), + domain=Domain.integer(lower, upper, log=log), ) # We subtract/add 0.499999 from lower/upper bounds respectively, such that diff --git a/neps/search_spaces/hyperparameters/numerical.py b/neps/search_spaces/hyperparameters/numerical.py index 2e98de9f..f403feb0 100644 --- a/neps/search_spaces/hyperparameters/numerical.py +++ b/neps/search_spaces/hyperparameters/numerical.py @@ -94,6 +94,7 @@ def __init__( log: Whether the hyperparameter is in log space. default: The default value of the hyperparameter. is_fidelity: Whether the hyperparameter is a fidelity parameter. + domain: The domain of the hyperparameter. default_confidence: The default confidence choice. 
""" super().__init__(value=None, default=default, is_fidelity=is_fidelity) # type: ignore @@ -136,7 +137,7 @@ def __init__( self.lower: T = lower self.upper: T = upper self.log: bool = log - self.domain = domain + self.domain: Domain[T] = domain self.log_value: float | None = None self.log_bounds: tuple[float, float] | None = None self.log_default: float | None = None diff --git a/neps/search_spaces/neighborhoods.py b/neps/search_spaces/neighborhoods.py deleted file mode 100644 index 91c34a6f..00000000 --- a/neps/search_spaces/neighborhoods.py +++ /dev/null @@ -1,281 +0,0 @@ -from __future__ import annotations - -from typing import TypeVar - -import numpy as np - -from neps.search_spaces.domain import Domain -from neps.utils.types import Arr, f64, i64 - -V = TypeVar("V", f64, i64) - -UNIQUE_NEIGHBOR_GENERATOR_N_RETRIES = 8 -UNIQUE_NEIGHBOR_GENERATOR_SAMPLE_MULTIPLIER = 4 - -NON_UNIQUE_NEIGHBORS_N_RETRIES = 8 -NON_UNIQUE_NEIGHBORS_SAMPLE_MULTIPLIER = 4 - -# Small enough but prevents needing to keep re-allocating temporary memory -# 50 * 8 = 400 bytes -_SMALL = 50 -_SMALL_CACHED_ARANGE = np.arange(_SMALL, dtype=i64) - - -def unorded_finite_neighbors( - pivot: V, - domain: Domain[V], - *, - n: int, - seed: np.random.Generator, -) -> Arr[V]: - N = domain.cardinality - assert N is not None, "Domain must be finite." - if N <= _SMALL: - full_range = _SMALL_CACHED_ARANGE[: domain.cardinality] - else: - full_range = np.arange(N, dtype=i64) - - range_domain = Domain.indices(N) - _pivot = range_domain.cast(pivot, frm=domain) - - left = full_range[:_pivot] - right = full_range[_pivot + 1 :] - _range = np.concatenate((left, right)) - - seed.shuffle(_range) - - return domain.cast(_range[:n], frm=range_domain) - - -def neighbors( - pivot: V, - domain: Domain[V], - *, - n: int, - std: float, - seed: np.random.Generator, - n_retries: int = NON_UNIQUE_NEIGHBORS_N_RETRIES, - sample_multiplier: int = NON_UNIQUE_NEIGHBORS_SAMPLE_MULTIPLIER, -) -> Arr[V]: - """Create a neighborhood of `n` neighbors around `pivot` with a normal distribution. - - If you need unique neighbors, you should use - [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood]. - - !!! tip - - [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood] - is quite expensive in certain situations as it has to repeatedly sample and check - for uniqueness. If you can afford duplicates, use this function instead. - - If [`domain.cardinality == None`][neps.search_spaces.domain.Domain.cardinality], - and you can afford an infentesimally small percentage change of duplicates, - you should use this function instead. - - !!! warning - - It is up to the caller to ensure that the pivot lies within the domain, - including at one of the bins if the domain is quantized. - - Args: - pivot: The center of the neighborhood. - domain: The domain to get neighbors from. - n: The number of neighbors to generate. - std: The standard deviation of the normal distribution. - seed: The random seed to use. - n_retries: - The number of retries to attempt to generate unique neighbors. - Each retry increases the standard deviation of the normal distribution to - prevent rejection sampling from failing. - sample_multiplier: - A multiplier which multiplies by `n` to determine the number of samples to - generate for try. By oversampling, we prevent having to repeated calls to - sampling. This prevents having to do more rounds of sampling when too many - samples are out of bounds, useful for when the `pivot` is near the bounds. 
- - Tuning this may be beneficial in unique circumstances, however we advise - leaving this as a default. - - Returns: - An array of `n` neighbors around `pivot`. - """ - # Generate batches of n * BUFFER_MULTIPLIER candidates, filling the above - # buffer until we have enough valid candidates. - # We should not overflow as the buffer - offset = 0 - SAMPLE_SIZE = n * sample_multiplier - BUFFER_SIZE = (n + 1) * sample_multiplier - - # We extend the range of stds to try to find neighbors - neighbors: Arr[V] = np.empty(BUFFER_SIZE, dtype=domain.dtype) - stds = np.linspace(std, 1.0, n_retries + 1, endpoint=True) - - lower = domain.lower - upper = domain.upper - range_size = upper - lower - sample_domain = Domain.float(lower, upper) - - for _std in stds: - candidates = seed.normal(pivot, _std * range_size, size=(SAMPLE_SIZE,)) - - bounded_candidates = candidates[(candidates >= lower) & (candidates <= upper)] - maybe_valid = domain.cast(bounded_candidates, frm=sample_domain) - - # High chance of overlap with original point if there's a finite amount of - # possible elements - if domain.cardinality is not None: - valid = maybe_valid[maybe_valid != pivot] - else: - valid = maybe_valid - - n_candidates = len(valid) - neighbors[offset : offset + n_candidates] = valid - offset += n_candidates - - if offset >= n: - return neighbors[:n] - - raise ValueError( - f"Failed to find enough neighbors with {n_retries} retries." - f" Given {n} neighbors, we only found {offset}." - f" The `Normals` for sampling neighbors were" - f" Normal(mu={pivot}, sigma={list(stds)})" - f" which were meant to find vectorized neighbors of the vector {pivot}," - " which was expected to be in the range" - f" ({lower}, {lower}).", - ) - - -def unique_neighborhood( - pivot: V, - domain: Domain[V], - *, - n: int, - seed: np.random.Generator, - std: float, - n_retries: int = UNIQUE_NEIGHBOR_GENERATOR_N_RETRIES, - sample_multiplier: int = UNIQUE_NEIGHBOR_GENERATOR_SAMPLE_MULTIPLIER, -) -> Arr[V]: - """Create a neighborhood of `n` neighbors around `pivot` with a normal distribution. - - The neighborhood is created by sampling from a normal distribution centered around - `pivot` with a standard deviation of `std`. The samples are then quantized to the - range `[lower, upper]` with `bins` bins. The number of samples is `n`. - - !!! tip - - [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood] - is quite expensive in certain situations as it has to repeatedly sample and check - for uniqueness. If you can afford duplicates, use this function instead. - - If [`domain.cardinality == None`][neps.search_spaces.domain.Domain.cardinality], - and you can afford an infentesimally small percentage change of duplicates, - you should use [`neighbors`][neps.search_spaces.neighborhoods.neighbors] instead. - - !!! warning - - If there are not enough unique neighbors to sample from, the function will - return less than `n` neighbors. - - !!! warning - - It is up to the caller to ensure that the pivot lies within the domain, - including at one of the bins if the domain is quantized. - - - Args: - pivot: The center of the neighborhood. - domain: The domain to get neighbors from. - n: The number of neighbors to generate. - std: The standard deviation of the normal distribution. - seed: The random seed to use. - n_retries: - The number of retries to attempt to generate unique neighbors. - Each retry increases the standard deviation of the normal distribution to prevent - rejection sampling from failing. 
- sample_multiplier: - A multiplier which multiplies by `n` to determine the number of samples to - generate for try. By oversampling, we prevent having to repeated calls to - both sampling and unique checking. - - However, oversampling makes a tradeoff when the `std` is not high enough to - generate `n` unique neighbors, effectively sampling more of the same duplicates. - - Tuning this may be beneficial in unique circumstances, however we advise leaving - this as a default. - - Returns: - An array of `n` neighbors around `pivot`, or less than `n` if not enough unique - neighbors could be generated. - """ # noqa: E501 - # Different than other neighborhoods as it's unnormalized and - # the quantization is directly integers. - assert n < 1000000, "Can only generate less than 1 million neighbors." - assert 0 < std < 1.0, "Standard deviation must be in the range (0, 1)." - lower = domain.lower - upper = domain.upper - - # In the easiest case, we have a domain with finite elements and we need - # more neighbors than are possible. We then generate all of them. - # We can do this simply with a range and removing the pivot. - if domain.cardinality is not None and n >= domain.cardinality - 1: - range_domain = Domain.indices(domain.cardinality) - int_pivot = range_domain.cast(pivot, frm=domain) - - if int_pivot == 0: - _range = np.arange(1, domain.cardinality, dtype=i64) - return domain.cast(_range, frm=range_domain) - - if int_pivot == domain.cardinality - 1: - _range = np.arange(0, domain.cardinality - 1, dtype=i64) - return domain.cast(_range, frm=range_domain) - - left = np.arange(0, int_pivot, dtype=i64) - right = np.arange(int_pivot + 1, domain.cardinality, dtype=i64) - _range = np.concatenate((left, right)) - - return domain.cast(_range, frm=range_domain) - - # Otherwise, we use a repeated sampling strategy where we slowly increase the - # std of a normal, centered on `center`, slowly expanding `std` such that - # rejection won't fail. - - # We set up a buffer that can hold the number of neighbors we need, plus some - # extra excess from sampling, preventing us from having to reallocate memory. - # We also include the initial value in the buffer, as we will remove it later. - SAMPLE_SIZE = n * sample_multiplier - BUFFER_SIZE = n * (sample_multiplier + 1) - neighbors = np.empty(BUFFER_SIZE + 1, dtype=domain.dtype) - neighbors[0] = pivot - offset = 1 # Indexes into current progress of filling buffer - stds = np.linspace(std, 1.0, n_retries + 1, endpoint=True) - sample_domain = Domain.float(lower, upper) - - range_size = upper - lower - for _std in stds: - # Generate candidates in vectorized space - candidates = seed.normal(pivot, _std * range_size, size=SAMPLE_SIZE) - valid = (candidates >= lower) & (candidates <= upper) - - candidates = domain.cast(x=candidates[valid], frm=sample_domain) - - # Find new unique neighbors - uniq = np.unique(candidates) - new_uniq = np.setdiff1d(uniq, neighbors[:offset], assume_unique=True) - - n_new_unique = len(new_uniq) - neighbors[offset : offset + n_new_unique] = new_uniq - offset += n_new_unique - - # We have enough neighbors, we can stop - if offset - 1 >= n: - # Ensure we don't include the initial value point - return neighbors[1 : n + 1] - - raise ValueError( - f"Failed to find enough neighbors with {n_retries} retries." - f" Given {n=} neighbors to generate, we only found {offset - 1}." - f" The normal's for sampling neighbors were Normal({pivot}, {list(stds)})" - f" which were meant to find neighbors of {pivot}. 
in the range" - f" ({lower}, {upper}).", - ) diff --git a/neps/state/optimizer.py b/neps/state/optimizer.py index 07155015..92accddb 100644 --- a/neps/state/optimizer.py +++ b/neps/state/optimizer.py @@ -24,6 +24,7 @@ def remaining_cost_budget(self) -> float | None: return self.max_cost_budget - self.used_cost_budget def clone(self) -> BudgetInfo: + """Create a copy of the budget info.""" return BudgetInfo( max_cost_budget=self.max_cost_budget, used_cost_budget=self.used_cost_budget, diff --git a/neps/utils/common.py b/neps/utils/common.py index d0fb2137..643ecc6d 100644 --- a/neps/utils/common.py +++ b/neps/utils/common.py @@ -3,16 +3,11 @@ from __future__ import annotations import inspect -import random from collections.abc import Iterable, Mapping, Sequence from functools import partial from pathlib import Path from typing import Any -from functools import partial -from pathlib import Path -from typing import Any, Iterable, Mapping, Sequence -import numpy as np import torch import yaml diff --git a/neps/utils/run_args.py b/neps/utils/run_args.py index bd2664e1..9f5cc60d 100644 --- a/neps/utils/run_args.py +++ b/neps/utils/run_args.py @@ -41,7 +41,7 @@ MAX_EVALUATIONS_PER_RUN = "max_evaluations_per_run" -def get_run_args_from_yaml(path: str) -> dict: +def get_run_args_from_yaml(path: str | Path) -> dict: """Load and validate NEPS run arguments from a specified YAML configuration file provided via run_args. @@ -116,7 +116,7 @@ def get_run_args_from_yaml(path: str) -> dict: return settings -def config_loader(path: str) -> dict: +def config_loader(path: str | Path) -> dict: """Loads a YAML file and returns the contents under the 'run_args' key. Args: @@ -506,7 +506,7 @@ class Settings: arguments (run_args (yaml) and neps func_args). """ - def __init__(self, func_args: dict, yaml_args: str | Default | None = None): + def __init__(self, func_args: dict, yaml_args: Path | str | Default | None = None): """Initializes the Settings object by merging function arguments with YAML configuration settings and assigning them to class attributes. It checks for necessary configurations and handles default values where specified. 
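Since `get_run_args_from_yaml`, `config_loader`, and `Settings` above now accept `pathlib.Path` as well as `str`, run arguments can be loaded from a `Path` directly. A minimal sketch, assuming a hypothetical YAML location whose settings sit under a top-level `run_args` key as `config_loader` expects:

from pathlib import Path

from neps.utils.run_args import get_run_args_from_yaml

# "configs/run_args.yaml" is a made-up path used only for illustration.
settings_dict = get_run_args_from_yaml(Path("configs") / "run_args.yaml")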
diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index 3f346949..164b49cb 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -2,51 +2,29 @@ import time import numpy as np -import math -import random import neps -PRINT = False - -def run_pipeline(float1, float2, float3, integer1, integer2): - if PRINT: - print("float1:", float1) - print("float2:", float2) - print("float3:", float3) - # print("categorical:", categorical) - print("integer1:", integer1) - print("integer2:", integer2) - loss = -float( - integer2 - * np.sum( - [ - (float1 * float2 / (float3 + 1)), # * (int(categorical) + 1), - integer1, - ] - ) - ) # Random noise +def run_pipeline(float1, float2, categorical, integer1, integer2): + loss = -float(np.sum([float1, float2, int(categorical), integer1, integer2])) # time.sleep(0.7) # For demonstration purposes - return {"loss": loss, "cost": float(integer2)} + return loss pipeline_space = dict( - float1=neps.FloatParameter(lower=0, upper=1, default=0.95), - float2=neps.FloatParameter(lower=0, upper=20, default=19.5), - float3=neps.FloatParameter(lower=0, upper=5, default=0.5), - # categorical=neps.CategoricalParameter(choices=[0, 1]), - integer1=neps.IntegerParameter(lower=0, upper=1, default=1), - integer2=neps.IntegerParameter(lower=1, upper=1000, log=True, default=950), + float1=neps.FloatParameter(lower=0, upper=1), + float2=neps.FloatParameter(lower=-10, upper=10), + categorical=neps.CategoricalParameter(choices=[0, 1]), + integer1=neps.IntegerParameter(lower=0, upper=1), + integer2=neps.IntegerParameter(lower=1, upper=1000, log=True), ) logging.basicConfig(level=logging.INFO) neps.run( run_pipeline=run_pipeline, - searcher="bayesian_optimization", pipeline_space=pipeline_space, root_directory="results/hyperparameters_example", post_run_summary=True, - max_evaluations_total=50, - use_priors=True, + max_evaluations_total=15, ) diff --git a/tests/test_neps_api/solution_yamls/bo_neps_decided.yaml b/tests/test_neps_api/solution_yamls/bo_neps_decided.yaml index 76935d6c..98be780b 100644 --- a/tests/test_neps_api/solution_yamls/bo_neps_decided.yaml +++ b/tests/test_neps_api/solution_yamls/bo_neps_decided.yaml @@ -3,11 +3,8 @@ searcher_alg: bayesian_optimization searcher_selection: neps-default neps_decision_tree: true searcher_args: - initial_design_size: 10 - surrogate_model: gp - acquisition: EI - log_prior_weighted: false - acquisition_sampler: mutation - random_interleave_prob: 0.0 - disable_priors: true + initial_design_size: null + use_priors: false + use_cost: false sample_default_first: false + device: null diff --git a/tests/test_neps_api/solution_yamls/pibo_neps_decided.yaml b/tests/test_neps_api/solution_yamls/pibo_neps_decided.yaml index 7d5f19da..3b7c36b2 100644 --- a/tests/test_neps_api/solution_yamls/pibo_neps_decided.yaml +++ b/tests/test_neps_api/solution_yamls/pibo_neps_decided.yaml @@ -3,12 +3,8 @@ searcher_alg: pibo searcher_selection: neps-default neps_decision_tree: true searcher_args: - initial_design_size: 10 - surrogate_model: gp - acquisition: EI - log_prior_weighted: false - acquisition_sampler: mutation - random_interleave_prob: 0.0 - disable_priors: false - prior_confidence: medium - sample_default_first: false + initial_design_size: null + use_priors: true + use_cost: false + sample_default_first: true + device: null diff --git a/tests/test_neps_api/solution_yamls/user_yaml_bo.yaml b/tests/test_neps_api/solution_yamls/user_yaml_bo.yaml index 
156d67e4..1a20bc12 100644 --- a/tests/test_neps_api/solution_yamls/user_yaml_bo.yaml +++ b/tests/test_neps_api/solution_yamls/user_yaml_bo.yaml @@ -4,11 +4,5 @@ searcher_selection: user-yaml neps_decision_tree: false searcher_args: initial_design_size: 5 - surrogate_model: gp - acquisition: EI - log_prior_weighted: false - acquisition_sampler: random - random_interleave_prob: 0.1 - disable_priors: false - prior_confidence: high - sample_default_first: false + use_priors: true + sample_default_first: true diff --git a/tests/test_neps_api/test_api.py b/tests/test_neps_api/test_api.py index b4a54940..cebbdcc5 100644 --- a/tests/test_neps_api/test_api.py +++ b/tests/test_neps_api/test_api.py @@ -26,105 +26,32 @@ def no_logs_gte_error(caplog): assert not errors -testing_scripts = [ - "default_neps", - "baseoptimizer_neps", - "user_yaml_neps", -] +HERE = Path(__file__).resolve().parent -examples_folder = Path(__file__, "..", "testing_scripts").resolve() -solution_folder = Path(__file__, "..", "solution_yamls").resolve() +testing_scripts = ["default_neps", "baseoptimizer_neps", "user_yaml_neps"] +EXAMPLES_FOLDER = HERE / "testing_scripts" +SOLUTION_FOLDER = HERE / "solution_yamls" neps_api_example_script = [ - examples_folder / f"{example}.py" for example in testing_scripts + EXAMPLES_FOLDER / f"{example}.py" for example in testing_scripts ] @pytest.mark.neps_api -def test_default_examples(tmp_path): +@pytest.mark.parametrize("example_script", neps_api_example_script) +def test_default_examples(tmp_path: Path, example_script: Path) -> None: # Running the example files holding multiple neps.run commands. - - runpy.run_path( - neps_api_example_script[0], - run_name="__main__", - ) - - # Testing each folder with its corresponding expected dictionary - for folder_name in os.listdir(tmp_path): - folder_path = os.path.join(tmp_path, folder_name) - - assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - - info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") - - assert os.path.exists( - str(info_yaml_path) - ), f"File does not exist: {info_yaml_path}\n{os.listdir(folder_path)}" - - # Load the YAML file - with open(str(info_yaml_path)) as yaml_config: - loaded_data = yaml.safe_load(yaml_config) - - with open(str(solution_folder / (folder_name + ".yaml"))) as solution_yaml: - expected_data = yaml.safe_load(solution_yaml) - - assert loaded_data == expected_data - - -@pytest.mark.neps_api -def test_baseoptimizer_examples(tmp_path): - # Running the example files holding multiple neps.run commands. 
- - runpy.run_path( - neps_api_example_script[1], - run_name="__main__", - ) + runpy.run_path(str(example_script), run_name="__main__") # Testing each folder with its corresponding expected dictionary - for folder_name in os.listdir(tmp_path): - folder_path = os.path.join(tmp_path, folder_name) - - assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - - info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") - - assert os.path.exists( - str(info_yaml_path) - ), f"File does not exist: {info_yaml_path}" - - # Load the YAML file - with open(str(info_yaml_path)) as yaml_config: - loaded_data = yaml.safe_load(yaml_config) - - with open(str(solution_folder / (folder_name + ".yaml"))) as solution_yaml: - expected_data = yaml.safe_load(solution_yaml) - - assert loaded_data == expected_data - - -@pytest.mark.neps_api -def test_user_created_yaml_examples(tmp_path): - runpy.run_path( - neps_api_example_script[2], - run_name="__main__", - ) - - # Testing each folder with its corresponding expected dictionary - for folder_name in os.listdir(tmp_path): - folder_path = os.path.join(tmp_path, folder_name) - - assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - - info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") - - assert os.path.exists( - str(info_yaml_path) - ), f"File does not exist: {info_yaml_path}" + for folder in tmp_path.iterdir(): + info_yaml_path = folder / ".optimizer_info" / "info.yaml" - # Load the YAML file - with open(str(info_yaml_path)) as yaml_config: - loaded_data = yaml.safe_load(yaml_config) + assert info_yaml_path.exists() + loaded_data = yaml.safe_load(info_yaml_path.read_text()) - with open(str(solution_folder / (folder_name + ".yaml"))) as solution_yaml: - expected_data = yaml.safe_load(solution_yaml) + solution_yaml_path = SOLUTION_FOLDER / (folder.name + ".yaml") + solution_data = yaml.safe_load(solution_yaml_path.read_text()) - assert loaded_data == expected_data + assert ( + loaded_data == solution_data + ), f"Solution Path: {solution_yaml_path}\nLoaded Path: {info_yaml_path}\n" diff --git a/tests/test_neps_api/testing_yaml/optimizer_test.yaml b/tests/test_neps_api/testing_yaml/optimizer_test.yaml index f65af743..a4deff20 100644 --- a/tests/test_neps_api/testing_yaml/optimizer_test.yaml +++ b/tests/test_neps_api/testing_yaml/optimizer_test.yaml @@ -1,11 +1,5 @@ strategy: bayesian_optimization # Specific arguments depending on the searcher initial_design_size: 7 -surrogate_model: gp -acquisition: EI -log_prior_weighted: false -acquisition_sampler: random -random_interleave_prob: 0.1 -disable_priors: false -prior_confidence: high -sample_default_first: false +use_priors: true +sample_default_first: true diff --git a/tests/test_settings/run_args_optimizer_outside.yaml b/tests/test_settings/run_args_optimizer_outside.yaml index 1dbfce01..4380904e 100644 --- a/tests/test_settings/run_args_optimizer_outside.yaml +++ b/tests/test_settings/run_args_optimizer_outside.yaml @@ -12,9 +12,5 @@ searcher: name: my_bayesian # Specific arguments depending on the searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI - acquisition_sampler: random - random_interleave_prob: 0.1 overwrite_working_directory: True diff --git a/tests/test_settings/test_settings.py b/tests/test_settings/test_settings.py index fe649563..1244bcf6 100644 --- a/tests/test_settings/test_settings.py +++ b/tests/test_settings/test_settings.py @@ -1,7 +1,8 @@ +from __future__ import annotations + from 
neps.utils.run_args import Settings, Default import pytest -import neps -from neps.utils.run_args import get_run_args_from_yaml +from pathlib import Path from tests.test_yaml_run_args.test_yaml_run_args import ( run_pipeline, hook1, @@ -9,9 +10,8 @@ pipeline_space, ) from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization -from typing import Union, Callable, Dict, List, Type -BASE_PATH = "tests/test_settings" +BASE_PATH = Path("tests") / "test_settings" run_pipeline = run_pipeline hook1 = hook1 hook2 = hook2 @@ -86,7 +86,7 @@ "searcher": Default("default"), "searcher_kwargs": {}, }, - "/run_args_required.yaml", + "run_args_required.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -128,7 +128,7 @@ "searcher": Default("default"), "searcher_kwargs": {}, }, - "/run_args_optional.yaml", + "run_args_optional.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -170,7 +170,7 @@ "searcher": "default", "searcher_kwargs": {}, }, - "/overwrite_run_args.yaml", + "overwrite_run_args.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -218,7 +218,7 @@ "sample_default_at_target": False, }, }, - "/run_args_optimizer_settings.yaml", + "run_args_optimizer_settings.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -273,11 +273,10 @@ "pre_load_hooks": Default(None), "searcher": Default("default"), "searcher_kwargs": { - "random_interleave_prob": 0.2, "initial_design_size": 9, }, }, - "/run_args_optimizer_outside.yaml", + "run_args_optimizer_outside.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -295,24 +294,21 @@ "cost_value_on_error": None, "pre_load_hooks": None, "searcher": my_bayesian, - "searcher_kwargs": { - "acquisition": "EI", - "acquisition_sampler": "random", - "random_interleave_prob": 0.2, - "initial_design_size": 9, - }, + "searcher_kwargs": {"initial_design_size": 9}, }, ), ], ) -def test_check_settings(func_args: Dict, yaml_args: str, expected_output: Dict) -> None: +def test_check_settings(func_args: dict, yaml_args: str, expected_output: dict) -> None: """ Check if expected settings are set """ - if not isinstance(yaml_args, Default): - yaml_args = BASE_PATH + yaml_args - settings = Settings(func_args, yaml_args) - print(settings) + if isinstance(yaml_args, str): + args = BASE_PATH / yaml_args + else: + args = yaml_args + + settings = Settings(func_args, args) for key, value in expected_output.items(): assert getattr(settings, key) == value @@ -347,7 +343,7 @@ def test_check_settings(func_args: Dict, yaml_args: str, expected_output: Dict) ], ) def test_settings_initialization_error( - func_args: Dict, yaml_args: Union[str, Default], error: Exception + func_args: dict, yaml_args: str | Default, error: Exception ) -> None: """ Test if Settings raises Error when essential arguments are missing diff --git a/tests/test_state/test_neps_state.py b/tests/test_state/test_neps_state.py index ab3a6b6a..641b54ba 100644 --- a/tests/test_state/test_neps_state.py +++ b/tests/test_state/test_neps_state.py @@ -79,6 +79,10 @@ def case_search_space_fid_with_prior() -> SearchSpace: # OPTIMIZER_FAILS_WITH_FIDELITY = [ "random_search", + "bayesian_optimization", + "pibo", + "cost_cooling_bayesian_optimization", + "cost_cooling", ] # There's no programattic way to check if a class requires a fidelity. 
diff --git a/tests/test_yaml_run_args/run_args_optional_loading_format.yaml b/tests/test_yaml_run_args/run_args_optional_loading_format.yaml index 26bdad83..aa96558f 100644 --- a/tests/test_yaml_run_args/run_args_optional_loading_format.yaml +++ b/tests/test_yaml_run_args/run_args_optional_loading_format.yaml @@ -20,6 +20,5 @@ searcher: # Optional Loading path: "neps/optimizers/bayesian_optimization/optimizer.py" name: BayesianOptimization initial_design_size: 5 - surrogate_model: gp pre_load_hooks: hook1: "tests/test_yaml_run_args/test_yaml_run_args.py" diff --git a/tests/test_yaml_run_args/test_declarative_usage_docs/customizing_neps_optimizer.yaml b/tests/test_yaml_run_args/test_declarative_usage_docs/customizing_neps_optimizer.yaml index 5ddaf23e..da0e7460 100644 --- a/tests/test_yaml_run_args/test_declarative_usage_docs/customizing_neps_optimizer.yaml +++ b/tests/test_yaml_run_args/test_declarative_usage_docs/customizing_neps_optimizer.yaml @@ -7,10 +7,7 @@ pipeline_space: lower: 1e-5 upper: 1e-1 log: True # Log scale for learning rate - epochs: - lower: 5 - upper: 20 - is_fidelity: True + epochs: 20 optimizer: choices: [adam, sgd, adamw] batch_size: 64 @@ -22,9 +19,5 @@ searcher: name: "my_bayesian" # Specific arguments depending on the searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI - acquisition_sampler: random - random_interleave_prob: 0.1 overwrite_working_directory: True diff --git a/tests/test_yaml_run_args/test_declarative_usage_docs/loading_own_optimizer.yaml b/tests/test_yaml_run_args/test_declarative_usage_docs/loading_own_optimizer.yaml index fce52034..9c0a4864 100644 --- a/tests/test_yaml_run_args/test_declarative_usage_docs/loading_own_optimizer.yaml +++ b/tests/test_yaml_run_args/test_declarative_usage_docs/loading_own_optimizer.yaml @@ -18,7 +18,5 @@ searcher: name: BayesianOptimization # Specific arguments depending on your searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI overwrite_working_directory: True diff --git a/tests/test_yaml_run_args/test_declarative_usage_docs/set_up_optimizer.yaml b/tests/test_yaml_run_args/test_declarative_usage_docs/set_up_optimizer.yaml index f65af743..90b52671 100644 --- a/tests/test_yaml_run_args/test_declarative_usage_docs/set_up_optimizer.yaml +++ b/tests/test_yaml_run_args/test_declarative_usage_docs/set_up_optimizer.yaml @@ -1,11 +1,5 @@ strategy: bayesian_optimization # Specific arguments depending on the searcher initial_design_size: 7 -surrogate_model: gp -acquisition: EI -log_prior_weighted: false -acquisition_sampler: random -random_interleave_prob: 0.1 -disable_priors: false -prior_confidence: high +use_priors: true sample_default_first: false diff --git a/tests/test_yaml_run_args/test_run_args_by_neps_run/optimizer_yamls/select_bo_run_args.yaml b/tests/test_yaml_run_args/test_run_args_by_neps_run/optimizer_yamls/select_bo_run_args.yaml index af5259d0..9871ca63 100644 --- a/tests/test_yaml_run_args/test_run_args_by_neps_run/optimizer_yamls/select_bo_run_args.yaml +++ b/tests/test_yaml_run_args/test_run_args_by_neps_run/optimizer_yamls/select_bo_run_args.yaml @@ -3,11 +3,8 @@ searcher_alg: bayesian_optimization searcher_selection: user-run_args-yaml neps_decision_tree: false searcher_args: - initial_design_size: 10 - surrogate_model: gp - acquisition: EI - log_prior_weighted: false - acquisition_sampler: mutation - random_interleave_prob: 0.0 - disable_priors: true + initial_design_size: null + use_priors: false + use_cost: false sample_default_first: false + device: null From 
39ffe48a6db0ed53227356ea422721b5ed2d6843 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 4 Oct 2024 12:32:48 +0200 Subject: [PATCH 57/63] fix(optimize_acq): Generation of fixed categorical values --- neps/optimizers/bayesian_optimization/models/gp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 8b22a513..0f5fd7b7 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -189,7 +189,7 @@ def optimize_acq( cats: dict[int, list[float]] = { encoder.index_of[name]: [ float(i) - for i in range(len(transformer.domain.cardinality)) # type: ignore + for i in range(transformer.domain.cardinality) # type: ignore ] for name, transformer in cat_transformers.items() } From c3aeb30f17bcde35a7ea6e40e0685aa9d4ab2f99 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 4 Oct 2024 13:41:55 +0200 Subject: [PATCH 58/63] test: Fixup examples --- neps_examples/__init__.py | 23 +++++++++++++++++++---- tests/test_examples.py | 1 + 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/neps_examples/__init__.py b/neps_examples/__init__.py index df7d3589..a48933d7 100644 --- a/neps_examples/__init__.py +++ b/neps_examples/__init__.py @@ -1,7 +1,23 @@ all_main_examples = { # Used for printing in python -m neps_examples - "basic_usage": ["analyse", "architecture", "architecture_and_hyperparameters", "hpo_usage_example", "hyperparameters"], - "convenience": ["logging_additional_info", "neps_tblogger_tutorial", "running_on_slurm_scripts", "neps_x_lightning", "working_directory_per_pipeline"], - "efficiency": ["expert_priors_for_hyperparameters", "multi_fidelity", "multi_fidelity_and_expert_priors"], + "basic_usage": [ + "analyse", + "architecture", + "architecture_and_hyperparameters", + "hpo_usage_example", + "hyperparameters", + ], + "convenience": [ + "logging_additional_info", + "neps_tblogger_tutorial", + "running_on_slurm_scripts", + "neps_x_lightning", + "working_directory_per_pipeline", + ], + "efficiency": [ + "expert_priors_for_hyperparameters", + "multi_fidelity", + "multi_fidelity_and_expert_priors", + ], } core_examples = [ # Run locally and on github actions @@ -15,7 +31,6 @@ "basic_usage/architecture_and_hyperparameters", "experimental/hierarchical_architecture", "efficiency/expert_priors_for_hyperparameters", - "experimental/hierarchical_architecture_hierarchical_GP", "convenience/logging_additional_info", "convenience/working_directory_per_pipeline", "convenience/neps_tblogger_tutorial", diff --git a/tests/test_examples.py b/tests/test_examples.py index 5575eb4d..6942e8d7 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -40,6 +40,7 @@ def test_core_examples(example): if example.name in ( "architecture.py", + "architecture_and_hyperparameters.py", "hierarchical_architecture.py", "expert_priors_for_architecture_and_hyperparameters.py", ): From d1596518f3d52fbca85dcf95b812fa213949e67e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 16:45:50 +0200 Subject: [PATCH 59/63] test(domain): Initial tests --- neps/search_spaces/domain.py | 21 ++-- tests/test_domain.py | 205 +++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+), 12 deletions(-) create mode 100644 tests/test_domain.py diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 6c6c1b75..6f6e2693 100644 --- a/neps/search_spaces/domain.py +++ 
b/neps/search_spaces/domain.py @@ -221,15 +221,6 @@ def indices(cls, n: int, *, is_categorical: bool = False) -> Domain[int]: """ return Domain.integer(0, n - 1, is_categorical=is_categorical) - def next_value(self, x: Tensor) -> Tensor: - """Get the next value for a tensor of values.""" - if self.cardinality is None: - raise ValueError("Domain is non-finite, cannot get next value.") - cardinality_domain = Domain.indices(self.cardinality) - current_step = cardinality_domain.cast(x, frm=self) - bounded_next_step = (current_step + 1).clamp_max(self.cardinality - 1) - return self.cast(bounded_next_step, frm=cardinality_domain) - def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """Transform a tensor of values from this domain to the unit interval [0, 1]. @@ -242,10 +233,11 @@ def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """ if dtype is None: dtype = torch.float64 - else: - assert dtype.is_floating_point, "Unit interval is only for floats." + elif not dtype.is_floating_point: + raise ValueError(f"Unit interval only allows floating dtypes, got {dtype}.") - if self.is_unit_float: + bins = self.bins + if self.is_unit_float and self.bins is not None: return x.to(dtype) if self.log_bounds is not None: @@ -255,6 +247,11 @@ def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: lower, upper = self.lower, self.upper x = (x - lower) / (upper - lower) + + if bins is not None: + quantization_levels = torch.floor(x * bins).clip(0, bins - 1) + x = quantization_levels / (bins - 1) + return x.type(dtype) def from_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: diff --git a/tests/test_domain.py b/tests/test_domain.py new file mode 100644 index 00000000..0893337f --- /dev/null +++ b/tests/test_domain.py @@ -0,0 +1,205 @@ +from pytest_cases import parametrize + +import torch +import pytest +from neps.search_spaces.domain import Domain + +T = torch.tensor + + +@parametrize( + "x, frm, expected", + [ + # Remains unchanged if from unit-float + (T([0, 0.5, 1.0]), Domain.unit_float(), T([0, 0.5, 1.0])), + # Converts integers to float + (T([0, 1]), Domain.unit_float(), T([0.0, 1.0])), + # Integer conversion + (T([0, 1, 2, 3, 4]), Domain.integer(0, 4), T([0.0, 0.25, 0.5, 0.75, 1.0])), + # Negatives + ( + T([-0.5, -0.25, 0.0, 0.25, 0.5]), + Domain.floating(-0.5, 0.5), + T([0.0, 0.25, 0.5, 0.75, 1.0]), + ), + # Log scale + ( + T([1e-4, 1e-3, 1e-2, 1e-1, 1]), + Domain.floating(1e-4, 1, log=True), + T([0.0, 0.25, 0.5, 0.75, 1.0]), + ), + # Binned + ( + torch.arange(10), + Domain.integer(0, 10, bins=5), + T([0.0, 0.0, 0.25, 0.25, 0.5, 0.5, 0.75, 0.75, 1.0, 1.0]), + ), + ], +) +def test_domain_to_unit(x: torch.Tensor, frm: Domain, expected: torch.Tensor) -> None: + y = frm.to_unit(x) + assert y.dtype == torch.float64 + torch.testing.assert_close(y, expected, check_dtype=False, msg=f"{y} != {expected}") + + +def test_domain_to_unit_dtype_with_floating() -> None: + domain = Domain.integer(0, 4) + x = T([0, 1, 2, 3, 4], dtype=torch.int32) + + expected_64 = T([0.0, 0.25, 0.5, 0.75, 1.0], dtype=torch.float64) + y_64 = domain.to_unit(x, dtype=torch.float64) + torch.testing.assert_close(y_64, expected_64, check_dtype=True) + + expected_32 = T([0.0, 0.25, 0.5, 0.75, 1.0], dtype=torch.float32) + y_32 = domain.to_unit(x, dtype=torch.float32) + torch.testing.assert_close(y_32, expected_32, check_dtype=True) + + +def test_domain_to_unit_dtype_with_integer_fails() -> None: + domain = Domain.integer(0, 4) + x = T([0, 1, 2, 3, 4], 
dtype=torch.int32) + + with pytest.raises(ValueError, match="only allows floating dtypes"): + domain.to_unit(x, dtype=torch.int32) + + +@parametrize( + "x, to, expected", + [ + # Remains unchanged if from unit-float + ( + T([0, 0.5, 1.0]), + Domain.unit_float(), + T([0, 0.5, 1.0], dtype=torch.float64), + ), + # Converts floats to integers + ( + T([0.0, 1.0]), + Domain.integer(0, 1), + T([0, 1], dtype=torch.int64), + ), + # Integer range + ( + T([0, 0.25, 0.5, 0.75, 1.0]), + Domain.integer(0, 4), + T([0, 1, 2, 3, 4], dtype=torch.int64), + ), + # Negatives + ( + T([0.0, 0.25, 0.5, 0.75, 1.0]), + Domain.floating(-0.5, 0.5), + T([-0.5, -0.25, 0.0, 0.25, 0.5], dtype=torch.float64), + ), + # Log scale + ( + T([0.0, 0.25, 0.5, 0.75, 1.0]), + Domain.floating(1e-4, 1, log=True), + T([1e-4, 1e-3, 1e-2, 1e-1, 1], dtype=torch.float64), + ), + # Binned + ( + T([0.0, 0.25, 0.5, 0.75, 1.0]), + Domain.integer(0, 20, bins=5), + T([0, 5, 10, 15, 20], dtype=torch.int64), + ), + ], +) +def test_domain_from_unit(x: torch.Tensor, to: Domain, expected: torch.Tensor) -> None: + x = x.to(dtype=torch.float64) + y = to.from_unit(x) + torch.testing.assert_close(y, expected, check_dtype=True, msg=f"{y} != {expected}") + + +def test_domain_from_unit_dtype() -> None: + x = T([0.0, 0.25, 0.5, 0.75, 1.0], dtype=torch.float64) + domain = Domain.integer(0, 4) + + expected_f64 = T([0.0, 1.0, 2.0, 3.0, 4.0], dtype=torch.float64) + y_f64 = domain.from_unit(x, dtype=torch.float64) + torch.testing.assert_close(y_f64, expected_f64, check_dtype=True) + + expected_f32 = T([0, 1, 2, 3, 4], dtype=torch.float32) + y_f32 = domain.from_unit(x, dtype=torch.float32) + torch.testing.assert_close(y_f32, expected_f32, check_dtype=True) + + expected_i32 = T([0, 1, 2, 3, 4], dtype=torch.int32) + y_i32 = domain.from_unit(x, dtype=torch.int32) + torch.testing.assert_close(y_i32, expected_i32, check_dtype=True) + + expected_i64 = T([0, 1, 2, 3, 4], dtype=torch.int64) + y_i64 = domain.from_unit(x, dtype=torch.int64) + torch.testing.assert_close(y_i64, expected_i64, check_dtype=True) + + +@parametrize( + "x, frm, to, expected", + [ + ( + T([1e-2, 1e-1, 1e0, 1e1, 1e2], dtype=torch.float64), + Domain.floating(1e-2, 1e2, log=True), + Domain.floating(-2, 2), + T([-2, -1, 0, 1, 2], dtype=torch.float64), + ), + ( + T([0, 2, 4, 6, 8], dtype=torch.int64), + Domain.integer(0, 8, bins=5), + Domain.integer(0, 4), + T([0, 1, 2, 3, 4], dtype=torch.int64), + ), + ( + T([10, 12.5, 15], dtype=torch.float64), + Domain.floating(10, 15), + Domain.floating(2, 3), + T([2, 2.5, 3.0], dtype=torch.float64), + ), + ], +) +def test_domain_casting( + x: torch.Tensor, frm: Domain, to: Domain, expected: torch.Tensor +) -> None: + y = to.cast(x, frm=frm) + torch.testing.assert_close(y, expected, check_dtype=True, msg=f"{y} != {expected}") + + x_back = frm.cast(y, frm=to) + torch.testing.assert_close(x_back, x, check_dtype=True, msg=f"{x_back} != {x}") + + +@parametrize( + "x, frm, to, expected", + [ + ( + # This test combines all the previous cast domains in one go as a single tensor + T( + [ + [1e-2, 1e-1, 1e0, 1e1, 1e2], + [0, 2, 4, 6, 8], + [10, 12.5, 15, 17.5, 20], + ] + ).transpose(0, 1), + [ + Domain.floating(1e-2, 1e2, log=True), + Domain.integer(0, 8, bins=5), + Domain.floating(10, 20), + ], # from + [Domain.floating(-2, 2), Domain.integer(0, 4), Domain.floating(2, 4)], # to + T( + [ + [-2, -1, 0, 1, 2], + [0, 1, 2, 3, 4], + [2, 2.5, 3, 3.5, 4], + ] + ).transpose(0, 1), + ), + ], +) +def test_translate( + x: torch.Tensor, + frm: list[Domain], + to: list[Domain], + 
expected: torch.Tensor, +) -> None: + y = Domain.translate(x, frm=frm, to=to) + torch.testing.assert_close(y, expected, check_dtype=True, msg=f"{y} != {expected}") + + x_back = Domain.translate(y, frm=to, to=frm) + torch.testing.assert_close(x_back, x, check_dtype=True, msg=f"{x_back} != {x}") From 65d99189818de04a5a140bbf36126b8c428afe00 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 17:59:33 +0200 Subject: [PATCH 60/63] test(ConfigEncoder): Initial tests --- neps/search_spaces/encoding.py | 26 +++++- tests/test_config_encoder.py | 143 +++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 tests/test_config_encoder.py diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index c20f60b3..78958b90 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -240,9 +240,10 @@ class ConfigEncoder: """ transformers: dict[str, TensorTransformer] + constants: Mapping[str, Any] = field(default_factory=dict) + index_of: dict[str, int] = field(init=False) domain_of: dict[str, Domain] = field(init=False) - constants: Mapping[str, Any] = field(default_factory=dict) n_numerical: int = field(init=False) n_categorical: int = field(init=False) @@ -369,7 +370,23 @@ def default( Returns: A `ConfigEncoder` instance """ - constants = constants or {} + if constants is not None: + overlap = set(parameters) & set(constants) + if any(overlap): + raise ValueError( + "`constants=` and `parameters=` cannot have overlapping" + f" keys: {overlap=}" + ) + if custom_transformers is not None: + overlap = set(custom_transformers) & set(constants) + if any(overlap): + raise ValueError( + f"Can not apply `custom_transformers=`" + f" to `constants=`: {overlap=}" + ) + else: + constants = {} + custom = custom_transformers or {} sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} @@ -384,6 +401,9 @@ def default( case CategoricalParameter(): transformers[name] = CategoricalToIntegerTransformer(hp.choices) case _: - raise ValueError(f"Unsupported parameter type: {type(hp)}") + raise ValueError( + f"Unsupported parameter type: {type(hp)}. If hp is a constant, " + " please provide it as `constants=`." + ) return ConfigEncoder(transformers, constants=constants) diff --git a/tests/test_config_encoder.py b/tests/test_config_encoder.py new file mode 100644 index 00000000..6ed7d344 --- /dev/null +++ b/tests/test_config_encoder.py @@ -0,0 +1,143 @@ +import torch +import pytest +from neps.search_spaces.domain import Domain +from neps.search_spaces.encoding import ( + CategoricalToIntegerTransformer, + ConfigEncoder, + MinMaxNormalizer, +) +from neps.search_spaces.hyperparameters import ( + CategoricalParameter, + FloatParameter, + IntegerParameter, +) + + +def test_config_encoder_default() -> None: + parameters = { + "b": IntegerParameter(5, 6), + "a": FloatParameter(5, 6), + "c": CategoricalParameter(["cat", "mouse", "dog"]), + } + + encoder = ConfigEncoder.default(parameters) + + # Min-max numericals, integer categoricals. 
+ assert encoder.transformers == { + "a": MinMaxNormalizer(parameters["a"].domain), + "b": MinMaxNormalizer(parameters["b"].domain), + "c": CategoricalToIntegerTransformer(parameters["c"].choices), + } + + # Domains, (of each column) match those of the transformers + assert encoder.domains == [ + Domain.unit_float(), + Domain.unit_float(), + Domain.indices(n=len(parameters["c"].choices), is_categorical=True), + ] + + assert encoder.ncols == len(parameters) + assert encoder.n_numerical == 2 + assert encoder.n_categorical == 1 + assert encoder.index_of == {"a": 0, "b": 1, "c": 2} + assert encoder.domain_of == { + "a": Domain.unit_float(), + "b": Domain.unit_float(), + "c": Domain.indices(n=len(parameters["c"].choices), is_categorical=True), + } + assert encoder.constants == {} + + configs = [ + {"a": 5.5, "b": 5, "c": "cat"}, + {"a": 5.5, "b": 5, "c": "dog"}, + {"a": 6, "b": 6, "c": "mouse"}, + ] + encoded = encoder.encode(configs) + expcected_encoding = torch.tensor( + [ + # a, b, c + [0.5, 0.0, 0.0], # config 1 + [0.5, 0.0, 2.0], # config 2 + [1.0, 1.0, 1.0], # config 3 + ], + dtype=torch.float64, + ) + torch.testing.assert_close(encoded, expcected_encoding, check_dtype=True) + + decoded = encoder.decode(encoded) + assert decoded == configs + + +def test_config_encoder_accepts_custom_transformers() -> None: + parameters = { + "b": IntegerParameter(5, 6), + "a": FloatParameter(5, 6), + "c": CategoricalParameter(["cat", "mouse", "dog"]), + } + encoder = ConfigEncoder.default( + parameters, + custom_transformers={ + "c": CategoricalToIntegerTransformer(parameters["c"].choices) + }, + ) + assert encoder.transformers["c"] == CategoricalToIntegerTransformer( + parameters["c"].choices + ) + + +def test_config_encoder_removes_constants_in_encoding_and_includes_in_decoding() -> None: + parameters = { + "b": IntegerParameter(5, 6), + "a": FloatParameter(5, 6), + "c": CategoricalParameter(["cat", "mouse", "dog"]), + } + + x = "raspberry" + + encoder = ConfigEncoder.default(parameters, constants={"x": x}) + assert encoder.constants == {"x": x} + + enc_x = encoder.encode([{"a": 5.5, "b": 5, "c": "cat", "x": x}]) + + assert enc_x.shape == (1, 3) # No x, just a, b, c + + dec_x = encoder.decode(enc_x) + assert dec_x == [{"a": 5.5, "b": 5, "c": "cat", "x": x}] + + # This doesn't have to hold true, but it's our current behaviour, we could make + # weaker gaurantees but then we'd have to clone the constants, even if it's very large + assert dec_x[0]["x"] is x + + +def test_config_encoder_complains_if_missing_entry_in_config() -> None: + parameters = { + "b": IntegerParameter(5, 6), + "a": FloatParameter(5, 6), + "c": CategoricalParameter(["cat", "mouse", "dog"]), + } + + encoder = ConfigEncoder.default(parameters) + + with pytest.raises(KeyError): + encoder.encode([{"a": 5.5, "b": 5}]) + + +def test_config_encoder_sorts_parameters_by_name_for_consistent_ordering() -> None: + parameters = { + "b": IntegerParameter(0, 1), + "a": FloatParameter(0, 1), + "c": CategoricalParameter([0, 1]), + } + p1 = dict(sorted(parameters.items())) + p2 = dict(sorted(parameters.items(), reverse=True)) + + encoder_1 = ConfigEncoder.default(p1) + encoder_2 = ConfigEncoder.default(p2) + + assert encoder_1.index_of["a"] == 0 + assert encoder_1.index_of["b"] == 1 + assert encoder_1.index_of["c"] == 2 + + assert encoder_2.index_of["a"] == 0 + assert encoder_2.index_of["b"] == 1 + assert encoder_2.index_of["c"] == 2 From fdfb3633f12b8da375f44fa3c6324005db7c167e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 17:59:48 
+0200 Subject: [PATCH 61/63] fix(Domain): use cardinality for quantizing through unit interval --- neps/search_spaces/domain.py | 28 ++++++++++++++-------------- tests/test_domain.py | 27 ++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 6f6e2693..7f9a914f 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -131,6 +131,7 @@ def __post_init__(self) -> None: " `bins` is `None` and `round` is `False` and" " boundaries are not integers." ) + object.__setattr__(self, "cardinality", cardinality) preferred_dtype = torch.int64 if is_int else torch.float64 object.__setattr__(self, "preffered_dtype", preferred_dtype) @@ -139,7 +140,6 @@ def __post_init__(self) -> None: if is_int: mid = int(round(mid)) - object.__setattr__(self, "cardinality", cardinality) object.__setattr__(self, "midpoint", mid) object.__setattr__(self, "bounds", (self.lower, self.upper)) @@ -188,7 +188,7 @@ def integer( Args: lower: The lower bound of the domain. - upper: The upper bound of the domain. + upper: The upper bound of the domain (inclusive). log: Whether the domain is in log space. bins: The number of discrete bins to split the domain into. is_categorical: Whether the domain is representing a categorical. @@ -236,8 +236,8 @@ def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: elif not dtype.is_floating_point: raise ValueError(f"Unit interval only allows floating dtypes, got {dtype}.") - bins = self.bins - if self.is_unit_float and self.bins is not None: + q = self.cardinality + if self.is_unit_float and q is None: return x.to(dtype) if self.log_bounds is not None: @@ -248,9 +248,9 @@ def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: x = (x - lower) / (upper - lower) - if bins is not None: - quantization_levels = torch.floor(x * bins).clip(0, bins - 1) - x = quantization_levels / (bins - 1) + if q is not None: + quantization_levels = torch.floor(x * q).clip(0, q - 1) + x = quantization_levels / (q - 1) return x.type(dtype) @@ -268,10 +268,10 @@ def from_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: if self.is_unit_float: return x.to(dtype) - bins = self.bins - if bins is not None: - quantization_levels = torch.floor(x * bins).clip(0, bins - 1) - x = quantization_levels / (bins - 1) + q = self.cardinality + if q is not None: + quantization_levels = torch.floor(x * q).clip(0, q - 1) + x = quantization_levels / (q - 1) # Now we scale to the new domain if self.log_bounds is not None: @@ -312,8 +312,8 @@ def cast(self, x: Tensor, frm: Domain, *, dtype: torch.dtype | None = None) -> T # have to go through unit space to figure out the bins same_bounds = self.lower == frm.lower and self.upper == frm.upper same_log_bounds = self.log_bounds == frm.log_bounds - same_bins = self.bins == frm.bins - if same_bounds and same_log_bounds and (self.bins is None or same_bins): + same_cardinality = self.cardinality == frm.cardinality + if same_bounds and same_log_bounds and same_cardinality: if self.round: x = torch.round(x) return x.type(dtype) @@ -327,7 +327,7 @@ def cast(self, x: Tensor, frm: Domain, *, dtype: torch.dtype | None = None) -> T # We can also shortcut out if the only diffrence is that we are coming frm the # log bounds of this domain. 
We dont care if where we came from was binned or not, # we just lift it up with `np.exp` and round if needed - if (self.lower, self.upper) == frm.log_bounds and self.bins is None: + if (self.lower, self.upper) == frm.log_bounds and self.cardinality is None: x = torch.exp(x) if self.round: x = torch.round(x) diff --git a/tests/test_domain.py b/tests/test_domain.py index 0893337f..0b9f2b97 100644 --- a/tests/test_domain.py +++ b/tests/test_domain.py @@ -190,6 +190,30 @@ def test_domain_casting( ] ).transpose(0, 1), ), + ( + # This was a random case found while testing samplers which seemed to fail + # Uniform noise convert to integers + # 0-0.25 -> 12, + # 0.25-0.5 -> 13, + # 0.5-0.75 -> 14 + # 0.75-1 -> 15 + T( + [ + [0.2350, 0.6488, 0.6411], + [0.6457, 0.2897, 0.6879], + [0.7401, 0.4268, 0.7607], + ] + ), + Domain.unit_float(), + Domain.integer(12, 15), + T( + [ + [12, 14, 14], + [14, 13, 14], + [14, 13, 15], + ] + ), + ), ], ) def test_translate( @@ -200,6 +224,3 @@ def test_translate( ) -> None: y = Domain.translate(x, frm=frm, to=to) torch.testing.assert_close(y, expected, check_dtype=True, msg=f"{y} != {expected}") - - x_back = Domain.translate(y, frm=to, to=frm) - torch.testing.assert_close(x_back, x, check_dtype=True, msg=f"{x_back} != {x}") From 35427ebef1737d111134e58164abe7d0fb01a608 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 18:38:32 +0200 Subject: [PATCH 62/63] test(Samplers): Initial tests and dtype fixes --- neps/sampling/priors.py | 43 ++++++++++++++++++++++++++------------- neps/sampling/samplers.py | 34 ++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index e17db04b..3180e0ac 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -106,7 +106,7 @@ def uniform(cls, ncols: int) -> UniformPrior: Args: ncols: The number of columns in the tensor to sample. """ - return UniformPrior(ndims=ncols) + return UniformPrior(ndim=ncols) @classmethod def from_parameters( @@ -342,8 +342,10 @@ def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> torch.Tens first_i, first_dist = next(itr) log_probs = first_dist.log_prob(translated_x[..., first_i]) + _weight = 1 / len(self.distributions) + for i, dist in itr: - log_probs = log_probs + dist.log_prob(translated_x[..., i]) + log_probs = log_probs + _weight * dist.log_prob(translated_x[..., i]) return log_probs @@ -381,18 +383,18 @@ class UniformPrior(Prior): Uses a UnitUniform under the hood before converting to the value domain. """ - ndims: int + ndim: int """The number of columns in the tensor to sample from.""" @property @override def ncols(self) -> int: - return self.ndims + return self.ndim @override def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: # NOTE: We just assume everything is in bounds... 
- shape = x.shape[:-1] + shape = x.shape[:-1] # Select everything up to last dimension (configuration) return torch.zeros(shape, dtype=torch.float64, device=x.device) @override @@ -409,11 +411,16 @@ def sample( raise NotImplementedError("Seeding is not yet implemented.") _n = ( - torch.Size((n, self.ndims)) + torch.Size((n, self.ndim)) if isinstance(n, int) - else torch.Size((*n, self.ndims)) + else torch.Size((*n, self.ndim)) ) - samples = torch.rand(_n, device=device, dtype=dtype) + # Doesn't like integer dtypes + if dtype is not None and dtype.is_floating_point: + samples = torch.rand(_n, device=device, dtype=dtype) + else: + samples = torch.rand(_n, device=device) + return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to, dtype=dtype) @@ -437,9 +444,9 @@ def __post_init__(self) -> None: ) @property - def probabilities(self) -> torch.Tensor: + def sampler_probabilities(self) -> torch.Tensor: """The probabilities for each sampler. Normalized weights.""" - return self._weighted_sampler.probabilities + return self._weighted_sampler.sampler_probabilities @property @override @@ -450,12 +457,20 @@ def ncols(self) -> int: def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: # OPTIM: Avoid an initial allocation by using the output of the first # distribution to store the weighted probabilities - itr = zip(self.probabilities, self.priors, strict=False) + itr = zip(self.sampler_probabilities, self.priors, strict=False) first_prob, first_prior = next(itr) - weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) - for prob, prior in itr: - weighted_probs = weighted_probs + prob * prior.log_prob(x, frm=frm) + if first_prob == 0.0: + weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) + else: + weighted_probs = torch.zeros( + x.shape[:-1], dtype=torch.float64, device=x.device + ) + + for sampler_prob, prior in itr: + if sampler_prob == 0.0: + continue + weighted_probs = weighted_probs + sampler_prob * prior.log_prob(x, frm=frm) return weighted_probs diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index cf1c1e7a..d3439888 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -82,7 +82,7 @@ def uniform(cls, ndim: int) -> UniformPrior: """ from neps.sampling.priors import UniformPrior - return UniformPrior(ndims=ndim) + return UniformPrior(ndim=ndim) @classmethod def borders(cls, ndim: int) -> BorderSampler: @@ -158,8 +158,12 @@ def sample( dimension=self.ndim, scramble=self.scramble, seed=_seed ) - out = torch.empty(_n, self.ncols, dtype=dtype, device=device) - x = sobol.draw(_n, dtype=dtype, out=out) + # If integer dtype, sobol will refuse, we need to cast then + if dtype is not None and not dtype.is_floating_point: + x = sobol.draw(_n, dtype=torch.float64) + x = x.to(dtype=dtype, device=device) + else: + x = sobol.draw(_n, dtype=dtype) # If we got extra dimensions, such as batch dimensions, we need to # reshape the tensor to the desired shape. @@ -179,7 +183,7 @@ class WeightedSampler(Sampler): weights: torch.Tensor """The weights for each sampler.""" - probabilities: torch.Tensor = field(init=False, repr=False) + sampler_probabilities: torch.Tensor = field(init=False, repr=False) """The probabilities for each sampler. 
Normalized weights.""" def __post_init__(self) -> None: @@ -201,7 +205,7 @@ def __post_init__(self) -> None: ) self._ncols = ncols[0] - self.probabilities = self.weights / self.weights.sum() + self.sampler_probabilities = self.weights / self.weights.sum() @property @override @@ -218,6 +222,16 @@ def sample( device: torch.device | None = None, dtype: torch.dtype | None = None, ) -> torch.Tensor: + if dtype is None: + if isinstance(to, Domain): + dtype = to.preffered_dtype + else: + dtype = ( + torch.float64 + if any(d.preffered_dtype.is_floating_point for d in to) + else torch.int64 + ) + if seed is not None: raise NotImplementedError("Seeding is not yet implemented.") @@ -232,7 +246,7 @@ def sample( # Randomly select which sampler to sample from for each of the total_samples chosen_samplers = torch.empty((total_samples,), device=device, dtype=torch.int64) chosen_samplers = torch.multinomial( - self.probabilities, + self.sampler_probabilities, total_samples, replacement=True, generator=seed, @@ -264,9 +278,7 @@ def sample( output_samples[indices] = samples_from_sampler # Reshape to the output shape including ncols dimension - output_samples = output_samples.view(output_shape) - - return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) + return output_samples.view(output_shape) @dataclass @@ -295,6 +307,8 @@ def sample( device: torch.device | None = None, dtype: torch.dtype | None = None, ) -> torch.Tensor: + dtype = dtype or torch.float64 + _arange = torch.arange(self.n_possible, device=device, dtype=torch.int32) # Calculate the total number of samples required if isinstance(n, int): @@ -322,5 +336,5 @@ def sample( bit_masks = 2 ** _arange[: self.ndim] configs = configs.unsqueeze(1).bitwise_and(bit_masks).ne(0).to(dtype) # Reshape to the output shape including ncols dimension - configs.view(output_shape) + configs = configs.view(output_shape) return Domain.translate(configs, frm=UNIT_FLOAT_DOMAIN, to=to) From d59b13458e1fb16a5819753fd2a62c91bfadf389 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 18:38:51 +0200 Subject: [PATCH 63/63] test(Sampler): Include test file -_- --- tests/test_samplers.py | 93 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 tests/test_samplers.py diff --git a/tests/test_samplers.py b/tests/test_samplers.py new file mode 100644 index 00000000..dbc92287 --- /dev/null +++ b/tests/test_samplers.py @@ -0,0 +1,93 @@ +from pytest_cases import parametrize +from neps.sampling.samplers import Sampler, Sobol, WeightedSampler, BorderSampler +from neps.sampling.priors import Prior, UniformPrior, WeightedPrior + +import torch + +from neps.search_spaces.domain import Domain + + +def _make_centered_prior(ndim: int) -> Prior: + return Prior.make_centered( + domains=[Domain.unit_float() for _ in range(ndim)], + centers=[(0.5, 0.5) for _ in range(ndim)], + ) + + +@parametrize( + "sampler", + [ + Sobol(ndim=3), + BorderSampler(ndim=3), + UniformPrior(ndim=3), + # Convenence method for making a distribution around center points + _make_centered_prior(ndim=3), + WeightedSampler( + [UniformPrior(ndim=3), _make_centered_prior(3), Sobol(ndim=3)], + weights=torch.tensor([0.5, 0.25, 0.25]), + ), + WeightedPrior( + [UniformPrior(ndim=3), _make_centered_prior(3), UniformPrior(ndim=3)], + weights=torch.tensor([0.5, 0.25, 0.25]), + ), + ], +) +def test_sampler_samples_into_domain(sampler: Sampler) -> None: + assert sampler.ncols == 3 + + domain_to_sample_into = Domain.integer(12, 15) + for _ in range(10): + x = 
sampler.sample( + n=5, + to=domain_to_sample_into, + seed=None, + ) + + assert x.shape == (5, 3) + assert (x >= 12).all() + assert (x <= 15).all() + + x = sampler.sample( + n=torch.Size((2, 1)), + to=domain_to_sample_into, + seed=None, + ) + assert x.shape == (2, 1, 3) + assert (x >= 12).all() + assert (x <= 15).all() + + +@parametrize( + "prior", + [ + UniformPrior(ndim=3), + # Convenence method for making a distribution around center points + _make_centered_prior(ndim=3), + WeightedPrior( + [UniformPrior(ndim=3), _make_centered_prior(3), UniformPrior(ndim=3)], + weights=torch.tensor([0.5, 0.25, 0.25]), + ), + ], +) +def test_priors_give_positive_pdfs(prior: Prior) -> None: + # NOTE: The uniform prior does not check that + assert prior.ncols == 3 + domain = Domain.floating(10, 100) + + x = prior.sample(n=5, to=domain, seed=None) + assert x.shape == (5, 3) + assert (x >= 10).all() + assert (x <= 100).all() + + probs = prior.prob(x, frm=domain) + assert (probs >= 0).all() + assert probs.shape == (5,) + + x = prior.sample(n=torch.Size((2, 1)), to=domain, seed=None) + assert x.shape == (2, 1, 3) + assert (x >= 10).all() + assert (x <= 100).all() + + probs = prior.prob(x, frm=domain) + assert (probs >= 0).all() + assert probs.shape == (2, 1)
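
Note on the Domain quantization these tests exercise — a minimal sketch, not part of any patch above, using only helpers the tests already import from neps.search_spaces.domain (Domain.unit_float, Domain.integer, Domain.translate); the bounds and variable names are illustrative only. A value is first min-max scaled into [0, 1], snapped to one of `cardinality` levels via floor(x * q).clip(0, q - 1) / (q - 1), and then rescaled into the target bounds:

    import torch

    from neps.search_spaces.domain import Domain

    # Uniform values in [0, 1] translated into the integer domain [12, 15].
    # Cardinality is 4, so each quarter of the unit interval snaps to one integer:
    # [0.00, 0.25) -> 12, [0.25, 0.50) -> 13, [0.50, 0.75) -> 14, [0.75, 1.00] -> 15.
    x = torch.tensor([[0.10, 0.30], [0.60, 0.90]])
    y = Domain.translate(x, frm=Domain.unit_float(), to=Domain.integer(12, 15))
    # Same mapping as the PATCH 61 test case: [[12, 13], [14, 15]]

Because every value in a quarter-interval collapses to the same level, this mapping is lossy in the unit-float direction, which is presumably why the reverse-translation assertion was dropped from test_translate for this case.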