From add9eaa9287a9a9b69acc4138751f6b18cba5721 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 13 Aug 2024 20:07:36 +0200 Subject: [PATCH 01/63] refactor: DeepGP --- .../bayesian_optimization/models/deepGP.py | 634 ------------------ neps/optimizers/models/__init__.py | 0 neps/optimizers/models/deepGP.py | 572 ++++++++++++++++ 3 files changed, 572 insertions(+), 634 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/models/deepGP.py create mode 100644 neps/optimizers/models/__init__.py create mode 100644 neps/optimizers/models/deepGP.py diff --git a/neps/optimizers/bayesian_optimization/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py deleted file mode 100644 index d5145043..00000000 --- a/neps/optimizers/bayesian_optimization/models/deepGP.py +++ /dev/null @@ -1,634 +0,0 @@ -from __future__ import annotations - -import logging -import os -from copy import deepcopy -from pathlib import Path - -import gpytorch -import numpy as np -import torch -import torch.nn as nn - -from ....search_spaces.search_space import ( - CategoricalParameter, - FloatParameter, - IntegerParameter, - SearchSpace, -) - - -def count_non_improvement_steps(root_directory: Path | str) -> int: - root_directory = Path(root_directory) - - all_losses_file = root_directory / "all_losses_and_configs.txt" - best_loss_fiel = root_directory / "best_loss_trajectory.txt" - - # Read all losses from the file in the order they are explored - losses = [ - float(line[6:]) - for line in all_losses_file.read_text(encoding="utf-8").splitlines() - if "Loss: " in line - ] - # Get the best seen loss value - best_loss = float(best_loss_fiel.read_text(encoding="utf-8").splitlines()[-1].strip()) - - # Count the non-improvement - count = 0 - for loss in reversed(losses): - if np.greater(loss, best_loss): - count += 1 - else: - break - - return count - - -class NeuralFeatureExtractor(nn.Module): - """ - Neural network to be used in the DeepGP - """ - - def __init__(self, input_size: int, **kwargs): - super().__init__() - - # Set number of hyperparameters - self.input_size = input_size - - self.n_layers = kwargs.get("n_layers", 2) - self.activation = nn.LeakyReLU() - - layer1_units = kwargs.get("layer1_units", 128) - self.fc1 = nn.Linear(input_size, layer1_units) - self.bn1 = nn.BatchNorm1d(layer1_units) - - previous_layer_units = layer1_units - for i in range(2, self.n_layers): - next_layer_units = kwargs.get(f"layer{i}_units", 256) - setattr( - self, - f"fc{i}", - nn.Linear(previous_layer_units, next_layer_units), - ) - setattr( - self, - f"bn{i}", - nn.BatchNorm1d(next_layer_units), - ) - previous_layer_units = next_layer_units - - setattr( - self, - f"fc{self.n_layers}", - nn.Linear( - previous_layer_units + kwargs.get("cnn_nr_channels", 4), - # accounting for the learning curve features - kwargs.get(f"layer{self.n_layers}_units", 256), - ), - ) - self.cnn = nn.Sequential( - nn.Conv1d( - in_channels=1, - kernel_size=(kwargs.get("cnn_kernel_size", 3),), - out_channels=4, - ), - nn.AdaptiveMaxPool1d(1), - ) - - def forward(self, x, budgets, learning_curves): - # add an extra dimensionality for the budget - # making it nr_rows x 1. 
- budgets = torch.unsqueeze(budgets, dim=1) - # concatenate budgets with examples - x = torch.cat((x, budgets), dim=1) - x = self.fc1(x) - x = self.activation(self.bn1(x)) - - for i in range(2, self.n_layers): - x = self.activation(getattr(self, f"bn{i}")(getattr(self, f"fc{i}")(x))) - - # add an extra dimensionality for the learning curve - # making it nr_rows x 1 x lc_values. - learning_curves = torch.unsqueeze(learning_curves, 1) - lc_features = self.cnn(learning_curves) - # revert the output from the cnn into nr_rows x nr_kernels. - lc_features = torch.squeeze(lc_features, 2) - - # put learning curve features into the last layer along with the higher level features. - x = torch.cat((x, lc_features), dim=1) - x = self.activation(getattr(self, f"fc{self.n_layers}")(x)) - - return x - - -class GPRegressionModel(gpytorch.models.ExactGP): - """ - A simple GP model. - """ - - def __init__( - self, - train_x: torch.Tensor, - train_y: torch.Tensor, - likelihood: gpytorch.likelihoods.GaussianLikelihood, - ): - """ - Constructor of the GPRegressionModel. - - Args: - train_x: The initial train examples for the GP. - train_y: The initial train labels for the GP. - likelihood: The likelihood to be used. - """ - super().__init__(train_x, train_y, likelihood) - - self.mean_module = gpytorch.means.ConstantMean() - self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()) - - def forward(self, x): - mean_x = self.mean_module(x) - covar_x = self.covar_module(x) - - return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) - - -class DeepGP: - """ - Gaussian process with a deep kernel - """ - - def __init__( - self, - pipeline_space: SearchSpace, - neural_network_args: dict | None = None, - logger=None, - surrogate_model_fit_args: dict | None = None, - # IMPORTANT: Checkpointing does not use file locking, - # IMPORTANT: hence, it is not suitable for multiprocessing settings - checkpointing: bool = False, - root_directory: Path | str | None = None, - checkpoint_file: Path | str = "surrogate_checkpoint.pth", - refine_epochs: int = 50, - **kwargs, - ): - self.surrogate_model_fit_args = ( - surrogate_model_fit_args if surrogate_model_fit_args is not None else {} - ) - - self.checkpointing = checkpointing - self.refine_epochs = refine_epochs - if checkpointing: - assert ( - root_directory is not None - ), "neps root_directory must be provided for the checkpointing" - self.root_dir = Path(os.getcwd(), root_directory) - self.checkpoint_path = Path(os.getcwd(), root_directory, checkpoint_file) - - super().__init__() - self.__preprocess_search_space(pipeline_space) - # set the categories array for the encoder - self.categories_array = np.array(self.categories) - - if neural_network_args is None: - neural_network_args = {} - self.nn_args = neural_network_args - - self.device = ( - torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - ) - # self.device = torch.device("cpu") - - # Save the NN args, necessary for preprocessing - self.cnn_kernel_size = neural_network_args.get("cnn_kernel_size", 3) - self.model, self.likelihood, self.mll = self.__initialize_gp_model( - neural_network_args.get("n_layers", 2) - ) - - # build the neural network - self.nn = NeuralFeatureExtractor(self.input_size, **neural_network_args) - - self.logger = logger or logging.getLogger("neps") - - def __initialize_gp_model( - self, - train_size: int, - ) -> tuple[ - GPRegressionModel, - gpytorch.likelihoods.GaussianLikelihood, - gpytorch.mlls.ExactMarginalLogLikelihood, - ]: - """ - Called 
when the surrogate is first initialized or restarted. - - Args: - train_size: The size of the current training set. - - Returns: - model, likelihood, mll - The GP model, the likelihood and - the marginal likelihood. - """ - train_x = torch.ones(train_size, train_size).to(self.device) - train_y = torch.ones(train_size).to(self.device) - - likelihood = gpytorch.likelihoods.GaussianLikelihood().to(self.device) - model = GPRegressionModel( - train_x=train_x, train_y=train_y, likelihood=likelihood - ).to(self.device) - mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model).to(self.device) - return model, likelihood, mll - - def __preprocess_search_space(self, pipeline_space: SearchSpace): - self.categories = [] - self.categorical_hps = [] - - parameter_count = 0 - for hp_name, hp in pipeline_space.items(): - # Collect all categories in a list for the encoder - if isinstance(hp, CategoricalParameter): - self.categorical_hps.append(hp_name) - self.categories.extend(hp.choices) - parameter_count += len(hp.choices) - else: - parameter_count += 1 - - # add 1 for budget - self.input_size = parameter_count - self.continuous_params_size = self.input_size - len(self.categories) - self.min_fidelity = pipeline_space.fidelity.lower - self.max_fidelity = pipeline_space.fidelity.upper - - def __encode_config(self, config: SearchSpace): - categorical_encoding = np.zeros_like(self.categories_array) - continuous_values = [] - - for hp_name, hp in config.items(): - if hp.is_fidelity: - continue # Ignore fidelity - if hp_name in self.categorical_hps: - label = hp.value - categorical_encoding[np.argwhere(self.categories_array == label)] = 1 - else: - continuous_values.append(hp.value_to_normalized(hp.value)) - - continuous_encoding = np.array(continuous_values) - - encoding = np.concatenate([categorical_encoding, continuous_encoding]) - return encoding - - def __extract_budgets( - self, x_train: list[SearchSpace], normalized: bool = True - ) -> np.ndarray: - budgets = np.array([config.fidelity.value for config in x_train], dtype=np.single) - if normalized: - normalized_budgets = (budgets - self.min_fidelity) / ( - self.max_fidelity - self.min_fidelity - ) - budgets = normalized_budgets - return budgets - - def __preprocess_learning_curves( - self, learning_curves: list[list[float]], padding_value: float = 0.0 - ) -> np.ndarray: - # Add padding to the learning curves to make them the same size - - # Get max learning_curve length - max_length = 0 - for lc in learning_curves: - length = len(lc) - if length > max_length: - max_length = length - - for lc in learning_curves: - # add padding to the learning curve to fit the cnn kernel or - # the max_length depending on which is the largest - padding_length = max([max_length - len(lc), self.cnn_kernel_size - len(lc)]) - lc.extend([padding_value] * padding_length) - - # TODO: check if the lc values are within bounds [0, 1] (karibbov) - # TODO: add normalize_lcs option in the future - - return np.array(learning_curves, dtype=np.single) - - def __reset_xy( - self, - x_train: list[SearchSpace], - y_train: list[float], - learning_curves: list[list[float]], - normalize_y: bool = False, - normalize_budget: bool = True, - ): - self.normalize_budget = normalize_budget - self.normalize_y = normalize_y - - x_train, train_budgets, learning_curves = self._preprocess_input( - x_train, learning_curves, normalize_budget - ) - - y_train = self._preprocess_y(y_train, normalize_y) - - self.x_train = x_train - self.train_budgets = train_budgets - self.learning_curves = 
learning_curves - self.y_train = y_train - - def _preprocess_input( - self, - x: list[SearchSpace], - learning_curves: list[list[float]], - normalize_budget: bool = True, - ): - budgets = self.__extract_budgets(x, normalize_budget) - learning_curves = self.__preprocess_learning_curves(learning_curves) - - x = np.array([self.__encode_config(config) for config in x], dtype=np.single) - - x = torch.tensor(x).to(device=self.device) - budgets = torch.tensor(budgets).to(device=self.device) - learning_curves = torch.tensor(learning_curves).to(device=self.device) - - return x, budgets, learning_curves - - def _preprocess_y(self, y_train: list[float], normalize_y: bool = False): - y_train_array = np.array(y_train, dtype=np.single) - self.min_y = y_train_array.min() - self.max_y = y_train_array.max() - if normalize_y: - y_train_array = (y_train_array - self.min_y) / (self.max_y - self.min_y) - y_train_array = torch.tensor(y_train_array).to(device=self.device) - return y_train_array - - def fit( - self, - x_train: list[SearchSpace], - y_train: list[float], - learning_curves: list[list[float]], - ): - self._fit(x_train, y_train, learning_curves, **self.surrogate_model_fit_args) - - def _fit( - self, - x_train: list[SearchSpace], - y_train: list[float], - learning_curves: list[list[float]], - normalize_y: bool = False, - normalize_budget: bool = True, - n_epochs: int = 1000, - batch_size: int = 64, - optimizer_args: dict | None = None, - early_stopping: bool = True, - patience: int = 10, - perf_patience: int = 10, - ): - self.__reset_xy( - x_train, - y_train, - learning_curves, - normalize_y=normalize_y, - normalize_budget=normalize_budget, - ) - self.model, self.likelihood, self.mll = self.__initialize_gp_model(len(y_train)) - self.nn = NeuralFeatureExtractor(self.input_size, **self.nn_args) - self.model.to(self.device) - self.likelihood.to(self.device) - self.nn.to(self.device) - - if self.checkpointing and self.checkpoint_path.exists(): - non_improvement_steps = count_non_improvement_steps(self.root_dir) - # If checkpointing and patience is not exhausted load a partial model - if non_improvement_steps < perf_patience: - n_epochs = self.refine_epochs - self.load_checkpoint() - self.logger.debug(f"No improvement for: {non_improvement_steps} evaulations") - self.logger.debug(f"N Epochs for the full training: {n_epochs}") - - initial_state = self.get_state() - try: - self.__train_model( - self.x_train, - self.train_budgets, - self.learning_curves, - self.y_train, - n_epochs=n_epochs, - batch_size=batch_size, - optimizer_args=optimizer_args, - early_stopping=early_stopping, - patience=patience, - ) - if self.checkpointing: - self.save_checkpoint() - except gpytorch.utils.errors.NotPSDError: - self.logger.info("Model training failed loading the untrained model") - self.load_checkpoint(initial_state) - # Delete checkpoint to restart training - self.delete_checkpoint() - - def __train_model( - self, - x_train: torch.Tensor, - train_budgets: torch.Tensor, - learning_curves: torch.Tensor, - y_train: torch.Tensor, - n_epochs: int = 1000, - batch_size: int = 64, - optimizer_args: dict | None = None, - early_stopping: bool = True, - patience: int = 10, - ): - if optimizer_args is None: - optimizer_args = {"lr": 0.001} - - self.model.train() - self.likelihood.train() - self.nn.train() - self.optimizer = torch.optim.Adam( - [ - dict({"params": self.model.parameters()}, **optimizer_args), - dict({"params": self.nn.parameters()}, **optimizer_args), - ] - ) - - count_down = patience - min_avg_loss_val = np.inf - 
average_loss: float = 0.0 - - for epoch_nr in range(0, n_epochs): - if early_stopping and count_down == 0: - self.logger.info( - f"Epoch: {epoch_nr - 1} surrogate training stops due to early " - f"stopping with the patience: {patience} and " - f"the minimum average loss of {min_avg_loss_val} and " - f"the final average loss of {average_loss}" - ) - break - - n_examples_batch = x_train.size(dim=0) - - # get a random permutation for mini-batches - permutation = torch.randperm(n_examples_batch) - - # optimize over mini-batches - total_scaled_loss = 0.0 - for batch_idx, start_index in enumerate( - range(0, n_examples_batch, batch_size) - ): - end_index = start_index + batch_size - if end_index > n_examples_batch: - end_index = n_examples_batch - indices = permutation[start_index:end_index] - batch_x, batch_budget, batch_lc, batch_y = ( - x_train[indices], - train_budgets[indices], - learning_curves[indices], - y_train[indices], - ) - - minibatch_size = end_index - start_index - # if only one example in the batch, skip the batch. - # Otherwise, the code will fail because of batchnorm - if minibatch_size <= 1: - continue - - # Zero backprop gradients - self.optimizer.zero_grad() - - projected_x = self.nn(batch_x, batch_budget, batch_lc) - self.model.set_train_data(projected_x, batch_y, strict=False) - output = self.model(projected_x) - - # try: - # Calc loss and backprop derivatives - loss = -self.mll(output, self.model.train_targets) - episodic_loss_value: float = loss.detach().to("cpu").item() - # weighted sum over losses in the batch - total_scaled_loss = ( - total_scaled_loss + episodic_loss_value * minibatch_size - ) - - mse = gpytorch.metrics.mean_squared_error( - output, self.model.train_targets - ) - self.logger.debug( - f"Epoch {epoch_nr} Batch {batch_idx} - MSE {mse:.5f}, " - f"Loss: {episodic_loss_value:.3f}, " - f"lengthscale: {self.model.covar_module.base_kernel.lengthscale.item():.3f}, " - f"noise: {self.model.likelihood.noise.item():.3f}, " - ) - - loss.backward() - self.optimizer.step() - - # Get average weighted loss over every batch - average_loss = total_scaled_loss / n_examples_batch - if average_loss < min_avg_loss_val: - min_avg_loss_val = average_loss - count_down = patience - elif early_stopping: - self.logger.debug( - f"No improvement over the minimum loss value of {min_avg_loss_val} " - f"for the past {patience - count_down} epochs " - f"the training will stop in {count_down} epochs" - ) - count_down -= 1 - # except Exception as training_error: - # self.logger.error( - # f'The following error happened while training: {training_error}') - # # An error has happened, trigger the restart of the optimization and restart - # # the model with default hyperparameters. 
- # self.restart = True - # training_errored = True - # break - - def set_prediction_learning_curves(self, learning_curves: list[list[float]]): - self.prediction_learning_curves = learning_curves - - def predict( - self, x: list[SearchSpace], learning_curves: list[list[float]] | None = None - ): - # Preprocess input - if learning_curves is None: - learning_curves = self.prediction_learning_curves - x_test, test_budgets, learning_curves = self._preprocess_input( - x, learning_curves, self.normalize_budget - ) - - self.model.eval() - self.nn.eval() - self.likelihood.eval() - - with torch.no_grad(): - projected_train_x = self.nn( - self.x_train, self.train_budgets, self.learning_curves - ) - self.model.set_train_data( - inputs=projected_train_x, targets=self.y_train, strict=False - ) - - projected_test_x = self.nn(x_test, test_budgets, learning_curves) - - preds = self.likelihood(self.model(projected_test_x)) - - means = preds.mean.detach().cpu() - - if self.normalize_y: - means = (means + self.min_y) * (self.max_y - self.min_y) - - cov = torch.diag(torch.pow(preds.stddev.detach(), 2)).cpu() - - return means, cov - - def load_checkpoint(self, state: dict | None = None): - """ - Load the state from a previous checkpoint. - """ - if state is None: - checkpoint = torch.load(self.checkpoint_path) - else: - checkpoint = state - self.model.load_state_dict(checkpoint["gp_state_dict"]) - self.nn.load_state_dict(checkpoint["nn_state_dict"]) - self.likelihood.load_state_dict(checkpoint["likelihood_state_dict"]) - - self.model.to(self.device) - self.likelihood.to(self.device) - self.nn.to(self.device) - - def save_checkpoint(self, state: dict | None = None): - """ - Save the given state or the current state in a - checkpoint file. - - Args: - checkpoint_path: path to the checkpoint file - state: The state to save, if none, it will - save the current state. - """ - - if state is None: - torch.save( - self.get_state(), - self.checkpoint_path, - ) - else: - torch.save( - state, - self.checkpoint_path, - ) - - def get_state(self) -> dict[str, dict]: - """ - Get the current state of the surrogate. - - Returns: - current_state: A dictionary that represents - the current state of the surrogate model. 
- """ - current_state = { - "gp_state_dict": deepcopy(self.model.state_dict()), - "nn_state_dict": deepcopy(self.nn.state_dict()), - "likelihood_state_dict": deepcopy(self.likelihood.state_dict()), - } - - return current_state - - def delete_checkpoint(self): - self.checkpoint_path.unlink(missing_ok=True) diff --git a/neps/optimizers/models/__init__.py b/neps/optimizers/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neps/optimizers/models/deepGP.py b/neps/optimizers/models/deepGP.py new file mode 100644 index 00000000..e0c225e6 --- /dev/null +++ b/neps/optimizers/models/deepGP.py @@ -0,0 +1,572 @@ +from __future__ import annotations +from dataclasses import dataclass, field + +import logging +import os +from copy import deepcopy +from pathlib import Path + +import gpytorch +import numpy as np +import torch +import torch.nn as nn +from neps.search_spaces.architecture.graph_grammar import GraphParameter + +from neps.exceptions import SurrogateFailedToFit + +from neps.search_spaces.search_space import ( + CategoricalParameter, + FloatParameter, + IntegerParameter, + SearchSpace, +) + +logger = logging.getLogger(__name__) + + +def count_non_improvement_steps(root_directory: Path | str) -> int: + root_directory = Path(root_directory) + + all_losses_file = root_directory / "all_losses_and_configs.txt" + best_loss_fiel = root_directory / "best_loss_trajectory.txt" + + # Read all losses from the file in the order they are explored + losses = [ + float(line[6:]) + for line in all_losses_file.read_text(encoding="utf-8").splitlines() + if "Loss: " in line + ] + # Get the best seen loss value + best_loss = float(best_loss_fiel.read_text(encoding="utf-8").splitlines()[-1].strip()) + + # Count the non-improvement + count = 0 + for loss in reversed(losses): + if np.greater(loss, best_loss): + count += 1 + else: + break + + return count + + +class NeuralFeatureExtractor(nn.Module): + """ + Neural network to be used in the DeepGP + """ + + def __init__(self, input_size: int, **kwargs): + super().__init__() + + # Set number of hyperparameters + self.input_size = input_size + + self.n_layers = kwargs.get("n_layers", 2) + self.activation = nn.LeakyReLU() + + layer1_units = kwargs.get("layer1_units", 128) + self.fc1 = nn.Linear(input_size, layer1_units) + self.bn1 = nn.BatchNorm1d(layer1_units) + + previous_layer_units = layer1_units + for i in range(2, self.n_layers): + next_layer_units = kwargs.get(f"layer{i}_units", 256) + setattr( + self, + f"fc{i}", + nn.Linear(previous_layer_units, next_layer_units), + ) + setattr( + self, + f"bn{i}", + nn.BatchNorm1d(next_layer_units), + ) + previous_layer_units = next_layer_units + + setattr( + self, + f"fc{self.n_layers}", + nn.Linear( + previous_layer_units + kwargs.get("cnn_nr_channels", 4), + # accounting for the learning curve features + kwargs.get(f"layer{self.n_layers}_units", 256), + ), + ) + self.cnn = nn.Sequential( + nn.Conv1d( + in_channels=1, + kernel_size=(kwargs.get("cnn_kernel_size", 3),), + out_channels=4, + ), + nn.AdaptiveMaxPool1d(1), + ) + + def forward(self, x, budgets, learning_curves): + # add an extra dimensionality for the budget + # making it nr_rows x 1. 
+ budgets = torch.unsqueeze(budgets, dim=1) + # concatenate budgets with examples + x = torch.cat((x, budgets), dim=1) + x = self.fc1(x) + x = self.activation(self.bn1(x)) + + for i in range(2, self.n_layers): + x = self.activation(getattr(self, f"bn{i}")(getattr(self, f"fc{i}")(x))) + + # add an extra dimensionality for the learning curve + # making it nr_rows x 1 x lc_values. + learning_curves = torch.unsqueeze(learning_curves, 1) + lc_features = self.cnn(learning_curves) + # revert the output from the cnn into nr_rows x nr_kernels. + lc_features = torch.squeeze(lc_features, 2) + + # put learning curve features into the last layer along with the higher level features. + x = torch.cat((x, lc_features), dim=1) + x = self.activation(getattr(self, f"fc{self.n_layers}")(x)) + + return x + + +class GPRegressionModel(gpytorch.models.ExactGP): + """ + A simple GP model. + """ + + def __init__( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + likelihood: gpytorch.likelihoods.GaussianLikelihood, + ): + """ + Constructor of the GPRegressionModel. + + Args: + train_x: The initial train examples for the GP. + train_y: The initial train labels for the GP. + likelihood: The likelihood to be used. + """ + super().__init__(train_x, train_y, likelihood) + + self.mean_module = gpytorch.means.ConstantMean() + self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + + +@dataclass +class DeepGPDataTransformer: + # TODO: This class could be used for other models as well + space: SearchSpace + fidelity_bounds: tuple[int | float, int | float] | None + normalize_y: bool + min_learning_curve_length: int + learning_curve_pad_value: float + device: torch.device + + numericals: dict[str, FloatParameter | IntegerParameter] = field(init=False) + categoricals: dict[str, CategoricalParameter] = field(init=False) + output_dim: int = field(init=False) + + def __post_init__(self) -> None: + self.numericals = { + name: h + for name, h in self.space.items() + if isinstance(h, (FloatParameter, IntegerParameter)) + } + self.categoricals = { + name: h + for name, h in self.space.items() + if isinstance(h, CategoricalParameter) + } + self.output_dim = len(self.numericals) + sum( + len(c.choices) for c in self.categoricals.values() + ) + + def encode_configs( + self, + configs: list[SearchSpace], + ) -> tuple[torch.Tensor, torch.Tensor]: + x_buffer = torch.empty( + (len(configs), self.output_dim), + device=self.device, + dtype=torch.float32, + ) + + # Normals are just fill the columns with the normalized values + for i, (hp_name, hp) in enumerate(self.numericals.items()): + budget_tensor = torch.tensor( + [config[hp_name].value for config in configs], + device=self.device, + dtype=torch.float32, + ) + + x_buffer[:, i] = (budget_tensor - hp.lower) / (hp.upper - hp.lower) + + # Categoricals is a bit harder, we create a tensor with all the indices (values) + # as we did above, but then we sub-select the portion of the buffer for that categorical + # before inserting the one-hot encoding. + offset = len(self.numericals) + for hp_name, hp in self.categoricals.items(): + budget_tensor = torch.tensor( + [config[hp_name]._value_index for config in configs], # type: ignore + device=self.device, + dtype=torch.float64, + ) + + # .. 
and insert one-hot encoding (ChatGPT solution, verified locally) + portion = x_buffer[:, offset : offset + len(hp.choices)] + portion.scatter_(1, budget_tensor.unsqueeze(1), 1) + + offset += len(hp.choices) + + # Finally, ... budgets + budgets = [config.fidelity.value for config in configs] # type: ignore + budget_tensor = torch.tensor(budgets, device=self.device, dtype=torch.float32) + if self.fidelity_bounds: + assert self.space.fidelity is not None + _min = self.space.fidelity.lower + _max = self.space.fidelity.upper + budget_tensor.sub_(_min).div_(_max - _min) + + return x_buffer, budget_tensor + + def encode_learning_curves(self, learning_curves: list[list[float]]) -> torch.Tensor: + lc_height = len(learning_curves) + lc_width = max( + max(len(lc) for lc in learning_curves), self.min_learning_curve_length + ) + lc_buffer = torch.full( + (lc_width, lc_height), + self.learning_curve_pad_value, + device=self.device, + dtype=torch.float32, + ) + + for i, lc in enumerate(learning_curves): + lc_buffer[: len(lc), i] = torch.tensor( + lc, device=self.device, dtype=torch.float32 + ) + + return lc_buffer + + def encode_y( + self, y: list[float] + ) -> tuple[torch.Tensor, None | tuple[int | float, int | float]]: + tensor = torch.tensor(y, device=self.device, dtype=torch.float32) + if self.fidelity_bounds: + _min, _max = tensor.min(), tensor.max() + tensor.sub_(_min).div_(_max - _min) + bounds = (_min.detach().item(), _max.detach().item()) + else: + bounds = None + + return tensor, bounds + + +def _train_model( + x_train: torch.Tensor, + train_budgets: torch.Tensor, + learning_curves: torch.Tensor, + model: GPRegressionModel, + likelihood: gpytorch.likelihoods.GaussianLikelihood, + device: torch.device, + nn: NeuralFeatureExtractor, + y_train: torch.Tensor, + n_epochs: int = 1000, + batch_size: int = 64, + optimizer_args: dict | None = None, + early_stopping: bool = True, + patience: int = 10, +): + if optimizer_args is None: + optimizer_args = {"lr": 0.001} + + mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model).to(device) + + # Set to training mode + mll.train() + model.train() + likelihood.train() + nn.train() + + optimizer = torch.optim.Adam( + [ + dict({"params": model.parameters()}, **optimizer_args), + dict({"params": nn.parameters()}, **optimizer_args), + ] + ) + + count_down = patience + min_avg_loss_val = np.inf + average_loss: float = 0.0 + + for epoch_nr in range(0, n_epochs): + if early_stopping and count_down == 0: + logger.info( + f"Epoch: {epoch_nr - 1} surrogate training stops due to early " + f"stopping with the patience: {patience} and " + f"the minimum average loss of {min_avg_loss_val} and " + f"the final average loss of {average_loss}" + ) + break + + n_examples_batch = x_train.size(dim=0) + + # get a random permutation for mini-batches + permutation = torch.randperm(n_examples_batch) + + # optimize over mini-batches + total_scaled_loss = 0.0 + for batch_idx, start_index in enumerate(range(0, n_examples_batch, batch_size)): + end_index = min(start_index + batch_size, n_examples_batch) + minibatch_size = end_index - start_index + 1 + + # if only one example in the batch, skip the batch. 
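+            # (A single sample gives BatchNorm1d nothing to estimate
+            # batch statistics from.)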
+ # Otherwise, the code will fail because of batchnorm + if minibatch_size <= 1: + continue + + indices = permutation[start_index:end_index] + + batch_x, batch_budget, batch_lc, batch_y = ( + x_train[indices], + train_budgets[indices], + learning_curves[indices], + y_train[indices], + ) + + # Zero backprop gradients + optimizer.zero_grad() + + projected_x = nn(batch_x, batch_budget, batch_lc) + model.set_train_data(projected_x, batch_y, strict=False) + output = model(projected_x) + + # Calc loss and backprop derivatives + loss = -mll(output, model.train_targets) # type: ignore + episodic_loss_value: float = loss.detach().to("cpu").item() + # weighted sum over losses in the batch + total_scaled_loss = total_scaled_loss + episodic_loss_value * minibatch_size + + mse = gpytorch.metrics.mean_squared_error(output, model.train_targets) + logger.debug( + f"Epoch {epoch_nr} Batch {batch_idx} - MSE {mse:.5f}, " + f"Loss: {episodic_loss_value:.3f}, " + f"lengthscale: {model.covar_module.base_kernel.lengthscale.item():.3f}, " + f"noise: {model.likelihood.noise.item():.3f}, " # type: ignore + ) + + loss.backward() + optimizer.step() + + # Get average weighted loss over every batch + average_loss = total_scaled_loss / n_examples_batch + if average_loss < min_avg_loss_val: + min_avg_loss_val = average_loss + count_down = patience + elif early_stopping: + logger.debug( + f"No improvement over the minimum loss value of {min_avg_loss_val} " + f"for the past {patience - count_down} epochs " + f"the training will stop in {count_down} epochs" + ) + count_down -= 1 + + +@dataclass +class DeepGP: + """Gaussian process with a deep kernel.""" + + # Required + pipeline_space: SearchSpace + + # Optional + learning_curve_pad_value: float = 0.0 + root_directory: Path | None = None + # IMPORTANT: Checkpointing does not use file locking + # IMPORTANT: hence it is not suitable for multiprocessing settings + checkpoint_file: Path | str = "surrogate_checkpoint.pth" + checkpointing: bool = False + early_stopping: bool = True + batch_size: int = 64 + n_epochs: int = 1000 + patience: int = 10 + refine_epochs: int = 50 + perf_patience: int = 10 + device: torch.device = field( + default_factory=lambda: torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) + normalize_budget: bool = True + normalize_y: bool = True + neural_network_args: dict = field(default_factory=dict) + surrogate_model_fit_args: dict = field(default_factory=dict) + optimizer_args: dict = field(default_factory=dict) + + # Created from the above arguments + # TODO: Lift this out of DeepGP and let the optimizer worry about pre-processing + preprocessor: DeepGPDataTransformer = field(init=False) + + # Post fit parameters, following scikit-learn convention of appending an underscore + model_: GPRegressionModel | None = field(init=False) + likelihood_: gpytorch.likelihoods.GaussianLikelihood | None = field(init=False) + nn_: NeuralFeatureExtractor | None = field(init=False) + projected_x_train_: torch.Tensor | None = field(init=False) + y_train_: torch.Tensor | None = field(init=False) + y_bounds_: tuple[float, float] | None = field(init=False) + + def __post_init__(self): + if any(isinstance(h, GraphParameter) for h in self.pipeline_space.values()): + raise ValueError("Graph parameters are not supported for DeepGP") + + if self.normalize_budget: + budget_bounds = (pipeline_space.fidelity.lower, pipeline_space.fidelity.upper) # type: ignore + else: + budget_bounds = None + + if self.checkpointing: + assert ( + self.root_directory is 
not None + ), "neps root_directory must be provided for the checkpointing" + self.checkpoint_path = self.root_directory / self.checkpoint_file + + self.preprocessor = DeepGPDataTransformer( + space=self.pipeline_space, + fidelity_bounds=budget_bounds, + normalize_y=self.normalize_y, + min_learning_curve_length=self.neural_network_args.get("cnn_kernel_size", 3), + learning_curve_pad_value=self.learning_curve_pad_value, + device=self.device, + ) + self.model_ = None + self.likelihood_ = None + self.nn_ = None + + def fit( + self, + x_train: list[SearchSpace], + y_train: list[float], + learning_curves: list[list[float]], + ): + x_, train_budget = self.preprocessor.encode_configs(x_train) + curves = self.preprocessor.encode_learning_curves(learning_curves) + y_, y_bounds = self.preprocessor.encode_y(y_train) + + # Required for predictions later + self.y_train_ = y_ + self.y_bounds_ = y_bounds + + input_dim = x_.shape[1] + + # Initial state + likelihood = gpytorch.likelihoods.GaussianLikelihood().to(self.device) + model = GPRegressionModel(train_x=x_, train_y=y_, likelihood=likelihood).to( + self.device + ) + nn = NeuralFeatureExtractor(input_dim, **self.neural_network_args).to(self.device) + + # If checkpointing and we are improving, load existing model + if self.checkpointing and self.checkpoint_path.exists(): + assert self.root_directory is not None + + non_improvement_steps = count_non_improvement_steps(self.root_directory) + if non_improvement_steps < self.perf_patience: + n_epochs = self.refine_epochs + + checkpoint = torch.load(self.checkpoint_path) + model.load_state_dict(checkpoint["gp_state_dict"]) + nn.load_state_dict(checkpoint["nn_state_dict"]) + likelihood.load_state_dict(checkpoint["likelihood_state_dict"]) + else: + n_epochs = self.n_epochs + logger.debug(f"No improvement for: {non_improvement_steps} evaulations") + else: + # Starting from scratch + n_epochs = self.n_epochs + + logger.debug(f"N Epochs for the full training: {self.n_epochs}") + + try: + _train_model( + x_train=x_, + train_budgets=train_budget, + learning_curves=curves, + y_train=y_, + model=model, + likelihood=likelihood, + nn=nn, + n_epochs=n_epochs, + device=self.device, + batch_size=self.batch_size, + optimizer_args=self.optimizer_args, + early_stopping=self.early_stopping, + patience=self.patience, + ) + self.model_ = model + self.likelihood_ = likelihood + self.nn_ = nn + + nn.eval() + # Cheaper to do this once during fit, rather than on each call to predict + self.projected_x_train_ = nn(x_, train_budget, curves) + + if self.checkpointing: + torch.save( + { + "gp_state_dict": deepcopy(model).state_dict(), + "nn_state_dict": deepcopy(nn).state_dict(), + "likelihood_state_dict": deepcopy(likelihood.state_dict()), + }, + self.checkpoint_path, + ) + except gpytorch.utils.errors.NotPSDError as e: + logger.error( + "Model training failed loading the untrained model", exc_info=True + ) + # Delete checkpoint to restart training + self.checkpoint_path.unlink(missing_ok=True) + raise SurrogateFailedToFit("DeepGP Failed to fit the training data!") from e + + def predict( + self, x: list[SearchSpace], learning_curves: list[list[float]] + ) -> tuple[torch.Tensor, torch.Tensor]: + assert self.model_ is not None, "Please fit the model first" + assert self.nn_ is not None, "Please fit the model first" + assert self.likelihood_ is not None, "Please fit the model first" + assert self.projected_x_train_ is not None, "Please fit the model first" + assert self.y_train_ is not None, "Please fit the model first" + assert 
self.y_bounds_ is not None, "Please fit the model first" + + self.model_.eval() + self.nn_.eval() + self.likelihood_.eval() + + x_test, test_budgets = self.preprocessor.encode_configs(x) + _curves = self.preprocessor.encode_learning_curves(learning_curves) + + with torch.no_grad(): + # Set GP prior + self.model_.set_train_data( + inputs=self.projected_x_train_, + targets=self.y_train_, + strict=False, + ) + + projected_test_x = self.nn_(x_test, test_budgets, _curves) + preds = self.likelihood_(self.model_(projected_test_x)) + + means = preds.mean.detach().cpu() + + if self.normalize_y: + _min, _max = self.y_bounds_ + means = (means + _min) * (_max - _min) + + cov = torch.diag(torch.pow(preds.stddev.detach(), 2)).cpu() + + return means, cov From 3aba212c233fe73fc37db18142adae67dcaef20a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 14 Aug 2024 14:56:47 +0200 Subject: [PATCH 02/63] fix: Address comements from @karibbov --- neps/exceptions.py | 4 +++ neps/optimizers/models/deepGP.py | 44 ++++++++++++++------------------ 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/neps/exceptions.py b/neps/exceptions.py index 7054d7c6..edf0232c 100644 --- a/neps/exceptions.py +++ b/neps/exceptions.py @@ -52,3 +52,7 @@ class WorkerRaiseError(NePSError): Includes additional information on how to recover """ + + +class SurrogateFailedToFitError(NePSError): + """Raised when a surrogate model fails to fit.""" diff --git a/neps/optimizers/models/deepGP.py b/neps/optimizers/models/deepGP.py index e0c225e6..a94988ab 100644 --- a/neps/optimizers/models/deepGP.py +++ b/neps/optimizers/models/deepGP.py @@ -176,7 +176,7 @@ def __post_init__(self) -> None: self.numericals = { name: h for name, h in self.space.items() - if isinstance(h, (FloatParameter, IntegerParameter)) + if isinstance(h, (FloatParameter, IntegerParameter)) and not h.is_fidelity } self.categoricals = { name: h @@ -254,18 +254,8 @@ def encode_learning_curves(self, learning_curves: list[list[float]]) -> torch.Te return lc_buffer - def encode_y( - self, y: list[float] - ) -> tuple[torch.Tensor, None | tuple[int | float, int | float]]: - tensor = torch.tensor(y, device=self.device, dtype=torch.float32) - if self.fidelity_bounds: - _min, _max = tensor.min(), tensor.max() - tensor.sub_(_min).div_(_max - _min) - bounds = (_min.detach().item(), _max.detach().item()) - else: - bounds = None - - return tensor, bounds + def encode_y(self, y: list[float]) -> torch.Tensor: + return torch.tensor(y, device=self.device, dtype=torch.float32) def _train_model( @@ -397,7 +387,8 @@ class DeepGP: n_epochs: int = 1000 patience: int = 10 refine_epochs: int = 50 - perf_patience: int = 10 + perf_patience_factor: float = 1.2 # X * max_fidelity + n_initial_full_trainings: int = 10 device: torch.device = field( default_factory=lambda: torch.device("cuda") if torch.cuda.is_available() @@ -412,6 +403,7 @@ class DeepGP: # Created from the above arguments # TODO: Lift this out of DeepGP and let the optimizer worry about pre-processing preprocessor: DeepGPDataTransformer = field(init=False) + max_fidelity: int | float = field(init=False) # Post fit parameters, following scikit-learn convention of appending an underscore model_: GPRegressionModel | None = field(init=False) @@ -419,7 +411,6 @@ class DeepGP: nn_: NeuralFeatureExtractor | None = field(init=False) projected_x_train_: torch.Tensor | None = field(init=False) y_train_: torch.Tensor | None = field(init=False) - y_bounds_: tuple[float, float] | None = field(init=False) def __post_init__(self): if 
any(isinstance(h, GraphParameter) for h in self.pipeline_space.values()): @@ -436,6 +427,7 @@ def __post_init__(self): ), "neps root_directory must be provided for the checkpointing" self.checkpoint_path = self.root_directory / self.checkpoint_file + self.max_fidelity = self.pipeline_space.fidelity.upper # type: ignore self.preprocessor = DeepGPDataTransformer( space=self.pipeline_space, fidelity_bounds=budget_bounds, @@ -447,6 +439,8 @@ def __post_init__(self): self.model_ = None self.likelihood_ = None self.nn_ = None + self.projected_x_train_ = None + self.y_train_ = None def fit( self, @@ -456,11 +450,10 @@ def fit( ): x_, train_budget = self.preprocessor.encode_configs(x_train) curves = self.preprocessor.encode_learning_curves(learning_curves) - y_, y_bounds = self.preprocessor.encode_y(y_train) + y_ = self.preprocessor.encode_y(y_train) # Required for predictions later self.y_train_ = y_ - self.y_bounds_ = y_bounds input_dim = x_.shape[1] @@ -476,7 +469,12 @@ def fit( assert self.root_directory is not None non_improvement_steps = count_non_improvement_steps(self.root_directory) - if non_improvement_steps < self.perf_patience: + + patience_steps = self.perf_patience_factor * self.max_fidelity + if ( + len(y_train) >= self.n_initial_full_trainings + and non_improvement_steps < patience_steps + ): n_epochs = self.refine_epochs checkpoint = torch.load(self.checkpoint_path) @@ -534,14 +532,15 @@ def fit( raise SurrogateFailedToFit("DeepGP Failed to fit the training data!") from e def predict( - self, x: list[SearchSpace], learning_curves: list[list[float]] + self, + x: list[SearchSpace], + learning_curves: list[list[float]], ) -> tuple[torch.Tensor, torch.Tensor]: assert self.model_ is not None, "Please fit the model first" assert self.nn_ is not None, "Please fit the model first" assert self.likelihood_ is not None, "Please fit the model first" assert self.projected_x_train_ is not None, "Please fit the model first" assert self.y_train_ is not None, "Please fit the model first" - assert self.y_bounds_ is not None, "Please fit the model first" self.model_.eval() self.nn_.eval() @@ -562,11 +561,6 @@ def predict( preds = self.likelihood_(self.model_(projected_test_x)) means = preds.mean.detach().cpu() - - if self.normalize_y: - _min, _max = self.y_bounds_ - means = (means + _min) * (_max - _min) - cov = torch.diag(torch.pow(preds.stddev.detach(), 2)).cpu() return means, cov From 40b6830d6914e988abf6b5fa7e96881a4bad5443 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 14 Aug 2024 15:03:46 +0200 Subject: [PATCH 03/63] fix: Import from the moved file --- neps/optimizers/bayesian_optimization/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index c76bedfd..6279e973 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -4,7 +4,7 @@ from .gp_hierarchy import ComprehensiveGPHierarchy try: - from .deepGP import DeepGP + from neps.optimizers.models.deepGP import DeepGP except ImportError as e: DeepGP = MissingDependencyError("gpytorch", e) From df4e089ce537c52cf96da379771686c5e9c89aa1 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 14 Aug 2024 18:40:59 +0200 Subject: [PATCH 04/63] refactor: Cleanup vectorial kernels --- .../kernels/combine_kernels.py | 20 +- .../kernels/combine_kernels_hierarchy.py | 3 - .../kernels/vectorial_kernels.py | 339 ++++---- 
.../models/deepGP.py | 1 - .../bayesian_optimization/models/gp.py | 749 ++++++------------ neps/optimizers/models/__init__.py | 0 neps/search_spaces/search_space.py | 21 +- 7 files changed, 434 insertions(+), 699 deletions(-) rename neps/optimizers/{ => bayesian_optimization}/models/deepGP.py (99%) delete mode 100644 neps/optimizers/models/__init__.py diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py index 0e464713..3aa320b5 100644 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py @@ -46,7 +46,7 @@ def fit_transform( rebuild_model: bool = True, save_gram_matrix: bool = True, gp_fit: bool = True, - feature_lengthscale: list = None, + feature_lengthscale: dict[str, torch.Tensor] | None = None, **kwargs, ): N = len(configs) @@ -147,16 +147,13 @@ def transform( return K.t() - def clamp_theta_vector(self, theta_vector): - if theta_vector is None: - return None + def clamp_theta_vector( + self, theta_vector: dict[str, torch.Tensor] + ) -> dict[str, torch.Tensor]: + for t_ in theta_vector.values(): + if t_.is_leaf: + t_.clamp_(self.lengthscale_bounds[0], self.lengthscale_bounds[1]) - [ - t_.clamp_(self.lengthscale_bounds[0], self.lengthscale_bounds[1]) - if t_ is not None and t_.is_leaf - else None - for t_ in theta_vector.values() - ] return theta_vector @@ -210,6 +207,3 @@ def forward_t( class ProductKernel(CombineKernel): def __init__(self, *kernels, **kwargs): super().__init__("product", *kernels, **kwargs) - - def dk_dphi(self, weights, gr: list = None, x=None, feature_lengthscale=None): - raise NotImplementedError diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py index b35b9d91..2f3d2bf6 100644 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py +++ b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py @@ -243,6 +243,3 @@ def forward_t( class ProductKernel(CombineKernel): def __init__(self, *kernels, **kwargs): super().__init__("product", *kernels, **kwargs) - - def dk_dphi(self, weights, gr: list = None, x=None, feature_lengthscale=None): - raise NotImplementedError diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index 6e0b2052..bd7a1661 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,92 +1,96 @@ +from __future__ import annotations from copy import deepcopy from math import sqrt -from typing import Tuple, Union +from dataclasses import dataclass +from typing import Iterable +from typing_extensions import override import numpy as np import torch +LENGTHSCALE_BOUNDS_DEFAULT = ( + np.exp(-6.754111155189306), + np.exp(0.0858637988771976), +) + +@dataclass class Stationary: """Here we follow the structure of GPy to build a sub class of stationary kernel. All the classes (i.e. 
the class of stationary kernel_operators) derived from this class use the scaled distance to compute the Gram matrix.""" - def __init__( + lengthscale: float | torch.Tensor = 1.0 + lengthscale_bounds: tuple[float, float] = LENGTHSCALE_BOUNDS_DEFAULT + outputscale: float = 1.0 + + gram_: torch.Tensor | None = None + train_: torch.Tensor | None = None + + def forward( self, - lengthscale: Union[float, Tuple[float, ...]] = 1.0, - lengthscale_bounds: Tuple[float, float] = ( - np.exp(-6.754111155189306), - np.exp(0.0858637988771976), - ), - outputscale=1.0, - **kwargs, - ): - super().__init__(**kwargs) - self.lengthscale = lengthscale - self.lengthscale_bounds = lengthscale_bounds - self.outputscale = outputscale - - self._gram = None - self._train = None - - def forward(self, x1, x2=None, l=None, **params): - if l is not None: - return _scaled_distance(l, x1, x2) - return _scaled_distance(self.lengthscale, x1, x2) + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + return _scaled_distance(lengthscale, x1, x2) def fit_transform( self, x1, - l=None, - rebuild_model=True, - save_gram_matrix=True, - ): - if not rebuild_model and self._gram is not None: - return self._gram + l: float | torch.Tensor | None = None, + rebuild_model: bool = True, + save_gram_matrix: bool = True, + ) -> torch.Tensor: + if not rebuild_model and self.gram_ is not None: + return self.gram_ K = self.forward(x1, l=l) if save_gram_matrix: - self._train = deepcopy(x1) + self.train_ = deepcopy(x1) assert isinstance(K, torch.Tensor), "it doesnt work with np arrays.." - self._gram = K.clone() + self.gram_ = K.clone() return K - def transform( - self, - x1, - l=None, - ): - if self._gram is None: + def transform(self, x1, l: float | torch.Tensor | None = None) -> torch.Tensor: + if self.gram_ is None or self.train_ is None: raise ValueError("The kernel has not been fitted. 
Run fit_transform first") - return self.forward(self._train, x1, l=l) - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) + return self.forward(self.train_, x1, l=l) - def forward_t(self, x2, x1=None, l=None): + def forward_t( + self, + x2: torch.Tensor, + x1: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: if x1 is None: - x1 = torch.tensor(self._train) - x2 = torch.tensor(x2).requires_grad_(True) + x1 = torch.tensor(self.train_) + x2 = torch.tensor(x2, requires_grad=True) K = self.forward(x1, x2, l) return K, x2 - def update_hyperparameters(self, lengthscale): + def update_hyperparameters(self, lengthscale: Iterable[torch.Tensor]) -> None: self.lengthscale = [ l_.clamp(self.lengthscale_bounds[0], self.lengthscale_bounds[1]).item() for l_ in lengthscale ] +@dataclass class RBFKernel(Stationary): - def forward(self, x1, x2=None, l=None, **kwargs): - if l is None: - dist_sq = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) - else: - dist_sq = _scaled_distance(l, x1, x2, sq_dist=True) - if isinstance(dist_sq, torch.Tensor): - return self.outputscale * torch.exp(-0.5 * dist_sq) - return self.outputscale * np.exp(-0.5 * dist_sq) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) + return self.outputscale * torch.exp(-0.5 * dist_sq) +@dataclass class LayeredRBFKernel(RBFKernel): """ Same as the conventional RBF kernel, but adapted in a way as a midway between @@ -94,178 +98,167 @@ class LayeredRBFKernel(RBFKernel): Weisfiler-Lehman iteration only (e.g. one weight for h=0, another for h=1 and etc.) 
""" - def forward(self, ard_dims, x1, x2=None, l=None, **kwargs): - l = l if l is not None else self.lengthscale - assert l.shape[0] == ard_dims.shape[0], ( + @override + def forward( + self, + ard_dims: torch.Tensor, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: torch.Tensor | None = None, + ) -> torch.Tensor: + _l = l if l is not None else self.lengthscale + assert isinstance(_l, torch.Tensor), "Lengthscale must be a torch tensor" + assert _l.shape[0] == ard_dims.shape[0], ( "LayeredRBF expects the lengthscale vector to have the same " "dimensionality as the " "number of WL iterations, but got lengthscale vector of shape" - + str(l.shape[0]) + + str(_l.shape[0]) + "and WL iteration of shape " + str(ard_dims.shape[0]) ) - if not isinstance(ard_dims, torch.Tensor): - ard_dims = torch.tensor(ard_dims) + M = torch.cat( - [torch.ones(int(ard_dims[i])) * l[i] for i in range(ard_dims.shape[0])] + [torch.ones(int(ard_dims[i])) * _l[i] for i in range(len(ard_dims))] ) - return super().forward(x1, x2, M, **kwargs) + return super().forward(x1, x2, M) +@dataclass class Matern32Kernel(Stationary): - def forward(self, x1, x2=None, l=None, **kwargs): - if l is None: - l = self.lengthscale - dist = _scaled_distance(l, x1, x2) - if isinstance(dist, torch.Tensor): - return ( - self.outputscale * (1 + sqrt(3.0) * dist) * torch.exp(-sqrt(3.0) * dist) - ) - return self.outputscale * (1 + sqrt(3.0) * dist) * np.exp(-sqrt(3.0) * dist) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist = _scaled_distance(lengthscale, x1, x2) + return self.outputscale * (1 + sqrt(3.0) * dist) * torch.exp(-sqrt(3.0) * dist) class Matern52Kernel(Stationary): - def forward(self, x1, x2=None, l=None, **kwargs): - if l is None: - l = self.lengthscale - dist = _scaled_distance(l, x1, x2) - sq_dist = dist**2 - if isinstance(dist, torch.Tensor): - return ( - self.outputscale - * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * sq_dist) - * torch.exp(-sqrt(5.0) * dist) - ) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist = _scaled_distance(lengthscale, x1, x2, sq_dist=True) return ( self.outputscale - * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * sq_dist) - * np.exp(-sqrt(5.0) * dist) + * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * dist) + * torch.exp(-sqrt(5.0) * dist) ) def update_hyperparameters(self, lengthscale): if lengthscale is None or "continuous" not in lengthscale.keys(): - return + raise ValueError("wtf") lengthscale = lengthscale["continuous"] super().update_hyperparameters(lengthscale=lengthscale) +@dataclass class HammingKernel(Stationary): - def forward(self, x1, x2=None, l=None, **kwargs): - if l is None: - dist = _hamming_distance( - self.lengthscale, - x1, - x2, - ) - else: - dist = _hamming_distance( - l, - x1, - x2, - ) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist = _hamming_distance(lengthscale, x1, x2) return self.outputscale * dist def update_hyperparameters(self, lengthscale): if lengthscale is None or "categorical" not in lengthscale.keys(): - return + raise ValueError("wtf") lengthscale = lengthscale["categorical"] 
super().update_hyperparameters(lengthscale=lengthscale) +@dataclass class RationalQuadraticKernel(Stationary): - def __init__(self, lengthscale, outputscale=1.0, power=2.0, **kwargs): - super().__init__(lengthscale, outputscale, **kwargs) - self.power = power + power: float = 2.0 - def forward(self, x1, x2=None, **kwargs): - dist_sq = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) + @override + def forward( + self, + x1: torch.Tensor, + x2: torch.Tensor | None = None, + l: float | torch.Tensor | None = None, + ) -> torch.Tensor: + lengthscale = l if l is not None else self.lengthscale + dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) return self.outputscale * (1 + dist_sq / 2.0) ** (-self.power) -def _unscaled_distance(X, X2=None, sq_dist=False): - """The unscaled distance between X and X2. if x2 is not supplied, then the squared Euclidean distance is - computed within X""" - if isinstance(X, torch.Tensor): - assert X.ndimension() == 2 - if X2 is not None: - assert isinstance(X2, torch.Tensor) - assert X2.ndimension() == 2 - if X2 is None: - Xsq = torch.sum(X**2, 1) - r2 = -2 * X @ X.t() + Xsq[:, None] + Xsq[None, :] - else: - X1sq = torch.sum(X**2, 1) - X2sq = torch.sum(X2**2, 1) - r2 = -2 * X @ X2.t() + X1sq[:, None] + X2sq[None, :] - r2 += 1e-8 - r2 = torch.maximum(r2, torch.tensor(0)) - if not sq_dist: - r2 = torch.sqrt(r2) - else: - assert X.ndim == 2 - if X2 is not None: - assert X2.ndim == 2 - if X2 is None: - Xsq = np.sum(X**2, 1) - r2 = -2 * X @ X.transpose() + Xsq[:, None] + Xsq[None, :] - else: - X1sq = np.sum(X**2, 1) - X2sq = np.sum(X2**2, 1) - r2 = -2 * X @ X2.transpose() + X1sq[:, None] + X2sq[None, :] - if not sq_dist: - r2 = np.sqrt(r2) - return r2 - - -def _scaled_distance(lengthscale, X, X2=None, sq_dist=False): +def _unscaled_square_distance( + X: torch.Tensor, + X2: torch.Tensor | None = None, +) -> torch.Tensor: + """The unscaled distance between X and X2.""" + assert X.ndim == 2 + X1sq = torch.sum(X**2, 1) + X2sq = X1sq if X is X2 else torch.sum(X**2, 1) + X2 = X if X2 is None else X2 + + r2 = -2 * X @ X2.T + X1sq[:, None] + X2sq[None, :] + r2 += 1e-15 + return torch.clamp_min(r2, 0.0) + + +def _scaled_distance( + lengthscale: float | torch.Tensor, + X: torch.Tensor, + X2: torch.Tensor | None = None, + *, + sq_dist: bool = False, +) -> torch.Tensor: """Compute the *scaled* distance between X and x2 (or, if X2 is not supplied, the distance between X and itself) by the lengthscale. if a scalar (float) or a dim=1 lengthscale vector is supplied, then it is assumed that we use one lengthscale for all dimensions. Otherwise, we have an ARD kernel and in which case the length of the lengthscale vector must be the same as the dimensionality of the problem.""" - X = torch.tensor(X, dtype=torch.float64) + if isinstance(lengthscale, float): + if sq_dist is False: + return torch.sqrt(_unscaled_square_distance(X, X2)) / (lengthscale**2) + + return _unscaled_square_distance(X, X2) / lengthscale + + # ARD kernel - one lengthscale per dimension + assert len(lengthscale) == X.shape[1], ( + f"Lengthscale must have the same dimensionality as the input data." 
+ f"Got {len(lengthscale)} and {X.shape[1]}" + ) + rescaled_X = X / lengthscale if X2 is None: - X2 = X - if isinstance(lengthscale, float) or len(lengthscale) == 1: - return ( - _unscaled_distance(X, X2) / lengthscale - if sq_dist is False - else _unscaled_distance(X, X2, sq_dist=True) / (lengthscale**2) - ) + dist = _unscaled_square_distance(rescaled_X) else: - # ARD kernel - one lengthscale per dimension - _check_lengthscale(lengthscale, X) - dist = _unscaled_distance(X / lengthscale, X2 / lengthscale) - return dist if not sq_dist else dist**2 + rescaled_X2 = X2 / lengthscale + dist = _unscaled_square_distance(rescaled_X, rescaled_X2) + + return dist if sq_dist else torch.sqrt(dist) -def _hamming_distance(lengthscale, X, X2=None): +def _hamming_distance( + lengthscale: float | torch.Tensor, + X: torch.Tensor, + X2: torch.Tensor | None = None, +) -> torch.Tensor: if X2 is None: X2 = X - def _distance(X, X2, lengthscale=1.0): - if isinstance(lengthscale, torch.Tensor): - lengthscale = lengthscale.detach().numpy() - indicator = np.expand_dims(X, axis=1) != X2 - K = (-1 / (2 * lengthscale**2) * indicator).sum(axis=2) - K = np.exp(K) - return torch.from_numpy(K) + indicator = X.unsqueeze(1) != X2 + C = -1 / (2 * lengthscale**2) + scaled_indicator = C * indicator + diffs = scaled_indicator.sum(dim=2) if isinstance(lengthscale, float) or len(lengthscale) == 1: - return _distance(X, X2) / lengthscale + return torch.exp(diffs) / lengthscale else: - _check_lengthscale(lengthscale, X) - return _distance(X, X2, lengthscale) - - -def _check_lengthscale(lengthscale, X): - x_shape = len(X[0]) if isinstance(X, list) else X.shape[1] - assert len(lengthscale) == x_shape, ( - "For a non-scaler theta, it needs to be of the same length as the dim" - "of the " - "input data, but got input dim of " - + str(x_shape) - + " and lengthscale dimension of " - + str(lengthscale.shape[0]) - ) + return torch.exp(diffs) diff --git a/neps/optimizers/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py similarity index 99% rename from neps/optimizers/models/deepGP.py rename to neps/optimizers/bayesian_optimization/models/deepGP.py index a94988ab..82355ec5 100644 --- a/neps/optimizers/models/deepGP.py +++ b/neps/optimizers/bayesian_optimization/models/deepGP.py @@ -2,7 +2,6 @@ from dataclasses import dataclass, field import logging -import os from copy import deepcopy from pathlib import Path diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 73ecf019..6a878748 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,276 +1,279 @@ +from __future__ import annotations + import logging from copy import deepcopy -from typing import Iterable, Union +from typing import Iterable, Literal, Sequence, Any import numpy as np +import contextlib import torch -from ..kernels.combine_kernels import ProductKernel, SumKernel +from neps.optimizers.bayesian_optimization.kernels.combine_kernels import ( + ProductKernel, + SumKernel, +) + +from neps.optimizers.bayesian_optimization.kernels.graph_kernel import GraphKernels +from neps.optimizers.bayesian_optimization.kernels.utils import extract_configs +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary +from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import WeisfilerLehman +from neps.search_spaces.search_space import SearchSpace -# GP model as a weighted average between the 
vanilla vectorial GP and the graph GP -from ..kernels.graph_kernel import GraphKernels -from ..kernels.utils import extract_configs -from ..kernels.vectorial_kernels import Stationary -from ..kernels.weisfilerlehman import WeisfilerLehman +logger = logging.getLogger(__name__) class ComprehensiveGP: def __init__( self, + space: SearchSpace, graph_kernels: Iterable, hp_kernels: Iterable, - likelihood: float = 1e-3, - weights=None, - vectorial_features: list = None, - combined_kernel: str = "sum", - logger=None, - surrogate_model_fit_args: dict = None, + initial_likelihood: float = 1e-3, + weights: Sequence[float] | torch.Tensor | None = None, + combined_kernel: Literal["sum", "product"] = "sum", + surrogate_model_fit_args: dict | None = None, + optimizer_kwargs: dict[str, Any] | None = None, + wl_subtree_candidates: Sequence[int] = (1, 2, 3, 4, 5), + wl_lengthscales: Sequence[float] = tuple(np.e**i for i in range(-2, 3)), + optimize_likelihood: bool = True, + optimizer: Literal["adam", "sgd"] = "adam", + optimizer_iters: int = 20, + max_likelihood: float = 0.01, + optimize_wl_layer_weights: bool = False, ): - self.likelihood = likelihood - self.surrogate_model_fit_args = surrogate_model_fit_args or {} - - self.domain_kernels: list = [] - if bool(graph_kernels): - self.domain_kernels += list(graph_kernels) - if bool(hp_kernels): - self.domain_kernels += list(hp_kernels) - - self.n_kernels: int = len(self.domain_kernels) - self.n_graph_kernels: int = len( - [i for i in self.domain_kernels if isinstance(i, GraphKernels)] - ) - self.n_vector_kernels: int = self.n_kernels - self.n_graph_kernels - - self.vectorial_features = vectorial_features - + graph_kernels = list(graph_kernels) + hp_kernels = list(hp_kernels) + n_graph_kernels = len(graph_kernels) + n_vector_kernels = len(hp_kernels) + n_kernels = n_graph_kernels + n_vector_kernels + domain_kernels = [*graph_kernels, *hp_kernels] + + fixed_weights = weights is not None if weights is not None: - self.fixed_weights = True if weights is not None: - assert len(weights) == len(self.n_kernels), ( + assert len(weights) == n_kernels, ( "the weights vector, if supplied, needs to have the same length as " "the number of kernel_operators!" ) - self.init_weights = ( - weights - if isinstance(weights, torch.Tensor) - else torch.tensor(weights).flatten() - ) + init_weights = torch.as_tensor(weights).flatten() else: - self.fixed_weights = False - # Initialise the domain kernel weights to uniform - self.init_weights = torch.tensor( - [1.0 / self.n_kernels] * self.n_kernels, - ) - self.weights = self.init_weights.clone() + uniform_weight = 1.0 / self.n_kernels + init_weights = torch.full((n_kernels,), uniform_weight, dtype=torch.float64) if combined_kernel == "product": - self.combined_kernel = ProductKernel( - *self.domain_kernels, weights=self.weights - ) + _combined_kernel = ProductKernel(*domain_kernels, weights=weights) elif combined_kernel == "sum": - self.combined_kernel = SumKernel(*self.domain_kernels, weights=self.weights) + _combined_kernel = SumKernel(*domain_kernels, weights=weights) else: raise NotImplementedError( f'Combining kernel {combined_kernel} is not yet implemented! Only "sum" ' f'or "product" are currently supported. 
' ) - self.logger = logger or logging.getLogger("neps") + # TODO: Clone only needed while it can act like configurations + self.space = space.clone() + self.init_weights = init_weights + self.fixed_weights = fixed_weights + self.combined_kernel = _combined_kernel + self.initial_likelihood = initial_likelihood + self.surrogate_model_fit_args = surrogate_model_fit_args or {} + self.domain_kernels: list = [*graph_kernels, *hp_kernels] + self.n_kernels: int = len(self.domain_kernels) + self.n_graph_kernels: int = len(graph_kernels) + self.n_vector_kernels: int = len(hp_kernels) + self.optimizer_kwargs = optimizer_kwargs or {"lr": 0.1} + self.optimize_likelihood = optimize_likelihood + self.optimize_wl_layer_weights = optimize_wl_layer_weights + self.optimizer = optimizer + self.optimizer_iters = optimizer_iters + self.max_likelihood = max_likelihood + self.wl_subtree_candidates = wl_subtree_candidates + self.wl_lengthscales = wl_lengthscales + # Cache the Gram matrix inverse and its log-determinant - self.K, self.K_i, self.logDetK = [None] * 3 - self.theta_vector = None - self.layer_weights = None - self.nlml = None - - self.x_configs: list = None - self.y: torch.Tensor = None - self.y_: torch.Tensor = None - self.y_mean: torch.Tensor = None - self.y_std: torch.Tensor = None - self.n: int = None - - def _optimize_graph_kernels(self, h_: int, lengthscale_): - graphs, _ = extract_configs(self.x_configs) - for i, k in enumerate(self.combined_kernel.kernels): - if not isinstance(k, GraphKernels): - continue - elif isinstance(k, WeisfilerLehman): - _grid_search_wl_kernel( - k, - h_, + self.K_ = None + self.K_i_ = None + self.logDetK_ = None + self.theta_vector_ = None + self.layer_weights_ = None + self.nlml_ = None + self.likelihood_: float | None = None + self.weights_: torch.Tensor | None = None + self.x_configs_: list[SearchSpace] | None = None + self.y_: torch.Tensor | None = None + self.y_normalized_: torch.Tensor | None = None + self.y_mean_: float | None = None + self.y_std_: float | None = None + self.n_: int | None = None + + def fit(self, train_x: list[SearchSpace], train_y: list[float]) -> None: + """Called by self.fit""" + self.x_configs = train_x + self.n_ = len(train_x) + self.y_ = torch.as_tensor(train_y, dtype=torch.float64) + + # TODO: Dunno if I like this silent hack, setting std to 1 if no std + self.y_std_ = s if (s := torch.std(self.y_).item()) != 0 else 1 + self.y_mean_ = torch.mean(self.y_).item() + self.y_normalized_ = (self.y_ - self.y_mean_) / self.y_std_ + + # The Gram matrix of the training data + self.K_i_, self.logDetK_ = None, None + + if len(self.wl_subtree_candidates) > 0: + graphs, _ = extract_configs(self.x_configs) + graph_kernels = [ + k for k in self.domain_kernels if isinstance(k, GraphKernels) + ] + for i, kernel in enumerate(graph_kernels): + if not isinstance(kernel, WeisfilerLehman): + logger.warning(f"No kernel opt. for {type(kernel).__name__}.") + continue + + _xs = ( [x[i] for x in graphs] if isinstance(graphs[0], list) - else [c for c in graphs], - self.y, - self.likelihood, - lengthscales=lengthscale_, + else [x for x in graphs] ) - else: - self.logger.warning( - "(Graph) kernel optimisation for " - + type(k).__name__ - + " not implemented yet." 
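+                # Pre-fit the WL kernel's discrete hyperparameters: try every
+                # candidate subtree depth h (and SE lengthscale, if the kernel
+                # wraps one) and keep the pair with the lowest negative log
+                # marginal likelihood, before the gradient-based fit below.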
+ _grid_search_wl_kernel( + kernel=kernel, + subtree_candidates=self.wl_subtree_candidates, + train_x=_xs, + train_y=self.y_, + likelihood=self.initial_likelihood, + lengthscales=self.wl_lengthscales, ) - def fit(self, train_x, train_y): - self._fit(train_x, train_y, **self.surrogate_model_fit_args) - - def _fit( - self, - train_x, - train_y, - iters: int = 20, - optimizer: str = "adam", - wl_subtree_candidates: tuple = tuple(range(5)), - wl_lengthscales: tuple = tuple(np.e**i for i in range(-2, 3)), - optimize_lik: bool = True, - max_lik: float = 0.01, - optimize_wl_layer_weights: bool = False, - optimizer_kwargs: dict = None, - ): - """Called by self.fit""" - self._reset_XY(train_x, train_y) - - # Get the node weights, if needed - - if optimizer_kwargs is None: - optimizer_kwargs = {"lr": 0.1} - if len(wl_subtree_candidates) > 0: - self._optimize_graph_kernels( - wl_subtree_candidates, - wl_lengthscales, - ) - weights = self.init_weights.clone() - if (not self.fixed_weights) and len(self.domain_kernels) > 1: + if not self.fixed_weights and self.n_kernels > 1: weights.requires_grad_(True) - theta_vector = get_theta_vector(vectorial_features=self.vectorial_features) + n_cat = len(self.space.categoricals) + n_num = len(self.space.numerical) + theta_categorical = torch.ones( + n_cat, requires_grad=n_cat > 1, dtype=torch.float64 + ) + theta_numerical = torch.ones(n_num, requires_grad=n_num > 1, dtype=torch.float64) - # Whether to include the likelihood (jitter or noise variance) as a hyperparameter + theta_vectors = { + "categorical": theta_categorical, + "continuous": theta_numerical, # NOTE: This actually includes integers too -_- + } likelihood = torch.tensor( - self.likelihood, + self.initial_likelihood, requires_grad=self.optimize_likelihood ) - if optimize_lik: - likelihood.requires_grad_(True) layer_weights = None - if optimize_wl_layer_weights: - for k in self.domain_kernels: - if isinstance(k, WeisfilerLehman): - layer_weights = torch.ones(k.h + 1).requires_grad_(True) - if layer_weights.shape[0] <= 1: - layer_weights = None - else: - break + if self.optimize_wl_layer_weights: + for kernel in self.domain_kernels: + if isinstance(kernel, WeisfilerLehman) and kernel.h != 0: + layer_weights = torch.ones(kernel.h + 1, requires_grad=True) + break # Linking the optimizer variables to the sum kernel - optim_vars = [] - for a in [weights, likelihood, layer_weights]: - if a is not None and a.is_leaf and a.requires_grad: - optim_vars.append(a) - - if theta_vector is not None: - for a in theta_vector.values(): - if a is not None and a.requires_grad: - optim_vars.append(a) + optim_vars = [ + a + for a in ( + weights, + likelihood, + layer_weights, + theta_categorical, + theta_numerical, + ) + if a is not None and a.is_leaf and a.requires_grad + ] + nlml = None if len(optim_vars) == 0: # Skip optimisation K = self.combined_kernel.fit_transform( weights, self.x_configs, - feature_lengthscale=theta_vector, + feature_lengthscale=theta_vectors, layer_weights=layer_weights, rebuild_model=True, ) - K_i, logDetK = compute_pd_inverse(K, likelihood) + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) else: # Select the optimizer - assert optimizer.lower() in ["adam", "sgd"] - if optimizer.lower() == "adam": - optim = torch.optim.Adam(optim_vars, **optimizer_kwargs) + if self.optimizer == "adam": + optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore + elif self.optimizer == "sgd": + optim = torch.optim.SGD(optim_vars, **self.optimizer_kwargs) # type: ignore else: - optim = 
torch.optim.SGD(optim_vars, **optimizer_kwargs) + raise ValueError(f"Invalid optimizer {self.optimizer}") - K = None - for i in range(iters): + K: torch.Tensor | None = None + for i in range(self.optimizer_iters): optim.zero_grad() K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - feature_lengthscale=theta_vector, + weights=weights, + configs=train_x, # TODO + feature_lengthscale=theta_vectors, layer_weights=layer_weights, rebuild_model=True, save_gram_matrix=True, ) - K_i, logDetK = compute_pd_inverse(K, likelihood) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, self.y) + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + nlml = -compute_log_marginal_likelihood( + K_i, logDetK, y=self.y_normalized_ + ) nlml.backward() if i % 10 == 0: - self.logger.debug( - f"Iteration: {i}/{iters} " + logger.debug( + f"Iteration: {i}/{self.optimizer_iters} " f"Negative log-marginal likelihood:" - f"{nlml.item()} {theta_vector} {weights} {likelihood}" + f"{nlml.item()} {theta_vectors} {weights} {likelihood}" ) + optim.step() # TODO + with torch.no_grad(): + if weights.is_leaf: + weights.clamp_(0.0, 1.0) - weights.clamp_( - 0.0, 1.0 - ) if weights is not None and weights.is_leaf else None - theta_vector = self.combined_kernel.clamp_theta_vector(theta_vector) - likelihood.clamp_( - 1e-5, max_lik - ) if likelihood is not None and likelihood.is_leaf else None - layer_weights.clamp_( - 0.0, 1.0 - ) if layer_weights is not None and layer_weights.is_leaf else None + theta_vectors = self.combined_kernel.clamp_theta_vector(theta_vectors) + if likelihood.is_leaf: + likelihood.clamp_(1e-5, self.max_likelihood) + + if layer_weights is not None and layer_weights.is_leaf: + layer_weights.clamp_(0.0, 1.0) optim.zero_grad(set_to_none=True) - K_i, logDetK = compute_pd_inverse(K, likelihood) + assert K is not None + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) # Apply the optimal hyperparameters - self.weights = weights.clone() / torch.sum(weights) - self.K_i = K_i.clone() - self.K = K.clone() - self.logDetK = logDetK.clone() - self.likelihood = likelihood.item() - self.theta_vector = theta_vector - self.layer_weights = layer_weights - self.nlml = nlml.detach().cpu() if nlml is not None else None - - for k in self.combined_kernel.kernels: - if isinstance(k, Stationary): - k.update_hyperparameters(lengthscale=theta_vector) - - self.combined_kernel.weights = weights.clone() - - self.logger.debug("Optimisation summary: ") - self.logger.debug( - f"Optimal NLML: {nlml}", - ) - self.logger.debug(f"Lengthscales: {theta_vector}") - try: - self.logger.debug( - f"Optimal h: {self.domain_kernels[0]._h}", - ) - except AttributeError: - pass - self.logger.debug(f"Weights: {self.weights}") - self.logger.debug(f"Lik: {self.likelihood}") - self.logger.debug(f"Optimal layer weights {layer_weights}") - - def predict(self, x_configs, preserve_comp_graph: bool = False): + self.weights_ = weights.clone() / torch.sum(weights) + self.K_i_ = K_i.clone() + self.K_ = K.clone() + self.logDetK_ = logDetK.clone() + self.likelihood_ = likelihood.item() + self.theta_vector_ = theta_vectors + self.layer_weights_ = layer_weights + self.nlml_ = nlml.detach().cpu() if nlml is not None else None + + for kernel in self.combined_kernel.kernels: + if isinstance(kernel, Stationary): + kernel.update_hyperparameters(lengthscale=self.theta_vector_) + + logger.debug("Optimisation summary: ") + logger.debug(f"Optimal NLML: {nlml}") + logger.debug(f"Lengthscales: {theta_vectors}") + with 
contextlib.suppress(AttributeError): + logger.debug(f"Optimal h: {self.domain_kernels[0]._h}") + logger.debug(f"Weights: {self.weights_}") + logger.debug(f"Lik: {self.likelihood_}") + logger.debug(f"Optimal layer weights {layer_weights}") + + def predict(self, x_configs: list[SearchSpace]) -> tuple[torch.Tensor, torch.Tensor]: """Kriging predictions""" - if not isinstance(x_configs, list): - # Convert a single input X_s to a singleton list x_configs = [x_configs] - if self.K_i is None or self.logDetK is None: + if self.K_i_ is None or self.logDetK_ is None or self.weights_ is None: raise ValueError( "Inverse of Gram matrix is not instantiated. Please call the optimize " "function to fit on the training data first!" @@ -278,246 +281,39 @@ def predict(self, x_configs, preserve_comp_graph: bool = False): # Concatenate the full list X_configs_all = self.x_configs + x_configs - - # Make a copy of the sum_kernels for this step, to avoid breaking the autodiff - # if grad guided mutation is used - if preserve_comp_graph: - combined_kernel_copy = deepcopy(self.combined_kernel) - else: - combined_kernel_copy = self.combined_kernel - - K_full = combined_kernel_copy.fit_transform( - self.weights, - X_configs_all, - layer_weights=self.layer_weights, - feature_lengthscale=self.theta_vector, + n_train = len(self.x_configs) + n_test = len(x_configs) + + K_full = self.combined_kernel.fit_transform( + weights=self.weights_, + configs=X_configs_all, + layer_weights=self.layer_weights_, + feature_lengthscale=self.theta_vector_, rebuild_model=True, save_gram_matrix=False, gp_fit=False, ) - K_s = K_full[: self.n :, self.n :] - - K_ss = K_full[self.n :, self.n :] + self.likelihood * torch.eye( - len(x_configs), - ) - - mu_s = K_s.t() @ self.K_i @ self.y - cov_s = K_ss - K_s.t() @ self.K_i @ K_s - cov_s = torch.clamp(cov_s, self.likelihood, np.inf) - mu_s = unnormalize_y(mu_s, self.y_mean, self.y_std) - std_s = torch.sqrt(cov_s) - std_s = unnormalize_y(std_s, None, self.y_std, True) - cov_s = std_s**2 - if preserve_comp_graph: - del combined_kernel_copy - return mu_s, cov_s + K_s = K_full[:n_train:, n_train:] + K_ss = K_full[n_train:, n_train:] + self.likelihood_ * torch.eye(n_test) - @property - def x(self): - return self.x_configs + mu_s = K_s.t() @ self.K_i_ @ self.y_ + mu_s = mu_s * self.y_std_ + self.y_mean_ - def _reset_XY(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): - self.x_configs = train_x - self.n = len(self.x_configs) - train_y_tensor = ( - train_y - if isinstance(train_y, torch.Tensor) - else torch.tensor(train_y, dtype=torch.get_default_dtype()) - ) - self.y_ = train_y_tensor - self.y, self.y_mean, self.y_std = normalize_y(train_y_tensor) - # The Gram matrix of the training data - self.K_i, self.logDetK = None, None + cov_s = K_ss - K_s.t() @ self.K_i_ @ K_s + cov_s = torch.clamp(cov_s, self.likelihood_, np.inf) + cov_s = (torch.sqrt(cov_s) * self.y_std_) ** 2 - def dmu_dphi( - self, - X_s=None, - # compute_grad_var=False, - average_across_features=True, - average_across_occurrences=False, - ): - r""" - Compute the derivative of the GP posterior mean at the specified input location with respect to the - *vector embedding* of the graph (e.g., if using WL-subtree, this function computes the gradient wrt - each subtree pattern) - - The derivative is given by - $ - \frac{\partial \mu^*}{\partial \phi ^*} = \frac{\partial K(\phi, \phi^*)}{\partial \phi ^ *}K(\phi, \phi)^{-1} - \mathbf{y} - $ - - which derives directly from the GP posterior mean formula, and since the term $K(\phi, 
\phi)^{-1} and \mathbf{y} - are both independent of the testing points (X_s, or \phi^*}, the posterior gradient is simply the matrix - produce of the kernel gradient with the inverse Gram and the training label vector. - - Parameters - ---------- - X_s: The locations on which the GP posterior mean derivatives should be evaluated. If left blank, the - derivatives will be evaluated at the training points. - - compute_grad_var: bool. If true, also compute the gradient variance. - - The derivative of GP is also a GP, and thus the predictive distribution of the posterior gradient is Gaussian. - The posterior mean is given above, and the posterior variance is: - $ - \mathbb{V}[\frac{\partial f^*}{\partial \phi^*}]= \frac{\partial^2k(\phi^*, \phi^*)}{\partial \phi^*^2} - - \frac{\partial k(\phi^*, \Phi)}{\partial \phi^*}K(X, X)^{-1}\frac{\partial k{(\Phi, \phi^*)}}{\partial \phi^*} - $ - - Returns - ------- - list of K torch.Tensor of the shape N x2 D, where N is the length of the X_s list (each element of which is a - networkx graph), K is the number of kernel_operators in the combined kernel and D is the dimensionality of the - feature vector (this is determined by the specific graph kernel. - - OR - - list of K torch.Tensor of shape D, if averaged_over_samples flag is enabled. - """ - if self.K_i is None or self.logDetK is None: - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize " - "function to fit on the training data first!" - ) - if self.n_vector_kernels: - if X_s is not None: - V_s = self._get_vectorial_features(X_s, self.vectorial_feactures) - V_s, _, _ = standardize_x(V_s, self.x_features_min, self.x_features_max) - else: - V_s = self.x_features - X_s = self.x[:] - else: - V_s = None - X_s = X_s if X_s is not None else self.x[:] - - alpha = (self.K_i @ self.y).double().reshape(1, -1) - dmu_dphi = [] - # dmu_dphi_var = [] if compute_grad_var else None - - Ks_handles = [] - feature_matrix = [] - for j, x_s in enumerate(X_s): - jacob_vecs = [] - if V_s is None: - handles = self.combined_kernel.forward_t( - self.weights, - [x_s], - ) - else: - handles = self.combined_kernel.forward_t(self.weights, [x_s], V_s[j]) - Ks_handles.append(handles) - # Each handle is a 2-tuple. first element is the Gram matrix, second element is the leaf variable - feature_vectors = [] - for handle in handles: - k_s, y, _ = handle - # k_s is output, leaf is input, alpha is the K_i @ y term which is constant. - # When compute_grad_var is not required, computational graphs do not need to be saved. - jacob_vecs.append( - torch.autograd.grad( - outputs=k_s, inputs=y, grad_outputs=alpha, retain_graph=False - )[0] - ) - feature_vectors.append(y) - feature_matrix.append(feature_vectors) - jacob_vecs = torch.cat(jacob_vecs) - dmu_dphi.append(jacob_vecs) - - feature_matrix = torch.cat([f[0] for f in feature_matrix]) - if average_across_features: - dmu_dphi = torch.cat(dmu_dphi) - # compute the weighted average of the gradient across N_t. - # feature matrix is of shape N_t x K x D - avg_mu, avg_var, incidences = get_grad( - dmu_dphi, feature_matrix, average_across_occurrences - ) - return avg_mu, avg_var, incidences - return ( - dmu_dphi, - None, - feature_matrix.sum(dim=0) if average_across_occurrences else feature_matrix, - ) - - -def get_grad(grad_matrix, feature_matrix, average_occurrences=False): - r""" - Average across the samples via a Monte Carlo sampling scheme. Also estimates the - empirical variance. 
:param average_occurrences: if True, do a weighted summation - based on the frequency distribution of the occurrence to compute a gradient *per - each feature*. Otherwise, each different occurrence (\phi_i = k) will get a - different gradient estimate. - """ - assert grad_matrix.shape == feature_matrix.shape - # Prune out the all-zero columns that pop up sometimes - valid_cols = [] - for col_idx in range(feature_matrix.size(1)): - if not torch.all(feature_matrix[:, col_idx] == 0): - valid_cols.append(col_idx) - feature_matrix = feature_matrix[:, valid_cols] - grad_matrix = grad_matrix[:, valid_cols] - - _, D = feature_matrix.shape - if average_occurrences: - avg_grad = torch.zeros(D) - avg_grad_var = torch.zeros(D) - for d in range(D): - current_feature = feature_matrix[:, d].clone().detach() - instances, indices, counts = torch.unique( - current_feature, return_inverse=True, return_counts=True - ) - weight_vector = torch.tensor([counts[i] for i in indices]).type(torch.float) - weight_vector /= weight_vector.sum() - mean = torch.sum(weight_vector * grad_matrix[:, d]) - # Compute the empirical variance of gradients - variance = torch.sum(weight_vector * grad_matrix[:, d] ** 2) - mean**2 - avg_grad[d] = mean - avg_grad_var[d] = variance - return avg_grad, avg_grad_var, feature_matrix.sum(dim=0) - else: - # The maximum number possible occurrences -- 7 is an example, if problem occurs, maybe we can increase this - # number. But for now, for both NAS-Bench datasets, this should be more than enough! - max_occur = 7 - avg_grad = torch.zeros(D, max_occur) - avg_grad_var = torch.zeros(D, max_occur) - incidences = torch.zeros(D, max_occur) - for d in range(D): - current_feature = feature_matrix[:, d].clone().detach() - instances, indices, counts = torch.unique( - current_feature, return_inverse=True, return_counts=True - ) - for i, val in enumerate(instances): - # Find index of all feature counts that are equal to the current val - feature_at_val = grad_matrix[current_feature == val] - avg_grad[d, int(val)] = torch.mean(feature_at_val) - avg_grad_var[d, int(val)] = torch.var(feature_at_val) - incidences[d, int(val)] = counts[i] - return avg_grad, avg_grad_var, incidences - - -# Optimize Graph kernel -def getBack(var_grad_fn, logger): - logger.debug(var_grad_fn) - for n in var_grad_fn.next_functions: - if n[0]: - try: - tensor = getattr(n[0], "variable") - logger.debug(n[0]) - logger.debug(f"Tensor with grad found: {tensor}") - logger.debug(f" - gradient: {tensor.grad}") - except AttributeError: - getBack(n[0], logger) + return mu_s, cov_s def _grid_search_wl_kernel( - k: WeisfilerLehman, + kernel: WeisfilerLehman, subtree_candidates, train_x: list, train_y: torch.Tensor, - lik: float, - subtree_prior=None, + likelihood: float, lengthscales=None, - lengthscales_prior=None, ): """Optimize the *discrete hyperparameters* of Weisfeiler Lehman kernel. 
k: a Weisfeiler-Lehman kernel instance @@ -533,136 +329,73 @@ def _grid_search_wl_kernel( best_subtree_depth = None best_lengthscale = None best_K = None - if lengthscales is not None and k.se is not None: + if lengthscales is not None and kernel.se is not None: candidates = [(h_, l_) for h_ in subtree_candidates for l_ in lengthscales] else: candidates = [(h_, None) for h_ in subtree_candidates] for i in candidates: - if k.se is not None: - k.change_se_params({"lengthscale": i[1]}) - k.change_kernel_params({"h": i[0]}) - K = k.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) - # self.logger.debug(K) - K_i, logDetK = compute_pd_inverse(K, lik) - # self.logger.debug(train_y) + if kernel.se is not None: + kernel.change_se_params({"lengthscale": i[1]}) + + kernel.change_kernel_params({"h": i[0]}) + K = kernel.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - # self.logger.debug(f"{i} {nlml}") if nlml < best_nlml: best_nlml = nlml best_subtree_depth, best_lengthscale = i best_K = torch.clone(K) - # self.logger.debug(f"h: {best_subtree_depth} theta: {best_lengthscale}") - # self.logger.debug(best_subtree_depth) - k.change_kernel_params({"h": best_subtree_depth}) - if k.se is not None: - k.change_se_params({"lengthscale": best_lengthscale}) - k._gram = best_K - - -def get_theta_vector(vectorial_features): - if vectorial_features is None: - return None - theta_vector = {} - for key, dim in vectorial_features.items(): - t = torch.ones(dim) - if t.shape[0] > 1: - t.requires_grad_(True) - theta_vector[key] = t - return theta_vector - - -def normalize_y(y: torch.Tensor): - y_mean = torch.mean(y) if isinstance(y, torch.Tensor) else np.mean(y) - y_std = torch.std(y) if isinstance(y, torch.Tensor) else np.std(y) - if y_std == 0: - y_std = 1 - y = (y - y_mean) / y_std - return y, y_mean, y_std - - -def unnormalize_y(y, y_mean, y_std, scale_std=False): - """Similar to the undoing of the pre-processing step above, but on the output predictions""" - if not scale_std: - return y * y_std + y_mean - else: - return y * y_std - -def standardize_x( - x: torch.Tensor, x_min: torch.Tensor = None, x_max: torch.Tensor = None -): - """Standardize the vectorial input into a d-dimensional hypercube [0, 1]^d, where d is the number of features. - if x_min ond x_max are supplied, x2 will be standardised using these instead. This is used when standardising the - validation/test inputs. - """ - if (x_min is not None and x_max is None) or (x_min is None and x_max is not None): - raise ValueError( - "Either *both* or *neither* of x_min, x_max need to be supplied!" - ) - if x_min is None: - x_min = torch.min(x, 0)[0] - x_max = torch.max(x, 0)[0] - x = (x - x_min) / (x_max - x_min) - return x, x_min, x_max + kernel.change_kernel_params({"h": best_subtree_depth}) + if kernel.se is not None: + kernel.change_se_params({"lengthscale": best_lengthscale}) + kernel._gram = best_K def compute_log_marginal_likelihood( K_i: torch.Tensor, logDetK: torch.Tensor, y: torch.Tensor, + *, normalize: bool = True, - log_prior_dist=None, -): +) -> torch.Tensor: """Compute the zero mean Gaussian process log marginal likelihood given the inverse of Gram matrix K(x2,x2), its log determinant, and the training label vector y. Option: normalize: normalize the log marginal likelihood by the length of the label vector, as per the gpytorch routine. - - prior: A pytorch distribution object. 
If specified, the hyperparameter prior will be taken into consideration and - we use Type-II MAP instead of Type-II MLE (compute log_posterior instead of log_evidence) """ lml = ( - -0.5 * y.t() @ K_i @ y + -0.5 * (y.t() @ K_i @ y) + 0.5 * logDetK - - y.shape[0] - / 2.0 - * torch.log( - 2 - * torch.tensor( - np.pi, - ) - ) + - y.shape[0] / 2.0 * torch.log(2 * torch.tensor(np.pi)) ) - if log_prior_dist is not None: - lml -= log_prior_dist return lml / y.shape[0] if normalize else lml -def compute_pd_inverse(K: torch.tensor, jitter: float = 1e-6): +def compute_pd_inverse( + K: torch.Tensor, + *, + jitter: float | torch.Tensor = 1e-6, + attempts: int = 3, +) -> tuple[torch.Tensor, torch.Tensor]: """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" n = K.shape[0] assert ( isinstance(jitter, float) or jitter.ndim == 0 ), "only homoscedastic noise variance is allowed here!" - is_successful = False - fail_count = 0 - max_fail = 3 - while fail_count < max_fail and not is_successful: + for i in range(attempts): try: - jitter_diag = jitter * torch.eye(n, device=K.device) * 10**fail_count - K_ = K + jitter_diag - try: - Kc = torch.linalg.cholesky(K_) - except AttributeError: # For torch < 1.8.0 - Kc = torch.cholesky(K_) - is_successful = True + jitter_diag = jitter * torch.eye(n, device=K.device) * 10**i + Kc = torch.linalg.cholesky(K + jitter_diag) + break except RuntimeError: - fail_count += 1 - if not is_successful: + pass + else: raise RuntimeError(f"Gram matrix not positive definite despite of jitter:\n{K}") + logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) K_i = torch.cholesky_inverse(Kc) - return K_i.to(torch.get_default_dtype()), logDetK.to(torch.get_default_dtype()) + return K_i.to(dtype=torch.float64), logDetK.to(dtype=torch.float64) diff --git a/neps/optimizers/models/__init__.py b/neps/optimizers/models/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/neps/search_spaces/search_space.py b/neps/search_spaces/search_space.py index 3f0d6703..40ecd0cf 100644 --- a/neps/search_spaces/search_space.py +++ b/neps/search_spaces/search_space.py @@ -211,7 +211,7 @@ def __init__(self, **hyperparameters: Parameter): if not isinstance(hp, NumericalParameter): raise ValueError( - "neps only suport float and integer fidelity parameters" + f"Only float and integer fidelities supported, got {hp}" ) _fidelity_param = hp @@ -232,6 +232,25 @@ def __init__(self, **hyperparameters: Parameter): self.raw_tabular_space: SearchSpace | None = None self.has_tabular: bool = False + self.categoricals: Mapping[str, CategoricalParameter] = { + k: hp for k, hp in _hyperparameters if isinstance(hp, CategoricalParameter) + } + self.numerical: Mapping[str, NumericalParameter] = { + k: hp + for k, hp in _hyperparameters + if isinstance(hp, NumericalParameter) and not hp.is_fidelity + } + self.graphs: Mapping[str, GraphParameter] = { + k: hp for k, hp in _hyperparameters if isinstance(hp, GraphParameter) + } + self.constants: Mapping[str, Any] = { + k: hp.value for k, hp in _hyperparameters if isinstance(hp, ConstantParameter) + } + # NOTE: For future of multiple fidelities + self.fidelities: Mapping[str, NumericalParameter] = {} + if _fidelity_param is not None and _fidelity_name is None: + self.fidelities = {_fidelity_name: _fidelity_param} + def set_custom_grid_space( self, grid_table: pd.Series | pd.DataFrame, From dc3ae030b3d38f08f13f88d734dce70303e4cdf9 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 14 Aug 2024 18:44:54 +0200 Subject: [PATCH 05/63] 
refactor: Lengthscale is always tensor --- .../kernels/vectorial_kernels.py | 55 +++++++------------ 1 file changed, 20 insertions(+), 35 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index bd7a1661..9d0d4df2 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,8 +1,7 @@ from __future__ import annotations from copy import deepcopy from math import sqrt -from dataclasses import dataclass -from typing import Iterable +from dataclasses import dataclass, field from typing_extensions import override import numpy as np @@ -20,7 +19,8 @@ class Stationary: All the classes (i.e. the class of stationary kernel_operators) derived from this class use the scaled distance to compute the Gram matrix.""" - lengthscale: float | torch.Tensor = 1.0 + # A single value applies to all dimensions, a vector applies to each dimension + lengthscale: torch.Tensor = field(default_factory=lambda: torch.tensor(1.0)) lengthscale_bounds: tuple[float, float] = LENGTHSCALE_BOUNDS_DEFAULT outputscale: float = 1.0 @@ -31,7 +31,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale return _scaled_distance(lengthscale, x1, x2) @@ -39,7 +39,7 @@ def forward( def fit_transform( self, x1, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, rebuild_model: bool = True, save_gram_matrix: bool = True, ) -> torch.Tensor: @@ -52,7 +52,7 @@ def fit_transform( self.gram_ = K.clone() return K - def transform(self, x1, l: float | torch.Tensor | None = None) -> torch.Tensor: + def transform(self, x1, l: torch.Tensor | None = None) -> torch.Tensor: if self.gram_ is None or self.train_ is None: raise ValueError("The kernel has not been fitted. 
Run fit_transform first") return self.forward(self.train_, x1, l=l) @@ -61,7 +61,7 @@ def forward_t( self, x2: torch.Tensor, x1: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: if x1 is None: x1 = torch.tensor(self.train_) @@ -69,11 +69,8 @@ def forward_t( K = self.forward(x1, x2, l) return K, x2 - def update_hyperparameters(self, lengthscale: Iterable[torch.Tensor]) -> None: - self.lengthscale = [ - l_.clamp(self.lengthscale_bounds[0], self.lengthscale_bounds[1]).item() - for l_ in lengthscale - ] + def update_hyperparameters(self, lengthscale: torch.Tensor) -> None: + self.lengthscale = torch.clamp(lengthscale, *self.lengthscale_bounds) @dataclass @@ -83,7 +80,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) @@ -130,7 +127,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist = _scaled_distance(lengthscale, x1, x2) @@ -143,7 +140,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist = _scaled_distance(lengthscale, x1, x2, sq_dist=True) @@ -153,12 +150,6 @@ def forward( * torch.exp(-sqrt(5.0) * dist) ) - def update_hyperparameters(self, lengthscale): - if lengthscale is None or "continuous" not in lengthscale.keys(): - raise ValueError("wtf") - lengthscale = lengthscale["continuous"] - super().update_hyperparameters(lengthscale=lengthscale) - @dataclass class HammingKernel(Stationary): @@ -167,18 +158,12 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist = _hamming_distance(lengthscale, x1, x2) return self.outputscale * dist - def update_hyperparameters(self, lengthscale): - if lengthscale is None or "categorical" not in lengthscale.keys(): - raise ValueError("wtf") - lengthscale = lengthscale["categorical"] - super().update_hyperparameters(lengthscale=lengthscale) - @dataclass class RationalQuadraticKernel(Stationary): @@ -189,7 +174,7 @@ def forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: float | torch.Tensor | None = None, + l: torch.Tensor | None = None, ) -> torch.Tensor: lengthscale = l if l is not None else self.lengthscale dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) @@ -212,7 +197,7 @@ def _unscaled_square_distance( def _scaled_distance( - lengthscale: float | torch.Tensor, + lengthscale: torch.Tensor, X: torch.Tensor, X2: torch.Tensor | None = None, *, @@ -224,7 +209,7 @@ def _scaled_distance( lengthscale for all dimensions. 
Otherwise, we have an ARD kernel and in which case the length of the lengthscale vector must be the same as the dimensionality of the problem.""" - if isinstance(lengthscale, float): + if len(lengthscale) == 1: if sq_dist is False: return torch.sqrt(_unscaled_square_distance(X, X2)) / (lengthscale**2) @@ -246,7 +231,7 @@ def _scaled_distance( def _hamming_distance( - lengthscale: float | torch.Tensor, + lengthscale: torch.Tensor, X: torch.Tensor, X2: torch.Tensor | None = None, ) -> torch.Tensor: @@ -258,7 +243,7 @@ def _hamming_distance( scaled_indicator = C * indicator diffs = scaled_indicator.sum(dim=2) - if isinstance(lengthscale, float) or len(lengthscale) == 1: + if len(lengthscale) == 1: return torch.exp(diffs) / lengthscale - else: - return torch.exp(diffs) + + return torch.exp(diffs) From 3b8e549ffde1fcd5490e915cf3843221c490cf8c Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 15 Aug 2024 18:55:24 +0200 Subject: [PATCH 06/63] refactor: Cleanup and reduce duplicate preprocessing GP --- .../bayesian_optimization/cost_cooling.py | 4 +- .../bayesian_optimization/kernels/__init__.py | 4 + .../kernels/combine_kernels.py | 209 ------- .../kernels/combine_kernels_hierarchy.py | 44 -- .../bayesian_optimization/kernels/encoding.py | 277 --------- .../kernels/get_kernels.py | 92 +-- .../kernels/grakel_replace/__init__.py | 8 + .../grakel_replace/vertex_histogram.py | 15 +- .../grakel_replace/weisfeiler_lehman.py | 19 +- .../kernels/graph_kernel.py | 35 -- .../bayesian_optimization/kernels/kernel.py | 110 ++++ .../bayesian_optimization/kernels/utils.py | 14 +- .../kernels/vectorial_kernels.py | 167 ++--- .../kernels/weisfilerlehman.py | 360 ++--------- .../bayesian_optimization/models/deepGP.py | 4 +- .../bayesian_optimization/models/gp.py | 578 +++++++----------- .../models/gp_hierarchy.py | 162 +---- .../bayesian_optimization/optimizer.py | 4 +- neps/optimizers/multi_fidelity/dyhpo.py | 4 +- .../multi_fidelity/sampling_policy.py | 27 +- neps/search_spaces/__init__.py | 2 + neps/search_spaces/encoding.py | 132 ++++ 22 files changed, 671 insertions(+), 1600 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/combine_kernels.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/encoding.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/graph_kernel.py create mode 100644 neps/optimizers/bayesian_optimization/kernels/kernel.py create mode 100644 neps/search_spaces/encoding.py diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py index f2878fe9..0d77fbc6 100644 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/cost_cooling.py @@ -23,7 +23,7 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization @@ -124,7 +124,7 @@ def __init__( surrogate_model_args = surrogate_model_args or {} cost_model_args = cost_model_args or {} - graph_kernels, hp_kernels = get_kernels( + graph_kernels, hp_kernels = get_default_kernels( self.pipeline_space, domain_se_kernel, graph_kernels, diff --git 
a/neps/optimizers/bayesian_optimization/kernels/__init__.py b/neps/optimizers/bayesian_optimization/kernels/__init__.py index 8d11ea81..44c8e0ac 100644 --- a/neps/optimizers/bayesian_optimization/kernels/__init__.py +++ b/neps/optimizers/bayesian_optimization/kernels/__init__.py @@ -1,8 +1,12 @@ from __future__ import annotations +from dataclasses import dataclass from functools import partial from typing import Callable +from typing_extensions import TypeAlias +from neps.optimizers.bayesian_optimization.kernels.graph_kernel import GraphKernels +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary from .encoding import NASBOTDistance from .vectorial_kernels import HammingKernel, Matern32Kernel, Matern52Kernel, RBFKernel from .weisfilerlehman import WeisfilerLehman diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py deleted file mode 100644 index 3aa320b5..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels.py +++ /dev/null @@ -1,209 +0,0 @@ -import logging - -import torch - -from .utils import extract_configs -from .vectorial_kernels import HammingKernel, Stationary -from .weisfilerlehman import GraphKernels - - -def _select_dimensions(k): - if isinstance(k, HammingKernel): - return "categorical" - return "continuous" - - -class CombineKernel: - def __init__( - self, - combined_by="sum", - *kernels: list, - **kwargs, - ): - if combined_by not in ["sum", "product"]: - raise ValueError(f"Invalid value for combined_by ({combined_by})") - - self.has_graph_kernels = False - self.has_vector_kernels = False - self.lengthscale_bounds = (None, None) - for k in kernels: - if isinstance(k, GraphKernels): - self.has_graph_kernels = True - if not isinstance(k, GraphKernels): - self.has_vector_kernels = True - self.lengthscale_bounds = k.lengthscale_bounds - self.kernels = kernels - # Store the training graphs and vector features.. - self._gram = None - self.gr, self.x = None, None - self.combined_by = combined_by - - def fit_transform( - self, - weights: torch.Tensor, - configs: list, - normalize: bool = True, - rebuild_model: bool = True, - save_gram_matrix: bool = True, - gp_fit: bool = True, - feature_lengthscale: dict[str, torch.Tensor] | None = None, - **kwargs, - ): - N = len(configs) - K = torch.zeros(N, N) if self.combined_by == "sum" else torch.ones(N, N) - - gr1, x1 = extract_configs(configs) - - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels) and None not in gr1: - update_val = weights[i] * k.fit_transform( - [g[i] for g in gr1] if isinstance(gr1[0], (list, tuple)) else gr1, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - gp_fit=gp_fit, - **kwargs, - ) - - elif isinstance(k, Stationary) and None not in x1: - key = _select_dimensions(k) - update_val = ( - weights[i] - * k.fit_transform( - [x_[key] for x_ in x1], - l=feature_lengthscale[key] - if isinstance(feature_lengthscale, dict) - else None, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - ) - ).double() - - else: - raise NotImplementedError( - "For now, only the Stationary custom built kernel_operators are " - "supported! 
" - ) - - if self.combined_by == "sum": - K += update_val - elif self.combined_by == "product": - K *= update_val - - if normalize: - K_diag = torch.sqrt(torch.diag(K)) - K /= torch.ger(K_diag, K_diag) - if save_gram_matrix: - self._gram = K.clone() - - return K - - def transform( - self, - weights: torch.Tensor, - configs: list, - x=None, - feature_lengthscale=None, - ): - if self._gram is None: - raise ValueError( - "The kernel has not been fitted. Call fit_transform first to generate " - "the training Gram matrix." - ) - gr, x = extract_configs(configs) - # K is in shape of len(Y), len(X) - size = len(configs) - K = ( - torch.zeros(size, self._gram.shape[0]) - if self.combined_by == "sum" - else torch.ones(size, self._gram.shape[0]) - ) - - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels) and None not in gr: - update_val = weights[i] * k.transform( - [g[i] for g in gr] if isinstance(gr, list) else gr - ) - elif isinstance(k, Stationary) and None not in x: - key = _select_dimensions(k) - update_val = ( - weights[i] - * k.transform( - [x_[key] for x_ in x], - l=feature_lengthscale[key] - if isinstance(feature_lengthscale, dict) - else None, - ).double() - ) - else: - raise NotImplementedError( - "For now, only the Stationary custom built kernel_operators are " - "supported! " - ) - - if self.combined_by == "sum": - K += update_val - elif self.combined_by == "product": - K *= update_val - - return K.t() - - def clamp_theta_vector( - self, theta_vector: dict[str, torch.Tensor] - ) -> dict[str, torch.Tensor]: - for t_ in theta_vector.values(): - if t_.is_leaf: - t_.clamp_(self.lengthscale_bounds[0], self.lengthscale_bounds[1]) - - return theta_vector - - -class SumKernel(CombineKernel): - def __init__(self, *kernels, **kwargs): - super().__init__("sum", *kernels, **kwargs) - - def forward_t( - self, - weights: torch.Tensor, - gr2: list, - x2=None, - gr1: list = None, - x1=None, - feature_lengthscale=None, - ): - """ - Compute the kernel gradient w.r.t the feature vector - Parameters - ---------- - feature_lengthscale - x2 - x1 - gr1 - weights - gr2 - - Returns ------- grads: k list of 2-tuple. (K, x2) where K is the weighted Gram - matrix of that matrix, x2 is the leaf variable on which Jacobian-vector product - to be computed. 
- - """ - grads = [] - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels): - handle = k.forward_t(gr2, gr1=gr1) - grads.append((weights[i] * handle[0], handle[1], handle[2])) - elif isinstance(k, Stationary): - key = _select_dimensions(k) - handle = k.forward_t(x2=x2[key], x1=x1[key], l=feature_lengthscale[i]) - grads.append((weights[i] * handle[0], handle[1], handle[2])) - else: - logging.warning( - "Gradient not implemented for kernel type" + str(k.__name__) - ) - grads.append((None, None)) - assert len(grads) == len(self.kernels) - return grads - - -class ProductKernel(CombineKernel): - def __init__(self, *kernels, **kwargs): - super().__init__("product", *kernels, **kwargs) diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py index 2f3d2bf6..086cfc03 100644 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py +++ b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py @@ -195,50 +195,6 @@ class SumKernel(CombineKernel): def __init__(self, *kernels, **kwargs): super().__init__("sum", *kernels, **kwargs) - def forward_t( - self, - weights: torch.Tensor, - gr2: list, - x2=None, - gr1: list = None, - x1=None, - feature_lengthscale=None, - ): - """ - Compute the kernel gradient w.r.t the feature vector - Parameters - ---------- - feature_lengthscale - x2 - x1 - gr1 - weights - gr2 - - Returns - ------- - grads: k list of 2-tuple. - (K, x2) where K is the weighted Gram matrix of that matrix, x2 is the leaf variable on which Jacobian-vector - product to be computed. - - """ - weights = transform_weights(weights.clone()) - grads = [] - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels): - handle = k.forward_t(gr2, gr1=gr1) - grads.append((weights[i] * handle[0], handle[1], handle[2])) - elif isinstance(k, Stationary): - handle = k.forward_t(x2=x2, x1=x1, l=feature_lengthscale) - grads.append((weights[i] * handle[0], handle[1], handle[2])) - else: - logging.warning( - "Gradient not implemented for kernel type" + str(k.__name__) - ) - grads.append((None, None)) - assert len(grads) == len(self.kernels) - return grads - class ProductKernel(CombineKernel): def __init__(self, *kernels, **kwargs): diff --git a/neps/optimizers/bayesian_optimization/kernels/encoding.py b/neps/optimizers/bayesian_optimization/kernels/encoding.py deleted file mode 100644 index 419b6926..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/encoding.py +++ /dev/null @@ -1,277 +0,0 @@ -# Code from https://github.com/xingchenwan/nasbowl - -import networkx as nx -import numpy as np -import torch - -from .graph_kernel import GraphKernels - -INPUT = "input" -OUTPUT = "output" -CONV3X3 = "conv3x3-bn-relu" -CONV1X1 = "conv1x1-bn-relu" -MAXPOOL3X3 = "maxpool3x3" -OPS = [INPUT, CONV3X3, CONV1X1, MAXPOOL3X3, OUTPUT] -OPS_EX = [ - CONV3X3, - CONV1X1, - MAXPOOL3X3, -] - -OPS_201 = ["avg_pool_3x3", "nor_conv_1x1", "nor_conv_3x3", "none", "skip_connect"] - -NUM_VERTICES = 7 -OP_SPOTS = NUM_VERTICES - 2 -MAX_EDGES = 9 - - -def get_op_list(string): - # given a string, get the list of operations - tokens = string.split("|") - ops = [t.split("~")[0] for i, t in enumerate(tokens) if i not in [0, 2, 5, 9]] - return ops - - -def edit_distance(g1, g2): - g1_ops = get_op_list(g1.name) - g2_ops = get_op_list(g2.name) - return np.sum([1 for i in range(len(g1_ops)) if g1_ops[i] != g2_ops[i]]) - - -class NASBOTDistance(GraphKernels): - """NASBOT 
OATMANN distance according to BANANAS paper""" - - def __init__( - self, - node_name="op_name", - include_op_list=None, - exclude_op_list=None, - lengthscale=3.0, - normalize=True, - max_size=None, - **kwargs, - ): - super().__init__(**kwargs) - self.node_name = node_name - self.include_op_list = include_op_list if include_op_list is not None else OPS - self.exclude_op_list = exclude_op_list if exclude_op_list is not None else [] - self.normalize = normalize - self.lengthscale = lengthscale - self.max_size = max_size - self._gram = None - - def _compute_kernel(self, dist, l=None): - if dist is None: - return 0.0 - if l is None: - l = self.lengthscale - return np.exp(-dist / (l**2)) - - def _compute_dist( - self, - g1: nx.Graph, - g2: nx.Graph, - ): - # if cell-based nasbench201 - if "~" in g1.name: - g1_ops = get_op_list(g1.name) - g2_ops = get_op_list(g2.name) - - g1_counts = [g1_ops.count(op) for op in OPS_201] - g2_counts = [g2_ops.count(op) for op in OPS_201] - ops_dist = np.sum(np.abs(np.subtract(g1_counts, g2_counts))) - edit_dist = edit_distance(g1, g2) - return ops_dist + edit_dist - else: - # adjacency matrices - a1 = nx.to_numpy_array(g1) - a2 = nx.to_numpy_array(g2) - row_sums = sorted(np.array(a1).sum(axis=0)) - col_sums = sorted(np.array(a1).sum(axis=1)) - - other_row_sums = sorted(np.array(a2).sum(axis=0)) - other_col_sums = sorted(np.array(a2).sum(axis=1)) - - row_sums_arr = np.atleast_2d(row_sums) - col_sums_arr = np.atleast_2d(col_sums) - - other_row_sums_arr = np.atleast_2d(other_row_sums) - other_col_sums_arr = np.atleast_2d(other_col_sums) - row_dist = np.sum( - np.abs(np.diag(np.subtract(row_sums_arr, other_row_sums_arr.T))) - ) - col_dist = np.sum( - np.abs(np.diag(np.subtract(col_sums_arr, other_col_sums_arr.T))) - ) - counts = [0] * len(self.include_op_list) - other_counts = [0] * len(self.include_op_list) - for _, attrs in g1.nodes(data=True): - op_name = attrs[self.node_name] - if op_name not in self.exclude_op_list: - idx = self.include_op_list.index(op_name) - counts[idx] += 1 - for _, attrs in g2.nodes(data=True): - op_name = attrs[self.node_name] - if op_name not in self.exclude_op_list: - idx = self.include_op_list.index(op_name) - other_counts[idx] += 1 - - ops_dist = np.sum(np.abs(np.subtract(counts, other_counts))) - return (row_dist + col_dist + ops_dist) + 0.0 - - def forward(self, *graphs: nx.Graph, l: float = None): - n = len(graphs) - K = torch.zeros((n, n)) - for i in range(n): - for j in range(i, n): - K[i, j] = self._compute_kernel( - self._compute_dist(graphs[i], graphs[j]), l - ) - K[j, i] = K[i, j] - if self.normalize: - K = self.normalize_gram(K) - return K - - def fit_transform( - self, - gr: list, - l: float = None, - rebuild_model: bool = False, - save_gram_matrix: bool = False, - **kwargs, - ): - if not rebuild_model and self._gram is not None: - return self._gram - K = self.forward(*gr, l=l) - if save_gram_matrix: - self._gram = K.clone() - self._train_x = gr[:] - return K - - def transform(self, gr: list, l: float = None, **kwargs): - if self._gram is None: - raise ValueError("The kernel has not been fitted. 
Run fit_transform first") - n = len(gr) - K = torch.zeros((len(self._train_x), n)) - for i, _ in enumerate(self._train_x): - for j in range(n): - K[i, j] = self._compute_kernel( - self._compute_dist(self._train_x[i], gr[j]), l - ) - return K - - -class AdjacencyDistance( - NASBOTDistance, -): - def _compute_dist(self, g1: nx.Graph, g2: nx.Graph): - # adjacency matrices - a1 = nx.to_numpy_array(g1) - a2 = nx.to_numpy_array(g2) - x1 = np.array([attrs[self.node_name] for node, attrs in g1.nodes(data=True)]) - x2 = np.array([attrs[self.node_name] for node, attrs in g2.nodes(data=True)]) - graph_dist = np.sum(a1 != a2) - ops_dist = np.sum(x1 != x2) - return (graph_dist + ops_dist) + 0.0 - - -class PathDistance(NASBOTDistance): - def get_paths(self, g: nx.Graph): - """ - return all paths from input to output - """ - paths: list = [] - matrix = nx.to_numpy_array(g) - ops: list = [] - for _, attr in g.nodes(data=True): - ops.append(attr[self.node_name]) - for j in range(0, NUM_VERTICES): - if matrix[0][j]: - paths.append([[]]) - else: - paths.append([]) - - # create paths sequentially - for i in range(1, NUM_VERTICES - 1): - for j in range(1, NUM_VERTICES): - if matrix[i][j]: - for path in paths[i]: - paths[j].append([*path, ops[i]]) - return paths[-1] - - def get_path_indices(self, g: nx.Graph): - """ - compute the index of each path - There are 3^0 + ... + 3^5 paths total. - (Paths can be length 0 to 5, and for each path, for each node, there - are three choices for the operation.) - """ - paths = self.get_paths(g) - mapping = {CONV3X3: 0, CONV1X1: 1, MAXPOOL3X3: 2} - path_indices = [] - - for path in paths: - index = 0 - for i in range(NUM_VERTICES - 1): - if i == len(path): - path_indices.append(index) - break - else: - index += len(OPS_EX) ** i * (mapping[path[i]] + 1) - - return tuple(path_indices) - - @staticmethod - def get_paths_201(g: nx.Graph): - """ - return all paths from input to output - """ - path_blueprints = [[3], [0, 4], [1, 5], [0, 2, 5]] - ops = get_op_list(g.name) - paths = [] - for blueprint in path_blueprints: - paths.append([ops[node] for node in blueprint]) - - return paths - - def get_path_indices_201(self, g: nx.Graph): - """ - compute the index of each path - """ - paths = self.get_paths_201(g) - path_indices = [] - NUM_OPS = len(OPS_201) - for i, path in enumerate(paths): - if i == 0: - index = 0 - elif i in [1, 2]: - index = NUM_OPS - else: - index = NUM_OPS + NUM_OPS**2 - for j, op in enumerate(path): - index += OPS_201.index(op) * NUM_OPS**j - path_indices.append(index) - - return tuple(path_indices) - - def encode_paths(self, g: nx.Graph): - """output one-hot encoding of paths""" - if "~" in g.name: - LONGEST_PATH_LENGTH = 3 - num_paths = sum(len(OPS_201) ** i for i in range(1, LONGEST_PATH_LENGTH + 1)) - path_indices = self.get_path_indices_201(g) - elif "101" in g.name: - num_paths = sum(len(OPS_EX) ** i for i in range(OP_SPOTS + 1)) - path_indices = self.get_path_indices(g) - else: - num_paths = sum(len(self.op_list) ** i for i in range(self.max_size - 1)) - path_indices = self.get_paths(g) - path_encoding = np.zeros(num_paths) - for index in path_indices: - path_encoding[index] = 1 - return path_encoding - - def _compute_dist(self, g1: nx.Graph, g2: nx.Graph): - encode1 = self.encode_paths(g1) - encode2 = self.encode_paths(g2) - return np.sum(np.array(encode1 != np.array(encode2))) diff --git a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py index f606f442..3ed9b5b9 100644 --- 
a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py @@ -1,40 +1,58 @@ from __future__ import annotations -from neps.utils.common import instance_from_map -from ....search_spaces.architecture.core_graph_grammar import CoreGraphGrammar -from ....search_spaces.hyperparameters.categorical import CategoricalParameter -from ....search_spaces.hyperparameters.float import FloatParameter -from ....search_spaces.hyperparameters.integer import IntegerParameter -from ....utils.common import has_instance -from . import GraphKernelMapping, StationaryKernelMapping - - -def get_kernels( - pipeline_space, domain_se_kernel, graph_kernels, hp_kernels, optimal_assignment -): - if not graph_kernels: - graph_kernels = [] - if has_instance(pipeline_space.values(), CoreGraphGrammar): - graph_kernels.append("wl") - if not hp_kernels: - hp_kernels = [] - if has_instance(pipeline_space.values(), FloatParameter, IntegerParameter): - hp_kernels.append("m52") - if has_instance(pipeline_space.values(), CategoricalParameter): - hp_kernels.append("hm") - graph_kernels = [ - instance_from_map(GraphKernelMapping, kernel, "kernel", as_class=True)( - oa=optimal_assignment, - se_kernel=instance_from_map( - StationaryKernelMapping, domain_se_kernel, "se kernel" - ), +from neps.optimizers.bayesian_optimization.kernels import Kernel +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( + HammingKernel, + Matern52Kernel, +) +import torch +from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import WeisfilerLehman + +from neps.search_spaces import SearchSpace + + +# TODO: Option to combine numerical and categorical into one. +def get_default_kernels( + *, + space: SearchSpace, + optimizable: bool = True, +) -> list[tuple[Kernel, list[str]],]: + kernels: list[tuple[Kernel, list[str]]] = [] + if any(space.graphs): + h = 2 + if optimizable: + layer_weights = torch.nn.Parameter(torch.ones(h + 1)) + else: + layer_weights = None + + kernels.append( + ( + WeisfilerLehman(h=2, layer_weights=layer_weights, oa=True), + list(space.graphs.keys()), + ) ) - for kernel in graph_kernels - ] - hp_kernels = [ - instance_from_map(StationaryKernelMapping, kernel, "kernel") - for kernel in hp_kernels - ] - if not graph_kernels and not hp_kernels: - raise ValueError("No kernels are provided!") - return graph_kernels, hp_kernels + + if any(space.categoricals): + if optimizable: + lengthscales = torch.nn.Parameter(torch.ones(len(space.categoricals))) + else: + lengthscales = torch.ones(len(space.categoricals)) + + kernels.append( + ( + HammingKernel(lengthscale=lengthscales), + list(space.categoricals.keys()), + ) + ) + + if any(space.numerical): + if optimizable: + lengthscales = torch.nn.Parameter(torch.ones(len(space.numerical))) + else: + lengthscales = torch.ones(len(space.numerical)) + + kernels.append( + (Matern52Kernel(lengthscale=lengthscales), list(space.numerical.keys())) + ) + + return kernels diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py index e69de29b..ac1c60ad 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py @@ -0,0 +1,8 @@ +from neps.optimizers.bayesian_optimization.kernels.grakel_replace.vertex_histogram import ( + VertexHistogram, +) +from 
neps.optimizers.bayesian_optimization.kernels.grakel_replace.weisfeiler_lehman import (
+    WeisfeilerLehman,
+)
+
+__all__ = ["VertexHistogram", "WeisfeilerLehman"]
diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py
index 285b067c..103818ae 100644
--- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py
+++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py
@@ -1,4 +1,5 @@
 """The vertex kernel as defined in :cite:`sugiyama2015halting`."""
+
 import logging
 from collections import Counter
 from collections.abc import Iterable
@@ -54,7 +55,7 @@ def __init__(
         sparse="auto",
         oa=False,
         mahalanobis_precision=None,
-        se_kernel: Stationary = None,
+        se_kernel: Stationary | None = None,
         requires_ordered_features: bool = False,
         as_tensor: bool = True,
     ):
@@ -75,6 +76,7 @@ def __init__(
                 self.sparse = False
             else:
                 self.sparse = sparse
+        self.oa = oa
         self.se_kernel = se_kernel
         self._initialized.update({"sparse": True})

@@ -220,8 +222,11 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None):
                 except MemoryError:
                     warn("memory-error: switching to sparse")
-                    self.sparse_, features = True, csr_matrix(
-                        (data, (rows, cols)), shape=(ni, label_length), copy=False
+                    self.sparse_, features = (
+                        True,
+                        csr_matrix(
+                            (data, (rows, cols)), shape=(ni, label_length), copy=False
+                        ),
                     )

         if ni == 0:
@@ -257,7 +262,7 @@ def _calculate_kernel_matrix(self, Y=None):
                         K[j, i] = K[i, j]
             else:
                 if self.se_kernel is not None:
-                    K = self.se_kernel.forward(self.X, self.X)
+                    K = self.se_kernel._forward(self.X, self.X)
                 else:
                     K = self.X @ self.X.T
         else:
@@ -270,7 +275,7 @@ def _calculate_kernel_matrix(self, Y=None):
                     )
             else:
                 if self.se_kernel is not None:
-                    K = self.se_kernel.forward(self.X, Y)
+                    K = self.se_kernel._forward(self.X, Y)
                 else:
                     K = Y[:, : self.X.shape[1]] @ self.X.T

diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py
index 890c2c8d..dd5dd829 100644
--- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py
+++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py
@@ -18,7 +18,6 @@
 from .vertex_histogram import VertexHistogram

-warnings.filterwarnings("ignore", message="Importing from numpy.matlib is deprecated ")

 class WeisfeilerLehman(Kernel):
     """Compute the Weisfeiler Lehman Kernel.
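# Editor's illustrative sketch, not part of this patch: how a vertex-histogram
# style base kernel turns per-graph node-label counts into a Gram matrix, with
# the plain linear product as the default and a histogram-intersection style
# score standing in for the optimal-assignment ("oa") variant. The function
# below is hypothetical; the real implementation is grakel_replace/vertex_histogram.py.
from collections import Counter

import torch


def vertex_histogram_gram(graph_labels: list[list[str]], oa: bool = False) -> torch.Tensor:
    """graph_labels[i] is the multiset of node labels of graph i."""
    vocab = sorted({lab for labels in graph_labels for lab in labels})
    index = {lab: j for j, lab in enumerate(vocab)}

    # Feature matrix: one row per graph, one column per observed label.
    X = torch.zeros(len(graph_labels), len(vocab), dtype=torch.float64)
    for i, labels in enumerate(graph_labels):
        for lab, count in Counter(labels).items():
            X[i, index[lab]] = count

    if oa:
        # Optimal-assignment flavour: sum of element-wise minima of the histograms.
        return torch.minimum(X[:, None, :], X[None, :, :]).sum(dim=-1)
    return X @ X.T  # linear kernel on the count features


if __name__ == "__main__":
    K = vertex_histogram_gram([["conv", "conv", "pool"], ["conv", "id"]], oa=True)
    print(K)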
@@ -71,7 +70,7 @@ def __init__( h: int = 5, base_graph_kernel=VertexHistogram, node_weights=None, - layer_weights=None, + layer_weights: torch.Tensor | None = None, as_tensor: bool = True, ): """Initialise a `weisfeiler_lehman` kernel.""" @@ -114,7 +113,7 @@ def initialize(self): if base_graph_kernel is None: base_graph_kernel, params = VertexHistogram, dict() # TODO: make sure we're always passing like this - elif type(base_graph_kernel) is type and issubclass( + elif type(base_graph_kernel) is type and issubclass( # pylint: disable=C0123 base_graph_kernel, Kernel ): params = dict() @@ -129,7 +128,7 @@ def initialize(self): ) from _error if not ( - type(base_graph_kernel) is type + type(base_graph_kernel) is type # pylint: disable=C0123 and issubclass(base_graph_kernel, Kernel) ): raise TypeError( @@ -159,10 +158,10 @@ def initialize(self): self._h = self.h + 1 self._initialized["h"] = True - if self.layer_weights is None or self.layer_weights.shape[0] != self._h: - self.layer_weights = np.ones((self._h,)) - if self.as_tensor and not isinstance(self.layer_weights, torch.Tensor): - self.layer_weights = torch.tensor(self.layer_weights) + if self.layer_weights is None: + self.layer_weights = torch.ones((self._h,)) + else: + assert len(self.layer_weights) == self._h self._initialized["h"] = True self._initialized["layer_weights"] = True @@ -424,9 +423,7 @@ def generate_graphs(label_count: int, WL_labels_inverse): return K, base_graph_kernel return np.sum(K, axis=0), base_graph_kernel - def fit_transform( - self, X: Iterable, y=None, gp_fit: bool = True - ): + def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: disable=unused-argument """Fit and transform, on the same dataset. Parameters diff --git a/neps/optimizers/bayesian_optimization/kernels/graph_kernel.py b/neps/optimizers/bayesian_optimization/kernels/graph_kernel.py deleted file mode 100644 index b9d10102..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/graph_kernel.py +++ /dev/null @@ -1,35 +0,0 @@ -import torch - - -class GraphKernels: - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.n_hyperparameters = 0 - self.rbf_lengthscale = False - self.kern = None - self.__name__ = "GraphKernelBase" - - @staticmethod - def normalize_gram(K: torch.Tensor): - K_diag = torch.sqrt(torch.diag(K)) - K_diag_outer = torch.ger(K_diag, K_diag) - return K / K_diag_outer - - def fit_transform( - self, gr: list, rebuild_model=False, save_gram_matrix=False, **kwargs - ): - raise NotImplementedError - - def transform( - self, - gr: list, - ): - raise NotImplementedError - - def forward_t(self, gr2, gr1: list = None): - """ - Compute the derivative of the kernel function k(phi, phi*) with respect to phi* (the training point) - """ - raise NotImplementedError( - "The kernel gradient is not implemented for the graph kernel called!" 
- ) diff --git a/neps/optimizers/bayesian_optimization/kernels/kernel.py b/neps/optimizers/bayesian_optimization/kernels/kernel.py new file mode 100644 index 00000000..52db2751 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/kernels/kernel.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import math +import inspect +import copy +from typing import TypeVar, Generic, Any, Sequence, Mapping, Callable +from typing_extensions import Self +import torch +import torch.nn as nn + +from neps.utils.types import NotSet + +T = TypeVar("T") + + +class Kernel(nn.Module, Generic[T]): + def fit_transform(self, x: T) -> torch.Tensor: + raise NotImplementedError + + def transform(self, x: T) -> torch.Tensor: + raise NotImplementedError + + def clone(self) -> Self: + return self.clone_with() + + def clone_with(self, **params: dict[str, Any]) -> Self: + # h ttps://github.com/scikit-learn/scikit-learn/blob/70fdc843a4b8182d97a3508c1a426acc5e87e980/sklearn/base.py#L197 + sig = inspect.signature(self.__init__) + + self_values = {} + for p in sig.parameters.values(): + if p.name == "self": + continue + + attr = getattr(self, p.name, NotSet) + if attr is NotSet: + raise ValueError( + f"Could not clone as the variable {p.name} was not set in" + f" the constructor on the object: {self}" + ) + self_values[p.name] = params.get(p.name, attr) + + new_self_values = copy.deepcopy(self_values) + return self.__class__(**new_self_values) + + def grid_search( + self, + x: T, + *, + grid: Sequence[Mapping[str, Any]], + to_minimize: Callable[[torch.Tensor], float], + ) -> tuple[Self, float]: + if len(grid) == 0: + raise ValueError("Grid must have at least one element.") + + def _fit_and_eval(_params: Mapping[str, Any]) -> tuple[Kernel[T], float]: + cloned_kernel = self.clone_with(**_params) + K = cloned_kernel.fit_transform(x) + metric = to_minimize(K) + return cloned_kernel, metric + + return min( + (_fit_and_eval(params) for params in grid), + key=lambda x: x[1], + ) + + +class NumericKernel(Kernel[torch.Tensor]): ... + + +PI = torch.tensor(math.pi) + + +def compute_normalized_log_marginal_likelihood( + K_i: torch.Tensor, + logDetK: torch.Tensor, + y: torch.Tensor, +) -> torch.Tensor: + """Compute the zero mean Gaussian process log marginal likelihood + given the inverse of Gram matrix K(x2,x2), its log determinant, + and the training label vector y. + """ + lml = -0.5 * (y.t() @ K_i @ y) + 0.5 * logDetK - y.shape[0] / 2.0 * torch.log(2 * PI) + return lml / y.shape[0] + + +def compute_pd_inverse( + K: torch.Tensor, + *, + jitter: float | torch.Tensor = 1e-9, + attempts: int = 3, +) -> tuple[torch.Tensor, torch.Tensor]: + """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" + n = K.shape[0] + assert ( + isinstance(jitter, float) or jitter.ndim == 0 + ), "only homoscedastic noise variance is allowed here!" 
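# Editor's illustrative usage sketch, not part of this patch, for the
# Kernel.clone_with / grid_search API introduced above: each grid entry is a
# mapping of constructor arguments, the kernel is cloned with those values,
# fit on the data, and scored by a user-supplied `to_minimize` callable.
# The toy kernel and the scoring criterion below are made up to mimic the interface.
import torch


class ToyKernel:
    def __init__(self, lengthscale: float = 1.0):
        self.lengthscale = lengthscale

    def clone_with(self, **params):
        return ToyKernel(lengthscale=params.get("lengthscale", self.lengthscale))

    def fit_transform(self, x: torch.Tensor) -> torch.Tensor:
        d = torch.cdist(x, x) / self.lengthscale
        return torch.exp(-0.5 * d**2)

    def grid_search(self, x, *, grid, to_minimize):
        scored = [
            (cloned, to_minimize(cloned.fit_transform(x)))
            for cloned in (self.clone_with(**params) for params in grid)
        ]
        return min(scored, key=lambda pair: pair[1])


x = torch.randn(5, 2, dtype=torch.float64)
grid = [{"lengthscale": l} for l in (0.1, 1.0, 10.0)]
# Deviation from identity stands in here for the negative log marginal
# likelihood that the real code minimises.
best, score = ToyKernel().grid_search(
    x,
    grid=grid,
    to_minimize=lambda K: torch.norm(K - torch.eye(5, dtype=torch.float64)).item(),
)
print(best.lengthscale, score)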
+ for i in range(attempts): + try: + jitter_diag = jitter * torch.eye(n, device=K.device) * 10**i + Kc = torch.linalg.cholesky(K + jitter_diag) + break + except RuntimeError: + pass + else: + raise RuntimeError(f"Gram matrix not positive definite despite of jitter:\n{K}") + + logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) + K_i = torch.cholesky_inverse(Kc) + return K_i.to(dtype=torch.float64), logDetK.to(dtype=torch.float64) diff --git a/neps/optimizers/bayesian_optimization/kernels/utils.py b/neps/optimizers/bayesian_optimization/kernels/utils.py index 92ee1817..e134bfd0 100644 --- a/neps/optimizers/bayesian_optimization/kernels/utils.py +++ b/neps/optimizers/bayesian_optimization/kernels/utils.py @@ -33,16 +33,22 @@ def extract_configs(configs: list[SearchSpace]) -> Tuple[list, list]: """ config_hps = [conf.get_normalized_hp_categories() for conf in configs] graphs = [hps["graphs"] for hps in config_hps] + # Don't call np.array on structured objects # https://github.com/numpy/numpy/issues/24546#issuecomment-1693913119 # _nested_graphs = np.array(graphs, dtype=object) # if _nested_graphs.ndim == 3 # graphs = _nested_graphs[:, :, 0].reshape(-1).tolist() # Long hand way of doing the above - if (len(graphs) > 0 and isinstance(graphs[0], list) - and len(graphs[0]) > 0 and isinstance(graphs[0][0], list)): - res = [_list for list_of_list in graphs for _list in list_of_list] - graphs = res + # I guess this is just flattening... + if ( + len(graphs) > 0 + and isinstance(graphs[0], list) + and len(graphs[0]) > 0 + and isinstance(graphs[0][0], list) + ): + graphs = [_list for list_of_list in graphs for _list in list_of_list] + return graphs, config_hps diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index 9d0d4df2..8e7a1074 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,149 +1,80 @@ from __future__ import annotations -from copy import deepcopy + from math import sqrt -from dataclasses import dataclass, field from typing_extensions import override +from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel import numpy as np import torch -LENGTHSCALE_BOUNDS_DEFAULT = ( - np.exp(-6.754111155189306), - np.exp(0.0858637988771976), -) +DEFAULT_LENGTHSCALE_BOUNDS = np.exp(-6.754111155189306), np.exp(0.0858637988771976) -@dataclass -class Stationary: +class Stationary(Kernel[torch.Tensor]): """Here we follow the structure of GPy to build a sub class of stationary kernel. - All the classes (i.e. the class of stationary kernel_operators) derived from this - class use the scaled distance to compute the Gram matrix.""" - - # A single value applies to all dimensions, a vector applies to each dimension - lengthscale: torch.Tensor = field(default_factory=lambda: torch.tensor(1.0)) - lengthscale_bounds: tuple[float, float] = LENGTHSCALE_BOUNDS_DEFAULT - outputscale: float = 1.0 - gram_: torch.Tensor | None = None - train_: torch.Tensor | None = None - - def forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - return _scaled_distance(lengthscale, x1, x2) + All the classes (i.e. the class of stationary kernel_operators) derived from this + class use the scaled distance to compute the Gram matrix. 
+ """ - def fit_transform( + def __init__( self, - x1, - l: torch.Tensor | None = None, - rebuild_model: bool = True, - save_gram_matrix: bool = True, - ) -> torch.Tensor: - if not rebuild_model and self.gram_ is not None: - return self.gram_ - K = self.forward(x1, l=l) - if save_gram_matrix: - self.train_ = deepcopy(x1) - assert isinstance(K, torch.Tensor), "it doesnt work with np arrays.." - self.gram_ = K.clone() + *, + lengthscale: torch.Tensor, + outputscale: float | torch.Tensor = 1.0, + lengthscale_bounds: tuple[float, float] = DEFAULT_LENGTHSCALE_BOUNDS, + ): + self.lengthscale = lengthscale + self.outputscale = outputscale + self.lengthscale_bounds = lengthscale_bounds + + self.gram_: torch.Tensor | None = None + self.train_: torch.Tensor | None = None + + def fit_transform(self, x: torch.Tensor) -> torch.Tensor: + K = self._forward(x) + self.train_ = x.clone().detach() return K - def transform(self, x1, l: torch.Tensor | None = None) -> torch.Tensor: - if self.gram_ is None or self.train_ is None: + def transform(self, x: torch.Tensor) -> torch.Tensor: + if self.train_ is None: raise ValueError("The kernel has not been fitted. Run fit_transform first") - return self.forward(self.train_, x1, l=l) + return self._forward(self.train_, x) - def forward_t( - self, - x2: torch.Tensor, - x1: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, torch.Tensor]: - if x1 is None: - x1 = torch.tensor(self.train_) - x2 = torch.tensor(x2, requires_grad=True) - K = self.forward(x1, x2, l) - return K, x2 - - def update_hyperparameters(self, lengthscale: torch.Tensor) -> None: - self.lengthscale = torch.clamp(lengthscale, *self.lengthscale_bounds) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: + return _scaled_distance(self.lengthscale, x1, x2) -@dataclass class RBFKernel(Stationary): @override - def forward( + def _forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) + dist_sq = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) return self.outputscale * torch.exp(-0.5 * dist_sq) -@dataclass -class LayeredRBFKernel(RBFKernel): - """ - Same as the conventional RBF kernel, but adapted in a way as a midway between - spherical RBF and ARD RBF. In this case, one weight is assigned to each - Weisfiler-Lehman iteration only (e.g. one weight for h=0, another for h=1 and etc.) 
- """ - - @override - def forward( - self, - ard_dims: torch.Tensor, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> torch.Tensor: - _l = l if l is not None else self.lengthscale - assert isinstance(_l, torch.Tensor), "Lengthscale must be a torch tensor" - assert _l.shape[0] == ard_dims.shape[0], ( - "LayeredRBF expects the lengthscale vector to have the same " - "dimensionality as the " - "number of WL iterations, but got lengthscale vector of shape" - + str(_l.shape[0]) - + "and WL iteration of shape " - + str(ard_dims.shape[0]) - ) - - M = torch.cat( - [torch.ones(int(ard_dims[i])) * _l[i] for i in range(len(ard_dims))] - ) - return super().forward(x1, x2, M) - - -@dataclass class Matern32Kernel(Stationary): @override - def forward( + def _forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist = _scaled_distance(lengthscale, x1, x2) + dist = _scaled_distance(self.lengthscale, x1, x2) return self.outputscale * (1 + sqrt(3.0) * dist) * torch.exp(-sqrt(3.0) * dist) class Matern52Kernel(Stationary): @override - def forward( + def _forward( self, x1: torch.Tensor, x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist = _scaled_distance(lengthscale, x1, x2, sq_dist=True) + dist = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) return ( self.outputscale * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * dist) @@ -151,36 +82,6 @@ def forward( ) -@dataclass -class HammingKernel(Stationary): - @override - def forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist = _hamming_distance(lengthscale, x1, x2) - return self.outputscale * dist - - -@dataclass -class RationalQuadraticKernel(Stationary): - power: float = 2.0 - - @override - def forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - l: torch.Tensor | None = None, - ) -> torch.Tensor: - lengthscale = l if l is not None else self.lengthscale - dist_sq = _scaled_distance(lengthscale, x1, x2, sq_dist=True) - return self.outputscale * (1 + dist_sq / 2.0) ** (-self.power) - - def _unscaled_square_distance( X: torch.Tensor, X2: torch.Tensor | None = None, diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py index e6550d65..b1d4cd7e 100644 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py @@ -1,335 +1,81 @@ -import logging +from __future__ import annotations -import numpy as np import torch -from grakel.kernels import ShortestPathAttr -from grakel.utils import graph_from_networkx -from .grakel_replace.edge_histogram import EdgeHistogram -from .grakel_replace.utils import calculate_kernel_matrix_as_tensor -from .grakel_replace.vertex_histogram import VertexHistogram -from .grakel_replace.weisfeiler_lehman import WeisfeilerLehman as _WL -from .graph_kernel import GraphKernels -from .utils import transform_to_undirected -from .vectorial_kernels import Stationary +from typing import Sequence +from neps.optimizers.bayesian_optimization.kernels.grakel_replace import ( + VertexHistogram, + WeisfeilerLehman as _WL, +) +from 
neps.optimizers.bayesian_optimization.kernels.kernel import Kernel +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary +from neps.search_spaces.encoding import WLInput -class WeisfilerLehman(GraphKernels): +class WeisfilerLehman(Kernel[Sequence[WLInput]]): """Weisfiler Lehman kernel using grakel functions""" def __init__( self, h: int = 0, - base_type: str = "subtree", - se_kernel: Stationary = None, - layer_weights=None, - node_weights=None, + se_kernel: Stationary | None = None, + layer_weights: torch.Tensor | None = None, oa: bool = False, node_label: str = "op_name", - edge_label: tuple = "op_name", - n_jobs: int = None, - return_tensor: bool = True, - requires_grad: bool = False, - undirected: bool = False, - **kwargs, ): + """Initializes the Weisfeiler-Lehman kernel. + + Args: + h: The number of Weisfeiler-Lehman iterations + se_kernel: defines a stationary vector kernel to be used for + successive embedding (i.e. the kernel function on which the + vector embedding inner products are computed). + If None, uses the default linear kernel + layer_weights: The weights for each layer of the Weisfeiler-Lehman kernel. + If None, uses uniform + oa: whether the optimal assignment variant of the Weisfiler-Lehman + kernel should be used + node_label: the node_label defining the key node attribute. """ - - Parameters - ---------- - h: int: The number of Weisfeiler-Lehman iterations - base_type: str: defines the base kernel of WL iteration. Possible types are 'subtree' (default), 'sp': shortest path - and 'edge' (The latter two are untested) - se_kernel: Stationary. defines a stationary vector kernel to be used for successive embedding (i.e. the kernel - function on which the vector embedding inner products are computed). if None, use the default linear kernel - node_weights - oa: whether the optimal assignment variant of the Weisfiler-Lehman kernel should be used - node_label: the node_label defining the key node attribute. - edge_label: the edge label defining the key edge attribute. only relevant when base_type == 'edge' - n_jobs: Parallisation to be used. *current version does not support parallel computing' - return_tensor: whether return a torch tensor. If False, a numpy array will be returned. - kwargs - """ - super().__init__(**kwargs) if se_kernel is not None and oa: raise ValueError( "Only one or none of se (successive embedding) and oa (optimal assignment) may be true!" 
) + self.h = h + self.se_kernel = se_kernel + self.layer_weights = layer_weights self.oa = oa self.node_label = node_label - self.edge_label = edge_label - self.layer_weights = layer_weights - self.se = se_kernel - self.requires_grad = requires_grad - self.undirected = undirected + if node_label != "op_name": + raise NotImplementedError("Only 'op_name' is supported for node_label") - if base_type not in ["subtree", "sp", "edge"]: - raise ValueError(f"Invalid value for base_type ({base_type})") - if base_type == "subtree": - base_kernel = VertexHistogram, { - "sparse": False, - "requires_ordered_features": requires_grad, - } - if oa: - base_kernel = VertexHistogram, { - "oa": True, - "sparse": False, - "requires_ordered_features": requires_grad, - } - elif se_kernel is not None: - base_kernel = VertexHistogram, { - "se_kernel": se_kernel, - "sparse": False, - "requires_ordered_features": requires_grad, - } - elif base_type == "edge": - base_kernel = EdgeHistogram, {"sparse": False} - if oa: - base_kernel = EdgeHistogram, { - "oa": True, - "sparse": False, - "requires_ordered_features": requires_grad, - } - elif se_kernel is not None: - base_kernel = EdgeHistogram, { - "se_kernel": se_kernel, - "sparse": False, - "requires_ordered_features": requires_grad, - } + self.wl_kernel_: _WL | None = None - elif base_type == "sp": - base_kernel = ShortestPathAttr, {} - else: - raise NotImplementedError( - "The selected WL base kernel type" - + str(base_type) - + " is not implemented." - ) - self.base_type = base_type - self.kern = _WL( - n_jobs, - h=h, - base_graph_kernel=base_kernel, - normalize=True, + def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + self.wl_kernel_ = _WL( + h=self.h, + base_graph_kernel=( # type: ignore + VertexHistogram, + { + "sparse": False, + "se_kernel": self.se_kernel, + "oa": self.oa, + "requires_ordered_features": True, + }, + ), layer_weights=self.layer_weights, - node_weights=node_weights, - ) - self.return_tensor = return_tensor - self._gram = None - self._train, self._train_transformed = None, None - self.__name__ = "WeisfeilerLehman" - - def change_se_params(self, params: dict): - """Change the kernel parameter of the successive embedding kernel.""" - if self.se is None: - logging.warning("SE kernel is None. change_se_params action voided.") - return - for k, v in params.items(): - try: - setattr(self.se, k, v) - except AttributeError: - logging.warning( - str(k) + " is not a valid attribute name of the SE kernel." 
- ) - continue - self.kern.change_se_kernel(self.se) - - def get_info_se_kernel(self): - return self.se.lengthscale, self.kern.X[0].X.shape[1] - - def change_kernel_params(self, params: dict): - for k, v in params.items(): - try: - getattr(self.kern, k) - setattr(self.kern, k, v) - except AttributeError: - logging.warning(str(k) + " is not a valid attribute name of this kernel.") - continue - try: - setattr(self, k, v) - except AttributeError: - pass - for k in self.kern._initialized.keys(): - self.kern._initialized[k] = False - - self.kern.initialize() - - def fit_transform( - self, - gr: list, - rebuild_model: bool = False, - save_gram_matrix: bool = True, - layer_weights=None, - gp_fit: bool = True, - **kwargs, - ): - # Transform into GraKeL graph format - if rebuild_model is False and self._gram is not None: - return self._gram - if self.undirected: - gr = transform_to_undirected(gr) - if self.base_type == "edge": - if not all([g.graph_type == "edge_attr" for g in gr]): - raise ValueError( - "One or more graphs passed are not edge-attributed graphs. You need all graphs to be" - "in edge format to use 'edge' type Weisfiler-Lehman kernel." - ) - - gr_ = list(graph_from_networkx(gr, self.node_label, self.edge_label)) - else: - gr_ = list( - graph_from_networkx( - gr, - self.node_label, - ) - ) - - if rebuild_model or self._gram is None: - self._train = gr[:] - self._train_transformed = gr_[:] - - if layer_weights is not None and layer_weights is not self.layer_weights: - self.change_kernel_params({"layer_weights": layer_weights}) - self.layer_weights = layer_weights - - K = self.kern.fit_transform(gr_, gp_fit=gp_fit) - if self.return_tensor and not isinstance(K, torch.Tensor): - K = torch.tensor(K) - if save_gram_matrix: - self._gram = K.clone() - self.layer_weights = self.kern.layer_weights - return K - - def transform( - self, - gr: list, - ): - """transpose: by default, the grakel produces output in shape of len(y) * len(x2). Use transpose to - reshape that to a more conventional shape..""" - if self.undirected: - gr = transform_to_undirected(gr) - if self.base_type == "edge": - if not all([g.graph_type == "edge_attr" for g in gr]): - raise ValueError( - "One or more graphs passed are not edge-attributed graphs. You need all graphs to be" - "in edge format to use 'edge' type Weisfiler-Lehman kernel." - ) - gr_ = graph_from_networkx(gr, self.node_label, self.edge_label) - else: - gr_ = graph_from_networkx( - gr, - self.node_label, - ) - - K = self.kern.transform(gr_) - if self.return_tensor and not isinstance(K, torch.Tensor): - K = torch.tensor(K) - return K - - def forward_t(self, gr2, gr1=None): - """ - Forward pass, but in tensor format. - - Parameters - ---------- - gr1: single networkx graph - - Returns - ------- - K: the kernel matrix - x2 or y: the leaf variable(s) with requires_grad enabled. - This allows future Jacobian-vector product to be efficiently computed. 
- """ - if self.undirected: - gr2 = transform_to_undirected(gr2) - - # Convert into GraKel compatible graph format - if self.base_type == "edge": - gr2 = graph_from_networkx(gr2, self.node_label, self.edge_label) - else: - gr2 = graph_from_networkx(gr2, self.node_label) - - if gr1 is None: - gr1 = self._train_transformed - else: - if self.undirected: - gr1 = transform_to_undirected(gr1) - if self.base_type == "edge": - gr1 = graph_from_networkx(gr1, self.node_label, self.edge_label) - else: - gr1 = graph_from_networkx(gr1, self.node_label) - - x_ = torch.tensor( - np.concatenate(self.kern.transform(gr1, return_embedding_only=True), axis=1) - ) - y_ = torch.tensor( - np.concatenate(self.kern.transform(gr2, return_embedding_only=True), axis=1) + normalize=True, ) - # Note that the vector length of the WL procedure is indeterminate, and thus dim(Y) != dim(X) in general. - # However, since the newly observed features in the test data is always concatenated at the end of the feature - # matrix, these features will not matter for the inference, and as such we can safely truncate the feature - # matrix for the test data so that only those appearing in both the training and testing datasets are included. - - x_.requires_grad_() - y_ = y_[:, : x_.shape[1]].requires_grad_() - K = calculate_kernel_matrix_as_tensor(x_, y_, oa=self.oa, se_kernel=self.se) - return K, y_, x_ - - def feature_map(self, flatten=True): - """ - Get the feature map in term of encoding (position in the feature index): the feature string. - Parameters - ---------- - flatten: whether flatten the dict (originally, the result is layered in term of h (the number of WL iterations). - - Returns - ------- - - """ - if not self.requires_grad: - logging.warning( - "Requires_grad flag is off -- in this case, there is risk that the element order in the " - "feature map DOES NOT correspond to the order in the feature matrix. To suppress this warning," - "when initialising the WL kernel, do WeisfilerLehman(requires_grad=True)" - ) - if self._gram is None: - return None - if not flatten: - return self.kern._label_node_attr - else: - res = {} - for _, map_ in self.kern._label_node_attr.items(): - for k, v in map_.items(): - res.update({k: v}) - return res - - def feature_value(self, X_s): - """Given a list of architectures X_s, compute their WL embedding of size N_s x D, where N_s is the length - of the list and D is the number of training set features. + # TODO: This could probably be lifted to the caller + K = self.wl_kernel_.fit_transform(gr) + K = torch.as_tensor(K, dtype=torch.float64) + self.layer_weights_ = self.wl_kernel_.layer_weights + return torch.as_tensor(K, dtype=torch.float64) - Returns: - embedding: torch.Tensor of shape N_s x D, described above - names: list of shape D, which has 1-to-1 correspondence to each element of the embedding matrix above - """ - if not self.requires_grad: - logging.warning( - "Requires_grad flag is off -- in this case, there is risk that the element order in the " - "feature map DOES NOT correspond to the order in the feature matrix. 
To suppress this warning," - "when initialising the WL kernel, do WeisfilerLehman(requires_grad=True)" - ) - feat_map = self.feature_map(flatten=False) - len_feat_map = [len(f) for f in feat_map.values()] - X_s = graph_from_networkx( - X_s, - self.node_label, - ) - embedding = self.kern.transform(X_s, return_embedding_only=True) - for j, em in enumerate(embedding): - # Remove some of the spurious features that pop up sometimes - embedding[j] = em[:, : len_feat_map[j]] + def transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + assert self.wl_kernel_ is not None - # Generate the final embedding - embedding = torch.tensor(np.concatenate(embedding, axis=1)) - return embedding, list(self.feature_map(flatten=True).values()) + K = self.wl_kernel_.transform(gr) + return torch.as_tensor(K, dtype=torch.float64) diff --git a/neps/optimizers/bayesian_optimization/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py index 82355ec5..ffc3606f 100644 --- a/neps/optimizers/bayesian_optimization/models/deepGP.py +++ b/neps/optimizers/bayesian_optimization/models/deepGP.py @@ -211,7 +211,7 @@ def encode_configs( # before inserting the one-hot encoding. offset = len(self.numericals) for hp_name, hp in self.categoricals.items(): - budget_tensor = torch.tensor( + cat_tensor = torch.tensor( [config[hp_name]._value_index for config in configs], # type: ignore device=self.device, dtype=torch.float64, @@ -219,7 +219,7 @@ def encode_configs( # .. and insert one-hot encoding (ChatGPT solution, verified locally) portion = x_buffer[:, offset : offset + len(hp.choices)] - portion.scatter_(1, budget_tensor.unsqueeze(1), 1) + portion.scatter_(1, cat_tensor.unsqueeze(1), 1) offset += len(hp.choices) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 6a878748..eef6b771 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,303 +1,267 @@ from __future__ import annotations import logging -from copy import deepcopy -from typing import Iterable, Literal, Sequence, Any - -import numpy as np -import contextlib import torch - -from neps.optimizers.bayesian_optimization.kernels.combine_kernels import ( - ProductKernel, - SumKernel, +import numpy as np +from typing import Literal, Sequence, Any, Mapping +from typing_extensions import Literal +from dataclasses import dataclass, field +from itertools import product + +from neps.optimizers.bayesian_optimization.kernels.kernel import ( + Kernel, + NumericKernel, + compute_normalized_log_marginal_likelihood, + compute_pd_inverse, ) -from neps.optimizers.bayesian_optimization.kernels.graph_kernel import GraphKernels -from neps.optimizers.bayesian_optimization.kernels.utils import extract_configs from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary -from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import WeisfilerLehman +from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import ( + WeisfilerLehman, +) +from neps.search_spaces.encoding import TensorEncodedConfigs from neps.search_spaces.search_space import SearchSpace logger = logging.getLogger(__name__) +f64 = torch.float64 -class ComprehensiveGP: - def __init__( - self, - space: SearchSpace, - graph_kernels: Iterable, - hp_kernels: Iterable, - initial_likelihood: float = 1e-3, - weights: Sequence[float] | torch.Tensor | None = None, - combined_kernel: Literal["sum", "product"] = "sum", - 
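# Editor's illustrative sketch, not part of this patch: the scatter_ trick used
# in encode_configs above to write a one-hot encoding of categorical value
# indices directly into a pre-allocated slice of the feature buffer.
# All sizes and indices below are made up.
import torch

n_configs, n_choices, offset = 4, 3, 2
x_buffer = torch.zeros(n_configs, offset + n_choices, dtype=torch.float64)

# Index of the chosen category for each config, e.g. taken from `_value_index`.
cat_tensor = torch.tensor([0, 2, 1, 2])

portion = x_buffer[:, offset : offset + n_choices]  # a view into the buffer
portion.scatter_(1, cat_tensor.unsqueeze(1), 1)     # in-place one-hot write

print(x_buffer)
# Each row now has a single 1 in the categorical block, at the chosen index.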
surrogate_model_fit_args: dict | None = None, - optimizer_kwargs: dict[str, Any] | None = None, - wl_subtree_candidates: Sequence[int] = (1, 2, 3, 4, 5), - wl_lengthscales: Sequence[float] = tuple(np.e**i for i in range(-2, 3)), - optimize_likelihood: bool = True, - optimizer: Literal["adam", "sgd"] = "adam", - optimizer_iters: int = 20, - max_likelihood: float = 0.01, - optimize_wl_layer_weights: bool = False, - ): - graph_kernels = list(graph_kernels) - hp_kernels = list(hp_kernels) - n_graph_kernels = len(graph_kernels) - n_vector_kernels = len(hp_kernels) - n_kernels = n_graph_kernels + n_vector_kernels - domain_kernels = [*graph_kernels, *hp_kernels] - - fixed_weights = weights is not None - if weights is not None: - if weights is not None: - assert len(weights) == n_kernels, ( - "the weights vector, if supplied, needs to have the same length as " - "the number of kernel_operators!" - ) - init_weights = torch.as_tensor(weights).flatten() - else: - uniform_weight = 1.0 / self.n_kernels - init_weights = torch.full((n_kernels,), uniform_weight, dtype=torch.float64) - if combined_kernel == "product": - _combined_kernel = ProductKernel(*domain_kernels, weights=weights) - elif combined_kernel == "sum": - _combined_kernel = SumKernel(*domain_kernels, weights=weights) - else: - raise NotImplementedError( - f'Combining kernel {combined_kernel} is not yet implemented! Only "sum" ' - f'or "product" are currently supported. ' - ) +GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)], dtype=f64) +GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) + + +def _default_param_grid() -> dict[type[Kernel], list[dict[str, Any]]]: + return { + WeisfilerLehman: [ + {"h": h, "se_kernel": Stationary(lengthscale=l)} + for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) + ] + } - # TODO: Clone only needed while it can act like configurations - self.space = space.clone() - self.init_weights = init_weights - self.fixed_weights = fixed_weights - self.combined_kernel = _combined_kernel - self.initial_likelihood = initial_likelihood - self.surrogate_model_fit_args = surrogate_model_fit_args or {} - self.domain_kernels: list = [*graph_kernels, *hp_kernels] - self.n_kernels: int = len(self.domain_kernels) - self.n_graph_kernels: int = len(graph_kernels) - self.n_vector_kernels: int = len(hp_kernels) - self.optimizer_kwargs = optimizer_kwargs or {"lr": 0.1} - self.optimize_likelihood = optimize_likelihood - self.optimize_wl_layer_weights = optimize_wl_layer_weights - self.optimizer = optimizer - self.optimizer_iters = optimizer_iters - self.max_likelihood = max_likelihood - self.wl_subtree_candidates = wl_subtree_candidates - self.wl_lengthscales = wl_lengthscales - - # Cache the Gram matrix inverse and its log-determinant - self.K_ = None - self.K_i_ = None - self.logDetK_ = None - self.theta_vector_ = None - self.layer_weights_ = None - self.nlml_ = None - self.likelihood_: float | None = None - self.weights_: torch.Tensor | None = None - self.x_configs_: list[SearchSpace] | None = None - self.y_: torch.Tensor | None = None - self.y_normalized_: torch.Tensor | None = None - self.y_mean_: float | None = None - self.y_std_: float | None = None - self.n_: int | None = None - - def fit(self, train_x: list[SearchSpace], train_y: list[float]) -> None: - """Called by self.fit""" - self.x_configs = train_x - self.n_ = len(train_x) - self.y_ = torch.as_tensor(train_y, dtype=torch.float64) + +@dataclass +class ComprehensiveGP: + space: SearchSpace + kernels: Sequence[tuple[Kernel, Sequence[str]]] + 
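# Editor's sketch, not part of this patch: how the default Weisfeiler-Lehman
# hyperparameter grid above expands into concrete candidates, pairing every
# subtree depth h with every lengthscale of the stationary embedding kernel.
# The grid entries are simplified here (plain floats instead of Stationary kernels).
from itertools import product

import numpy as np

subtree_candidates = (1, 2, 3, 4, 5)
lengthscales = [np.e**i for i in range(-2, 3)]

grid = [{"h": h, "lengthscale": l} for h, l in product(subtree_candidates, lengthscales)]
print(len(grid))  # 25 candidates: 5 depths x 5 lengthscales
print(grid[0])    # {'h': 1, 'lengthscale': 0.1353...}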
combined_kernel: Literal["sum", "product"] = "sum" + initial_likelihood: float = 1e-3 + optimize_likelihood: bool = True + max_likelihood: float = 0.01 + optimizer: Literal["adam", "sgd"] = "adam" + optimizer_iters: int = 20 + optimize_wl_layer_weights: bool = False + surrogate_model_fit_args: Mapping[str, Any] = field(default_factory=dict) + optimizer_kwargs: Mapping[str, Any] = field(default_factory=lambda: {"lr": 0.1}) + kernel_hp_grids: Mapping[type[Kernel], Sequence[Mapping[str, Any]]] = field( + default_factory=_default_param_grid + ) + + # Post fit attributes + K_i_: torch.Tensor | None = None + n_train_: int | None = None + likelihood_: float | None = None + y_: torch.Tensor | None = None + y_normalized_: torch.Tensor | None = None + y_mean_: float | None = None + y_std_: float | None = None + optimized_kernels_: ( + list[tuple[NumericKernel | WeisfilerLehman, Sequence[str]]] | None + ) = None + kernel_weights_: torch.Tensor | None = None + + def __post_init__(self): + # TODO: Remove when search space is just definition and does not hold values. + self.space = self.space.clone() + + def fit(self, x: TensorEncodedConfigs, train_y: torch.Tensor) -> None: + # Preprocessing + y_ = torch.as_tensor(train_y, dtype=f64) # TODO: Dunno if I like this silent hack, setting std to 1 if no std - self.y_std_ = s if (s := torch.std(self.y_).item()) != 0 else 1 - self.y_mean_ = torch.mean(self.y_).item() - self.y_normalized_ = (self.y_ - self.y_mean_) / self.y_std_ - - # The Gram matrix of the training data - self.K_i_, self.logDetK_ = None, None - - if len(self.wl_subtree_candidates) > 0: - graphs, _ = extract_configs(self.x_configs) - graph_kernels = [ - k for k in self.domain_kernels if isinstance(k, GraphKernels) - ] - for i, kernel in enumerate(graph_kernels): - if not isinstance(kernel, WeisfilerLehman): - logger.warning(f"No kernel opt. for {type(kernel).__name__}.") - continue - - _xs = ( - [x[i] for x in graphs] - if isinstance(graphs[0], list) - else [x for x in graphs] - ) - _grid_search_wl_kernel( - kernel=kernel, - subtree_candidates=self.wl_subtree_candidates, - train_x=_xs, - train_y=self.y_, - likelihood=self.initial_likelihood, - lengthscales=self.wl_lengthscales, - ) - - weights = self.init_weights.clone() - - if not self.fixed_weights and self.n_kernels > 1: - weights.requires_grad_(True) - - n_cat = len(self.space.categoricals) - n_num = len(self.space.numerical) - theta_categorical = torch.ones( - n_cat, requires_grad=n_cat > 1, dtype=torch.float64 - ) - theta_numerical = torch.ones(n_num, requires_grad=n_num > 1, dtype=torch.float64) + self.y_std_ = s if (s := torch.std(y_).item()) != 0 else 1 + self.y_mean_ = torch.mean(y_).item() + self.y_normalized_ = (y_ - self.y_mean_) / self.y_std_ + + optimized_kernels: list[ + tuple[NumericKernel | WeisfilerLehman, Sequence[str]] + ] = [] + _grids = self.kernel_hp_grids + + def _eval_kernel(_K: torch.Tensor) -> float: + assert y_ is not None + K_i, logDetK = compute_pd_inverse(_K) + nlml = -compute_normalized_log_marginal_likelihood(K_i, logDetK, y_) + return float(nlml) + + for kernel, hps in self.kernels: + if isinstance(kernel, WeisfilerLehman): + assert len(hps) == 1, "Only support single kernel per graph." 
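# Editor's sketch, not part of this patch: combining per-kernel Gram matrices
# the way the fit loop below does -- start from zeros for a "sum" combination
# (ones for "product"), accumulate the weighted Grams in place, then normalise
# so the diagonal becomes 1. The Gram matrices here are simple placeholders.
import torch

f64 = torch.float64
N = 4
grams = [torch.eye(N, dtype=f64) + 0.1 * i for i in range(2)]  # stand-ins for kernel outputs
weights = torch.tensor([0.7, 0.3], dtype=f64)
combined = "sum"

K = torch.zeros(N, N, dtype=f64) if combined == "sum" else torch.ones(N, N, dtype=f64)
for gram, w in zip(grams, weights):
    if combined == "sum":
        K.add_(w * gram)
    else:
        K.mul_(w * gram)

K_diag = torch.sqrt(torch.diag(K))
K = K / torch.ger(K_diag, K_diag)  # unit diagonal, as in the normalisation step
print(torch.diag(K))               # all ones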
+ _xs = x.wl_graph_input(hps[0]) + elif isinstance(kernel, NumericKernel): + _xs = x.tensor(hps) + else: + raise ValueError(f"Unsupported kernel type {type(kernel)}") + + grid = next((g for t, g in _grids.items() if isinstance(kernel, t)), None) + if grid is None: + optimized_kernel = kernel.clone() + _ = optimized_kernel.fit_transform(_xs) # type: ignore + optimized_kernels.append((kernel, hps)) + continue + + optimized_kernel, _ = kernel.grid_search( + x=_xs, # type: ignore + grid=grid, + to_minimize=_eval_kernel, + ) + optimized_kernels.append((optimized_kernel, hps)) - theta_vectors = { - "categorical": theta_categorical, - "continuous": theta_numerical, # NOTE: This actually includes integers too -_- - } + # Optimization weights likelihood = torch.tensor( - self.initial_likelihood, requires_grad=self.optimize_likelihood + self.initial_likelihood, + requires_grad=self.optimize_likelihood, ) - layer_weights = None - if self.optimize_wl_layer_weights: - for kernel in self.domain_kernels: - if isinstance(kernel, WeisfilerLehman) and kernel.h != 0: - layer_weights = torch.ones(kernel.h + 1, requires_grad=True) - break + kernel_weights = torch.ones( + len(optimized_kernels), + requires_grad=len(optimized_kernels) > 1, + dtype=f64, + ) + should_optimize = lambda p: p.is_leaf and p.requires_grad # Linking the optimizer variables to the sum kernel - optim_vars = [ + optim_vars: list[torch.Tensor] = [ a - for a in ( - weights, - likelihood, - layer_weights, - theta_categorical, - theta_numerical, - ) - if a is not None and a.is_leaf and a.requires_grad + for a in (kernel_weights, likelihood) + if a is not None and should_optimize(a) + ] + layer_weights = [ + kernel.layer_weights_ + for kernel, _ in optimized_kernels + if isinstance(kernel, WeisfilerLehman) + and kernel.layer_weights_ is not None + and should_optimize(kernel.layer_weights_) + ] + lengthscales = [ + kernel.layer_weights_ + for kernel, _ in optimized_kernels + if isinstance(kernel, NumericKernel) and should_optimize(kernel.lengthscale) + ] + lengthscalebounds = [ + kernel.lengthscale_bounds + for kernel, _ in optimized_kernels + if isinstance(kernel, NumericKernel) and should_optimize(kernel.lengthscale) ] - nlml = None - if len(optim_vars) == 0: # Skip optimisation - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - feature_lengthscale=theta_vectors, - layer_weights=layer_weights, - rebuild_model=True, - ) - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + # Select the optimizer + if self.optimizer == "adam": + optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore + elif self.optimizer == "sgd": + optim = torch.optim.SGD(optim_vars, **self.optimizer_kwargs) # type: ignore else: - # Select the optimizer - if self.optimizer == "adam": - optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore - elif self.optimizer == "sgd": - optim = torch.optim.SGD(optim_vars, **self.optimizer_kwargs) # type: ignore - else: - raise ValueError(f"Invalid optimizer {self.optimizer}") - - K: torch.Tensor | None = None - for i in range(self.optimizer_iters): - optim.zero_grad() - K = self.combined_kernel.fit_transform( - weights=weights, - configs=train_x, # TODO - feature_lengthscale=theta_vectors, - layer_weights=layer_weights, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) - nlml = -compute_log_marginal_likelihood( - K_i, logDetK, y=self.y_normalized_ - ) - nlml.backward() - if i % 10 == 0: - logger.debug( - 
f"Iteration: {i}/{self.optimizer_iters} " - f"Negative log-marginal likelihood:" - f"{nlml.item()} {theta_vectors} {weights} {likelihood}" - ) - - optim.step() # TODO - - with torch.no_grad(): - if weights.is_leaf: - weights.clamp_(0.0, 1.0) - - theta_vectors = self.combined_kernel.clamp_theta_vector(theta_vectors) - - if likelihood.is_leaf: - likelihood.clamp_(1e-5, self.max_likelihood) - - if layer_weights is not None and layer_weights.is_leaf: - layer_weights.clamp_(0.0, 1.0) - - optim.zero_grad(set_to_none=True) - - assert K is not None + raise ValueError(f"Invalid optimizer {self.optimizer}") + + K: torch.Tensor | None = None + N = len(x) + for _ in range(self.optimizer_iters): + optim.zero_grad() + + # Now we iterate over kernels to build up K + _init = torch.zeros if self.combined_kernel == "sum" else torch.ones + K = _init(N, N, dtype=f64) + for (kernel, hps), weight in zip(self.kernels, kernel_weights): + if isinstance(kernel, WeisfilerLehman): + assert len(hps) == 1, "Only support single kernel per graph." + _xs = x.wl_graph_input(hps[0]) + gram = kernel.fit_transform(_xs) + elif isinstance(kernel, NumericKernel): + _xs = x.tensor(hps) + gram = kernel.fit_transform(_xs) + else: + raise ValueError(f"Unsupported kernel type {type(kernel)}") + + if self.combined_kernel == "sum": + K.add_(weight * gram) + elif self.combined_kernel == "product": + K.mul_(weight * gram) + else: + raise ValueError(f"Invalid combined_kernel {self.combined_kernel}") + + # Normalize + K_diag = torch.sqrt(torch.diag(K)) + K /= torch.ger(K_diag, K_diag) K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + # If there's nothing to optimize, break out early + if len(optim_vars) == 0: + break + + nlml = -compute_normalized_log_marginal_likelihood( + K_i, logDetK, y=self.y_normalized_ + ) + nlml.backward() + optim.step() + + with torch.no_grad(): + kernel_weights.clamp_(0.0, 1.0) + if likelihood.is_leaf: + likelihood.clamp_(1e-9, self.max_likelihood) + + for ls, ls_bounds in zip(lengthscales, lengthscalebounds): + ls.clamp_(*ls_bounds) + + for lw in layer_weights: + lw.clamp_(0.0, 1.0) + + optim.zero_grad() + + assert K is not None + K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + # Apply the optimal hyperparameters - self.weights_ = weights.clone() / torch.sum(weights) self.K_i_ = K_i.clone() - self.K_ = K.clone() - self.logDetK_ = logDetK.clone() self.likelihood_ = likelihood.item() - self.theta_vector_ = theta_vectors - self.layer_weights_ = layer_weights - self.nlml_ = nlml.detach().cpu() if nlml is not None else None - - for kernel in self.combined_kernel.kernels: - if isinstance(kernel, Stationary): - kernel.update_hyperparameters(lengthscale=self.theta_vector_) - - logger.debug("Optimisation summary: ") - logger.debug(f"Optimal NLML: {nlml}") - logger.debug(f"Lengthscales: {theta_vectors}") - with contextlib.suppress(AttributeError): - logger.debug(f"Optimal h: {self.domain_kernels[0]._h}") - logger.debug(f"Weights: {self.weights_}") - logger.debug(f"Lik: {self.likelihood_}") - logger.debug(f"Optimal layer weights {layer_weights}") - - def predict(self, x_configs: list[SearchSpace]) -> tuple[torch.Tensor, torch.Tensor]: - """Kriging predictions""" - if not isinstance(x_configs, list): - x_configs = [x_configs] + self.optimized_kernels_ = optimized_kernels + self.kernel_weights_ = kernel_weights + self.n_train_ = N - if self.K_i_ is None or self.logDetK_ is None or self.weights_ is None: + def predict(self, x: TensorEncodedConfigs) -> tuple[torch.Tensor, torch.Tensor]: + """Kriging 
predictions""" + if self.K_i_ is None or self.n_train_ is None or self.kernel_weights_ is None: raise ValueError( "Inverse of Gram matrix is not instantiated. Please call the optimize " "function to fit on the training data first!" ) + _init = torch.zeros if self.combined_kernel == "sum" else torch.ones + N = self.n_train_ + len(x) + K = _init(N, N, dtype=f64) + for (kernel, hps), weight in zip(self.kernels, self.kernel_weights_): + if isinstance(kernel, WeisfilerLehman): + assert len(hps) == 1, "Only support single kernel per graph." + _x_test = x.wl_graph_input(hps[0]) + gram = kernel.transform(_x_test) + elif isinstance(kernel, NumericKernel): + _x_test = x.tensor(hps) + gram = kernel.fit_transform(_x_test) + else: + raise ValueError(f"Unsupported kernel type {type(kernel)}") + + if self.combined_kernel == "sum": + K.add_(weight * gram) + elif self.combined_kernel == "product": + K.mul_(weight * gram) + else: + raise ValueError(f"Invalid combined_kernel {self.combined_kernel}") + # Concatenate the full list - X_configs_all = self.x_configs + x_configs - n_train = len(self.x_configs) - n_test = len(x_configs) - - K_full = self.combined_kernel.fit_transform( - weights=self.weights_, - configs=X_configs_all, - layer_weights=self.layer_weights_, - feature_lengthscale=self.theta_vector_, - rebuild_model=True, - save_gram_matrix=False, - gp_fit=False, - ) + n_test = len(x) - K_s = K_full[:n_train:, n_train:] - K_ss = K_full[n_train:, n_train:] + self.likelihood_ * torch.eye(n_test) + K_s = K[: self.n_train_ :, self.n_train_ :] + K_ss = K[self.n_train_ :, self.n_train_ :] + self.likelihood_ * torch.eye(n_test) - mu_s = K_s.t() @ self.K_i_ @ self.y_ + mu_s = K_s.t() @ self.K_i_ @ self.y_normalized_ mu_s = mu_s * self.y_std_ + self.y_mean_ cov_s = K_ss - K_s.t() @ self.K_i_ @ K_s @@ -305,97 +269,3 @@ def predict(self, x_configs: list[SearchSpace]) -> tuple[torch.Tensor, torch.Ten cov_s = (torch.sqrt(cov_s) * self.y_std_) ** 2 return mu_s, cov_s - - -def _grid_search_wl_kernel( - kernel: WeisfilerLehman, - subtree_candidates, - train_x: list, - train_y: torch.Tensor, - likelihood: float, - lengthscales=None, -): - """Optimize the *discrete hyperparameters* of Weisfeiler Lehman kernel. 
- k: a Weisfeiler-Lehman kernel instance - hyperparameter_candidate: list of candidate hyperparameter to try - train_x: the train data - train_y: the train label - lik: likelihood - lengthscale: if using RBF kernel for successive embedding, the list of lengthscale to be grid searched over - """ - # lik = 1e-6 - assert len(train_x) == len(train_y) - best_nlml = torch.tensor(np.inf) - best_subtree_depth = None - best_lengthscale = None - best_K = None - if lengthscales is not None and kernel.se is not None: - candidates = [(h_, l_) for h_ in subtree_candidates for l_ in lengthscales] - else: - candidates = [(h_, None) for h_ in subtree_candidates] - - for i in candidates: - if kernel.se is not None: - kernel.change_se_params({"lengthscale": i[1]}) - - kernel.change_kernel_params({"h": i[0]}) - K = kernel.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - if nlml < best_nlml: - best_nlml = nlml - best_subtree_depth, best_lengthscale = i - best_K = torch.clone(K) - - kernel.change_kernel_params({"h": best_subtree_depth}) - if kernel.se is not None: - kernel.change_se_params({"lengthscale": best_lengthscale}) - kernel._gram = best_K - - -def compute_log_marginal_likelihood( - K_i: torch.Tensor, - logDetK: torch.Tensor, - y: torch.Tensor, - *, - normalize: bool = True, -) -> torch.Tensor: - """Compute the zero mean Gaussian process log marginal likelihood given the inverse of Gram matrix K(x2,x2), its - log determinant, and the training label vector y. - Option: - - normalize: normalize the log marginal likelihood by the length of the label vector, as per the gpytorch - routine. - """ - lml = ( - -0.5 * (y.t() @ K_i @ y) - + 0.5 * logDetK - - y.shape[0] / 2.0 * torch.log(2 * torch.tensor(np.pi)) - ) - return lml / y.shape[0] if normalize else lml - - -def compute_pd_inverse( - K: torch.Tensor, - *, - jitter: float | torch.Tensor = 1e-6, - attempts: int = 3, -) -> tuple[torch.Tensor, torch.Tensor]: - """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" - n = K.shape[0] - assert ( - isinstance(jitter, float) or jitter.ndim == 0 - ), "only homoscedastic noise variance is allowed here!" 
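# Editor's sketch, not part of this patch: the zero-mean GP log marginal
# likelihood used above,
#   lml = -1/2 y^T K^{-1} y + 1/2 log|K^{-1}| - n/2 log(2*pi),
# cross-checked against torch.distributions for a small random PD matrix.
import math

import torch

f64 = torch.float64
torch.manual_seed(0)
n = 4
A = torch.randn(n, n, dtype=f64)
K = A @ A.T + 1e-3 * torch.eye(n, dtype=f64)  # a positive definite Gram matrix
y = torch.randn(n, dtype=f64)

K_i = torch.linalg.inv(K)
logDetK = -torch.logdet(K)  # log|K^{-1}| = -log|K|
lml = -0.5 * (y @ K_i @ y) + 0.5 * logDetK - n / 2.0 * math.log(2 * math.pi)

reference = torch.distributions.MultivariateNormal(
    torch.zeros(n, dtype=f64), covariance_matrix=K
).log_prob(y)
print(lml.item(), reference.item())  # the two values agree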
- for i in range(attempts): - try: - jitter_diag = jitter * torch.eye(n, device=K.device) * 10**i - Kc = torch.linalg.cholesky(K + jitter_diag) - break - except RuntimeError: - pass - else: - raise RuntimeError(f"Gram matrix not positive definite despite of jitter:\n{K}") - - logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) - K_i = torch.cholesky_inverse(Kc) - return K_i.to(dtype=torch.float64), logDetK.to(dtype=torch.float64) diff --git a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py b/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py index a359b937..2c9993be 100644 --- a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py +++ b/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py @@ -637,7 +637,7 @@ def _fit( for k in self.combined_kernel.kernels: if isinstance(k, Stationary): - k.update_hyperparameters(lengthscale=torch.exp(theta_vector)) + k.update_lengthscales(lengthscale=torch.exp(theta_vector)) self.combined_kernel.weights = weights.clone() @@ -693,55 +693,6 @@ def predict(self, x_configs, preserve_comp_graph: bool = False): del combined_kernel_copy return mu_s, cov_s - def predict_single_hierarchy( - self, x_configs, hierarchy_id=0, preserve_comp_graph: bool = False - ): - """Kriging predictions""" - - if not isinstance(x_configs, list): - # Convert a single input X_s to a singleton list - x_configs = [x_configs] - - if self.K_i is None or self.logDetK is None: - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize function to " - "fit on the training data first!" - ) - - # Concatenate the full list - X_configs_all = self.x_configs + x_configs - - # Make a copy of the sum_kernels for this step, to avoid breaking the autodiff if grad guided mutation is used - if preserve_comp_graph: - combined_kernel_copy = deepcopy(self.combined_kernel) - else: - combined_kernel_copy = self.combined_kernel - - K_sub_full = combined_kernel_copy.fit_transform_single_hierarchy( - self.weights, - X_configs_all, - normalize=self.normalize_combined_kernel, - hierarchy_id=hierarchy_id, - feature_lengthscale=torch.exp(self.theta_vector), - layer_weights=self.layer_weights, - rebuild_model=True, - save_gram_matrix=False, - gp_fit=False, - ) - - K_s = K_sub_full[: self.n :, self.n :] - K_ss = K_sub_full[self.n :, self.n :] - mu_s = K_s.t() @ self.K_i @ self.y - cov_s_full = K_ss - K_s.t() @ self.K_i @ K_s - cov_s = torch.clamp(cov_s_full, self.likelihood, np.inf) - mu_s = unnormalize_y(mu_s, self.y_mean, self.y_std) - std_s = torch.sqrt(cov_s) - std_s = unnormalize_y(std_s, None, self.y_std, True) - cov_s = std_s**2 - if preserve_comp_graph: - del combined_kernel_copy - return mu_s, cov_s - @property def x(self): return self.x_configs @@ -759,115 +710,6 @@ def _reset_XY(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): # The Gram matrix of the training data self.K_i, self.logDetK = None, None - def dmu_dphi( - self, - X_s=None, - # compute_grad_var=False, - average_across_features=True, - average_across_occurrences=False, - ): - r""" - Compute the derivative of the GP posterior mean at the specified input location with respect to the - *vector embedding* of the graph (e.g., if using WL-subtree, this function computes the gradient wrt - each subtree pattern) - - The derivative is given by - $ - \frac{\partial \mu^*}{\partial \phi ^*} = \frac{\partial K(\phi, \phi^*)}{\partial \phi ^ *}K(\phi, \phi)^{-1} - \mathbf{y} - $ - - which derives directly from the GP posterior mean formula, and since the term $K(\phi, 
\phi)^{-1} and \mathbf{y} - are both independent of the testing points (X_s, or \phi^*}, the posterior gradient is simply the matrix - produce of the kernel gradient with the inverse Gram and the training label vector. - - Parameters - ---------- - X_s: The locations on which the GP posterior mean derivatives should be evaluated. If left blank, the - derivatives will be evaluated at the training points. - - compute_grad_var: bool. If true, also compute the gradient variance. - - The derivative of GP is also a GP, and thus the predictive distribution of the posterior gradient is Gaussian. - The posterior mean is given above, and the posterior variance is: - $ - \mathbb{V}[\frac{\partial f^*}{\partial \phi^*}]= \frac{\partial^2k(\phi^*, \phi^*)}{\partial \phi^*^2} - - \frac{\partial k(\phi^*, \Phi)}{\partial \phi^*}K(X, X)^{-1}\frac{\partial k{(\Phi, \phi^*)}}{\partial \phi^*} - $ - - Returns - ------- - list of K torch.Tensor of the shape N x2 D, where N is the length of the X_s list (each element of which is a - networkx graph), K is the number of kernel_operators in the combined kernel and D is the dimensionality of the - feature vector (this is determined by the specific graph kernel. - - OR - - list of K torch.Tensor of shape D, if averaged_over_samples flag is enabled. - """ - if self.K_i is None or self.logDetK is None: - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize " - "function to fit on the training data first!" - ) - if self.n_vector_kernels: - if X_s is not None: - V_s = self._get_vectorial_features(X_s, self.vectorial_feactures) - V_s, _, _ = standardize_x(V_s, self.x_features_min, self.x_features_max) - else: - V_s = self.x_features - X_s = self.x[:] - else: - V_s = None - X_s = X_s if X_s is not None else self.x[:] - - alpha = (self.K_i @ self.y).double().reshape(1, -1) - dmu_dphi = [] - # dmu_dphi_var = [] if compute_grad_var else None - - Ks_handles = [] - feature_matrix = [] - for j, x_s in enumerate(X_s): - jacob_vecs = [] - if V_s is None: - handles = self.combined_kernel.forward_t( - self.weights, - [x_s], - ) - else: - handles = self.combined_kernel.forward_t(self.weights, [x_s], V_s[j]) - Ks_handles.append(handles) - # Each handle is a 2-tuple. first element is the Gram matrix, second element is the leaf variable - feature_vectors = [] - for handle in handles: - k_s, y, _ = handle - # k_s is output, leaf is input, alpha is the K_i @ y term which is constant. - # When compute_grad_var is not required, computational graphs do not need to be saved. - jacob_vecs.append( - torch.autograd.grad( - outputs=k_s, inputs=y, grad_outputs=alpha, retain_graph=False - )[0] - ) - feature_vectors.append(y) - feature_matrix.append(feature_vectors) - jacob_vecs = torch.cat(jacob_vecs) - dmu_dphi.append(jacob_vecs) - - feature_matrix = torch.cat([f[0] for f in feature_matrix]) - if average_across_features: - dmu_dphi = torch.cat(dmu_dphi) - # compute the weighted average of the gradient across N_t. 
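# Editor's sketch, not part of this patch: the autograd pattern behind the
# removed dmu_dphi above -- since mu* = k(phi*, Phi) @ K^{-1} y, the gradient of
# the posterior mean w.r.t. the test embedding phi* is obtained by
# back-propagating through the cross-kernel with grad_outputs = K^{-1} y (alpha).
# A linear cross-kernel is assumed here so the result can be verified in closed form.
import torch

f64 = torch.float64
torch.manual_seed(0)
Phi = torch.randn(6, 3, dtype=f64)  # training feature embeddings
y = torch.randn(6, dtype=f64)
K = Phi @ Phi.T + 1e-2 * torch.eye(6, dtype=f64)
alpha = torch.linalg.solve(K, y)    # K^{-1} y

phi_star = torch.randn(1, 3, dtype=f64, requires_grad=True)
k_s = phi_star @ Phi.T              # linear cross-kernel, shape (1, 6)

(dmu_dphi,) = torch.autograd.grad(
    outputs=k_s, inputs=phi_star, grad_outputs=alpha.unsqueeze(0)
)
# For this linear kernel the gradient is alpha @ Phi, which we can verify:
print(torch.allclose(dmu_dphi, (alpha @ Phi).unsqueeze(0)))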
- # feature matrix is of shape N_t x K x D - avg_mu, avg_var, incidences = get_grad( - dmu_dphi, feature_matrix, average_across_occurrences - ) - return avg_mu, avg_var, incidences - return ( - dmu_dphi, - None, - feature_matrix.sum(dim=0) if average_across_occurrences else feature_matrix, - ) - def get_grad(grad_matrix, feature_matrix, average_occurrences=False): r""" @@ -982,7 +824,7 @@ def _grid_search_wl_kernel( k.change_kernel_params({"h": best_subtree_depth}) if k.se is not None: k.change_se_params({"lengthscale": best_lengthscale}) - k._gram = best_K + k.gram_ = best_K def get_theta_vector(vectorial_features): diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 9fc3aeae..2002aeab 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -25,7 +25,7 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping if TYPE_CHECKING: @@ -142,7 +142,7 @@ def __init__( self.sample_default_first = sample_default_first surrogate_model_args = surrogate_model_args or {} - graph_kernels, hp_kernels = get_kernels( + graph_kernels, hp_kernels = get_default_kernels( self.pipeline_space, domain_se_kernel, graph_kernels, diff --git a/neps/optimizers/multi_fidelity/dyhpo.py b/neps/optimizers/multi_fidelity/dyhpo.py index 59804637..bb4879b9 100755 --- a/neps/optimizers/multi_fidelity/dyhpo.py +++ b/neps/optimizers/multi_fidelity/dyhpo.py @@ -20,7 +20,7 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.multi_fidelity.mf_bo import FreezeThawModel, PFNSurrogate from neps.optimizers.multi_fidelity.utils import MFObservedData @@ -116,7 +116,7 @@ def __init__( ) # Preparing model - self.graph_kernels, self.hp_kernels = get_kernels( + self.graph_kernels, self.hp_kernels = get_default_kernels( pipeline_space=pipeline_space, domain_se_kernel=domain_se_kernel, graph_kernels=graph_kernels, diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index 9321633c..fceb44e9 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -22,7 +22,7 @@ from ..bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from ..bayesian_optimization.kernels.get_kernels import get_kernels +from ..bayesian_optimization.kernels.get_kernels import get_default_kernels from ..bayesian_optimization.models import SurrogateModelMapping from ..multi_fidelity_prior.utils import ( compute_config_dist, @@ -170,16 +170,13 @@ def sample( policy_idx = np.random.choice(range(len(prob_weights)), p=prob_weights) policy = sorted(self.policy_map.keys())[policy_idx] - self.logger.info( - f"Sampling from {policy} with weights (i, p, r)={prob_weights}" - ) + self.logger.info(f"Sampling from {policy} with weights (i, p, r)={prob_weights}") if policy == "prior": config = self.pipeline_space.sample( 
patience=self.patience, user_priors=True, ignore_fidelity=True ) elif policy == "inc": - if ( hasattr(self.pipeline_space, "has_prior") and self.pipeline_space.has_prior @@ -213,9 +210,7 @@ def sample( # the weight distributed across prior adnd inc _w_priors = 1 - self.policy_map["random"] # re-calculate normalized score ratio for prior-inc - w_prior = np.clip( - self.policy_map["prior"] / _w_priors, a_min=0, a_max=1 - ) + w_prior = np.clip(self.policy_map["prior"] / _w_priors, a_min=0, a_max=1) w_inc = np.clip(self.policy_map["inc"] / _w_priors, a_min=0, a_max=1) # calculating difference of prior and inc score score_diff = np.abs(w_prior - w_inc) @@ -288,7 +283,7 @@ def __init__( surrogate_model_args = surrogate_model_args or {} - graph_kernels, hp_kernels = get_kernels( + graph_kernels, hp_kernels = get_default_kernels( pipeline_space=pipeline_space, domain_se_kernel=domain_se_kernel, graph_kernels=graph_kernels, @@ -302,9 +297,9 @@ def __init__( if not surrogate_model_args["hp_kernels"]: raise ValueError("No kernels are provided!") if "vectorial_features" not in surrogate_model_args: - surrogate_model_args[ - "vectorial_features" - ] = pipeline_space.get_vectorial_dim() + surrogate_model_args["vectorial_features"] = ( + pipeline_space.get_vectorial_dim() + ) self.surrogate_model = instance_from_map( SurrogateModelMapping, @@ -439,7 +434,7 @@ def __init__( surrogate_model_args = surrogate_model_args or {} - graph_kernels, hp_kernels = get_kernels( + graph_kernels, hp_kernels = get_default_kernels( pipeline_space=pipeline_space, domain_se_kernel=domain_se_kernel, graph_kernels=graph_kernels, @@ -453,9 +448,9 @@ def __init__( if not surrogate_model_args["hp_kernels"]: raise ValueError("No kernels are provided!") if "vectorial_features" not in surrogate_model_args: - surrogate_model_args[ - "vectorial_features" - ] = pipeline_space.get_vectorial_dim() + surrogate_model_args["vectorial_features"] = ( + pipeline_space.get_vectorial_dim() + ) self.surrogate_model = instance_from_map( SurrogateModelMapping, diff --git a/neps/search_spaces/__init__.py b/neps/search_spaces/__init__.py index 7eb4332a..8a289100 100644 --- a/neps/search_spaces/__init__.py +++ b/neps/search_spaces/__init__.py @@ -1,5 +1,6 @@ from neps.search_spaces.architecture.api import ArchitectureParameter, FunctionParameter from neps.search_spaces.architecture.graph_grammar import ( + CoreGraphGrammar, GraphGrammar, GraphGrammarCell, GraphGrammarRepetitive, @@ -23,6 +24,7 @@ "ArchitectureParameter", "CategoricalParameter", "ConstantParameter", + "CoreGraphGrammar", "FloatParameter", "FunctionParameter", "GraphGrammar", diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py new file mode 100644 index 00000000..1ab1f92a --- /dev/null +++ b/neps/search_spaces/encoding.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +from collections.abc import Sized + +from dataclasses import dataclass +from grakel.utils import graph_from_networkx + +from typing import Sequence, Iterable, TypeAlias +from typing_extensions import Self +from more_itertools import split_when +from itertools import chain +import torch + +from neps.search_spaces.search_space import SearchSpace + +WLInput: TypeAlias = tuple[dict, dict | None, dict | None] + + +@dataclass +class TensorEncodedConfigs(Sized): + _tensor_pack: torch.Tensor | None + """Layout such that _tensor_pack[0] is the first config. + + In the case that there are no numeric/categorical hyperparameters, + this is None. + + index config_row_id | fidelities... 
| numericals... | one_hot_categoricals... + 0 + 1 + 2 + ... + + NOTE: A slight memory innefficiency here is that we store the one-hot encoded + as a float tensor, rather than a byte tensor. This makes joint numerical/categorical + kernels more efficient, as well as entire config row access at the cost of memory. + This should not be a problem if we do not have a large number of categorical + hyperparameters with a high number of choices. + """ + _graphs: dict[str, Sequence[WLInput]] + _col_lookup: dict[str, tuple[int, int]] # range(inclusive, exclusive) + + def __len__(self) -> int: + return self._tensor_pack.shape[0] if self._tensor_pack is not None else 0 + + def wl_graph_input(self, hp: str) -> Sequence[WLInput]: + return self._graphs[hp] + + def tensor(self, hps: Iterable[str]) -> torch.Tensor: + if self._tensor_pack is None: + raise ValueError("No numerical/categorical hyperparameters were encoded.") + + cols: list[tuple[int, int]] = [] + for hp in hps: + _cols = self._col_lookup.get(hp) + if _cols is None: + raise ValueError(f"Hyperparameter {hp} not found in the lookup table.") + cols.append(_cols) + + # OPTIM: This code with `split_when` and `chunks` makes sure to grab + # consecutive chunks of memory where possible. For example, + # if we want all categoricals, this will just return the entire + # categorical tensor, rather than subselecting each part and then concatenating. + # Also works for numericals. + sorted_indices = sorted(cols) + non_consecutive_tuple = lambda x, y: x[1] != y[0] + chunks = list(split_when(sorted_indices, non_consecutive_tuple)) + slices = [slice(chunk[0][0], chunk[-1][1]) for chunk in chunks] + tensors = [self._tensor_pack[:, s] for s in slices] + + if len(tensors) == 1: + return tensors[0].clone() + + return torch.cat(tensors, dim=1) + + @classmethod + def encode( + cls, + space: SearchSpace, + configs: list[SearchSpace], + *, + node_label: str = "op_name", + device: torch.device, + ) -> Self: + assert node_label == "op_name", "Only 'op_name' is supported for node_label" + + _graphs: dict[str, Sequence[WLInput]] = {} + for hp_name in space.graphs.keys(): + gs = [conf.graphs[hp_name].value for conf in configs] + if ( + len(gs) > 0 + and isinstance(gs[0], list) + and len(gs[0]) > 0 + and isinstance(gs[0][0], list) + ): + gs = [_list for list_of_list in gs for _list in list_of_list] + _graphs[hp_name] = graph_from_networkx(gs) # type: ignore + + _lookup: dict[str, tuple[int, int]] = {} + + n_fids = len(space.fidelities) + n_nums = len(space.numerical) + n_cats = sum(len(hp.choices) for hp in space.categoricals.values()) + + width = n_fids + n_nums + n_cats + if width == 0: + return cls(_graphs=_graphs, _tensor_pack=None, _col_lookup={}) + + _tensor_pack = torch.empty(size=(len(configs), width), dtype=torch.float64) + + offset = 0 + for hp_name in chain(space.fidelities, space.numerical): + _lookup[hp_name] = (offset, offset + 1) + _xs = [config.fidelities[hp_name].normalized_value for config in configs] + values = torch.tensor(_xs, torch.float64, device=device) + + _tensor_pack[:, offset] = values + + offset += 1 + + for hp_name, cat in space.categoricals.items(): + n_choices = len(cat.choices) + _lookup[hp_name] = (offset, offset + n_choices) + + # .. 
and insert one-hot encoding (ChatGPT solution, verified locally) + _xs = [config[hp_name].normalized_value for config in configs] + cat_tensor = torch.tensor(_xs, torch.float64, device=device).unsqueeze(1) + + _tensor_pack[:, offset : offset + n_choices].scatter_(1, cat_tensor, 1) + + offset += n_choices + + return cls(_graphs=_graphs, _tensor_pack=_tensor_pack, _col_lookup=_lookup) From 0f235ee39a47739dbed20e8761e8d4ff2de1020b Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 15 Aug 2024 18:57:23 +0200 Subject: [PATCH 07/63] fix: typo --- .../kernels/grakel_replace/vertex_histogram.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py index 103818ae..e59b5433 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py @@ -262,7 +262,7 @@ def _calculate_kernel_matrix(self, Y=None): K[j, i] = K[i, j] else: if self.se_kernel is not None: - K = self.se_kernel._forwardd(self.X, self.X) + K = self.se_kernel._forward(self.X, self.X) else: K = self.X @ self.X.T else: @@ -275,7 +275,7 @@ def _calculate_kernel_matrix(self, Y=None): ) else: if self.se_kernel is not None: - K = self.se_kernel._forwardd(self.X, Y) + K = self.se_kernel._forward(self.X, Y) else: K = Y[:, : self.X.shape[1]] @ self.X.T From 83331d9db60b313b0f9a5ef23adffea189d11085 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 15 Aug 2024 19:02:55 +0200 Subject: [PATCH 08/63] fix: unscaled distance --- .../bayesian_optimization/kernels/vectorial_kernels.py | 2 +- neps/optimizers/bayesian_optimization/models/gp.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index 8e7a1074..b25446d4 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -89,7 +89,7 @@ def _unscaled_square_distance( """The unscaled distance between X and X2.""" assert X.ndim == 2 X1sq = torch.sum(X**2, 1) - X2sq = X1sq if X is X2 else torch.sum(X**2, 1) + X2sq = X1sq if (X2 is None or X is X2) else torch.sum(X2**2, 1) X2 = X if X2 is None else X2 r2 = -2 * X @ X2.T + X1sq[:, None] + X2sq[None, :] diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index eef6b771..8173aaac 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -253,13 +253,9 @@ def predict(self, x: TensorEncodedConfigs) -> tuple[torch.Tensor, torch.Tensor]: elif self.combined_kernel == "product": K.mul_(weight * gram) else: - raise ValueError(f"Invalid combined_kernel {self.combined_kernel}") - - # Concatenate the full list - n_test = len(x) K_s = K[: self.n_train_ :, self.n_train_ :] - K_ss = K[self.n_train_ :, self.n_train_ :] + self.likelihood_ * torch.eye(n_test) + K_ss = K[self.n_train_ :, self.n_train_ :] + self.likelihood_ * torch.eye(len(x)) mu_s = K_s.t() @ self.K_i_ @ self.y_normalized_ mu_s = mu_s * self.y_std_ + self.y_mean_ From a3a230d6eaaca1b8324364015f2dfd043940b578 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Sun, 18 Aug 2024 17:07:43 +0200 Subject: [PATCH 09/63] fix: GP Fixed up --- 
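
As a standalone check of the `_unscaled_square_distance` fix in PATCH 08/63 above: the second squared-norm term has to come from X2, because the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x^T y only lets the two norm terms coincide when X2 is X itself (or omitted). The snippet below is illustrative only, uses random data, and is not part of the patch:

import torch

X = torch.randn(5, 3, dtype=torch.float64)
X2 = torch.randn(7, 3, dtype=torch.float64)

X1sq = torch.sum(X**2, dim=1)
X2sq = torch.sum(X2**2, dim=1)  # corrected term; torch.sum(X**2, 1) is only valid when X2 is X

# ||x - y||^2 expanded, clamped at zero to guard against negative round-off
r2 = (-2 * X @ X2.T + X1sq[:, None] + X2sq[None, :]).clamp_min(0.0)

assert torch.allclose(r2.sqrt(), torch.cdist(X, X2, p=2))
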
.../bayesian_optimization/cost_cooling.py | 1 - .../bayesian_optimization/kernels/__init__.py | 6 - .../kernels/combine_kernels_hierarchy.py | 201 ---- .../kernels/get_kernels.py | 58 -- .../kernels/grakel_replace/edge_histogram.py | 12 +- .../bayesian_optimization/kernels/kernel.py | 159 ++- .../kernels/vectorial_kernels.py | 187 ++-- .../kernels/weisfilerlehman.py | 29 +- .../bayesian_optimization/models/__init__.py | 4 +- .../bayesian_optimization/models/deepGP.py | 28 +- .../bayesian_optimization/models/gp.py | 379 +++---- .../models/gp_hierarchy.py | 957 ------------------ .../bayesian_optimization/optimizer.py | 60 +- neps/optimizers/multi_fidelity/dyhpo.py | 1 - .../multi_fidelity/sampling_policy.py | 23 - neps/search_spaces/encoding.py | 430 ++++++-- neps_examples/basic_usage/hyperparameters.py | 2 +- pyproject.toml | 1 - 18 files changed, 765 insertions(+), 1773 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/get_kernels.py delete mode 100644 neps/optimizers/bayesian_optimization/models/gp_hierarchy.py diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py index 0d77fbc6..5a8926c7 100644 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/cost_cooling.py @@ -23,7 +23,6 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization diff --git a/neps/optimizers/bayesian_optimization/kernels/__init__.py b/neps/optimizers/bayesian_optimization/kernels/__init__.py index 44c8e0ac..6ab80672 100644 --- a/neps/optimizers/bayesian_optimization/kernels/__init__.py +++ b/neps/optimizers/bayesian_optimization/kernels/__init__.py @@ -1,13 +1,8 @@ from __future__ import annotations -from dataclasses import dataclass from functools import partial from typing import Callable -from typing_extensions import TypeAlias -from neps.optimizers.bayesian_optimization.kernels.graph_kernel import GraphKernels -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary -from .encoding import NASBOTDistance from .vectorial_kernels import HammingKernel, Matern32Kernel, Matern52Kernel, RBFKernel from .weisfilerlehman import WeisfilerLehman @@ -29,5 +24,4 @@ h=0, oa=False, ), - "nasbot": NASBOTDistance, } diff --git a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py b/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py deleted file mode 100644 index 086cfc03..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/combine_kernels_hierarchy.py +++ /dev/null @@ -1,201 +0,0 @@ -import logging - -import numpy as np -import torch - -from .utils import extract_configs_hierarchy -from .vectorial_kernels import HammingKernel, Stationary -from .weisfilerlehman import GraphKernels - - -# normalise weights in front of additive kernels -def transform_weights(weights): - return torch.exp(weights) / torch.sum(torch.exp(weights)) - - -def _select_dimensions(k): - if isinstance(k, HammingKernel): - return "categorical" - return "continuous" - - -class CombineKernel: - def __init__( - self, - combined_by="sum", - 
*kernels: list, - **kwargs, - ): - if combined_by not in ["sum", "product"]: - raise ValueError(f"Invalid value for combined_by ({combined_by})") - - self.has_graph_kernels = False - self.has_vector_kernels = False - self.hierarchy_consider = kwargs["hierarchy_consider"] - self.d_graph_features = kwargs["d_graph_features"] - # if use global graph features of the final architecture graph, prepare for normalising - # them based on training data - if self.d_graph_features > 0: - self.train_graph_feature_mean = None - self.train_graph_feature_std = None - - self.lengthscale_bounds = (None, None) - for k in kernels: - if isinstance(k, GraphKernels): - self.has_graph_kernels = True - if not isinstance(k, GraphKernels): - self.has_vector_kernels = True - self.lengthscale_bounds = k.lengthscale_bounds - self.kernels = kernels - # Store the training graphs and vector features.. - self._gram = None - self.gr, self.x = None, None - self.combined_by = combined_by - - def fit_transform( - self, - weights: torch.Tensor, - configs: list, - normalize: bool = True, - rebuild_model: bool = True, - save_gram_matrix: bool = True, - gp_fit: bool = True, - feature_lengthscale: list = None, - **kwargs, - ): - weights = transform_weights(weights.clone()) - N = len(configs) - K = torch.zeros(N, N) if self.combined_by == "sum" else torch.ones(N, N) - - gr1, x1 = extract_configs_hierarchy( - configs, - d_graph_features=self.d_graph_features, - hierarchy_consider=self.hierarchy_consider, - ) - - # normalise the global graph features if we plan to use them - if self.d_graph_features > 0: - if gp_fit: - # compute the mean and std based on training data - self.train_graph_feature_mean = np.mean(x1, 0) - self.train_graph_feature_std = np.std(x1, 0) - x1 = (x1 - self.train_graph_feature_mean) / self.train_graph_feature_std - # k_values = [] # for debug - # k_features = [] # for debug - for i, k in enumerate(self.kernels): - if isinstance(k, GraphKernels) and None not in gr1: - if len(gr1) == N and self.hierarchy_consider is None: - # only the final graph is used - k_i = k.fit_transform( - [g[i] for g in gr1] if isinstance(gr1[0], (list, tuple)) else gr1, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - gp_fit=gp_fit, - **kwargs, - ) - if normalize: - K_i_diag = torch.sqrt(torch.diag(k_i)) - k_i /= torch.ger(K_i_diag, K_i_diag) - update_val = weights[i] * k_i - - else: - # graphs in the early hierarchies are also used; - # assume the combined kernel list always start with graph kernels i.e. kernels=[graph kernels, hp kernels] - gr1_i = gr1[i] - k_i = k.fit_transform( - [g[i] for g in gr1_i] - if isinstance(gr1_i[0], (list, tuple)) - else gr1_i, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - gp_fit=gp_fit, - **kwargs, - ) - if normalize: - K_i_diag = torch.sqrt(torch.diag(k_i)) - k_i /= torch.ger(K_i_diag, K_i_diag) - - update_val = weights[i] * k_i - # k_features.append([value.X.shape[1] for key, value in k.kern.X.items()]) - - elif isinstance(k, Stationary) and None not in x1: - k_i = k.fit_transform( - x1, - rebuild_model=rebuild_model, - save_gram_matrix=save_gram_matrix, - l=feature_lengthscale, - ) - update_val = (weights[i] * k_i).double() - else: - raise NotImplementedError( - " For now, only the Stationary custom built kernel_operators are supported!" 
- ) - - # k_values.append(k_i) # for debug - - if self.combined_by == "sum": - K += update_val - elif self.combined_by == "product": - K *= update_val - - # self.k_values = k_values # for debug - # self.k_features = k_features # for debug - # self.weights_trans = weights # for debug - # if not normalize: - # K_diag = torch.sqrt(torch.diag(K)) - # K /= torch.ger(K_diag, K_diag) - - if save_gram_matrix: - self._gram = K.clone() - - return K - - def fit_transform_single_hierarchy( - self, - weights: torch.Tensor, - configs: list, - hierarchy_id: int, - normalize: bool = True, - rebuild_model: bool = True, - gp_fit: bool = True, - **kwargs, - ): - weights = transform_weights(weights.clone()) - # N = len(configs) - # K = torch.zeros(N, N) if self.combined_by == "sum" else torch.ones(N, N) - - gr1, _ = extract_configs_hierarchy( - configs, - d_graph_features=self.d_graph_features, - hierarchy_consider=self.hierarchy_consider, - ) - # get the corresponding graph kernel and hierarchy graph data - graph_kernel_list = [k for k in self.kernels if isinstance(k, GraphKernels)] - # first graph kernel is on the final architecture graph - k_single_hierarchy = graph_kernel_list[int(hierarchy_id + 1)] - gr1_single_hierarchy = gr1[int(hierarchy_id + 1)] - weight_single_hierarchy = weights[int(hierarchy_id + 1)] - k_raw = k_single_hierarchy.fit_transform( - gr1_single_hierarchy, - rebuild_model=rebuild_model, - gp_fit=gp_fit, - **kwargs, - ) - k_raw = k_raw.to(torch.float32) - if normalize: - K_diag = torch.sqrt(torch.diag(k_raw)) - k_raw /= torch.ger(K_diag, K_diag) - - K = weight_single_hierarchy * k_raw - - return K - - -class SumKernel(CombineKernel): - def __init__(self, *kernels, **kwargs): - super().__init__("sum", *kernels, **kwargs) - - -class ProductKernel(CombineKernel): - def __init__(self, *kernels, **kwargs): - super().__init__("product", *kernels, **kwargs) diff --git a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py deleted file mode 100644 index 3ed9b5b9..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -from neps.optimizers.bayesian_optimization.kernels import Kernel -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( - HammingKernel, - Matern52Kernel, -) -import torch -from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import WeisfilerLehman - -from neps.search_spaces import SearchSpace - - -# TODO: Option to combine numerical and categorical into one. 
-def get_default_kernels( - *, - space: SearchSpace, - optimizable: bool = True, -) -> list[tuple[Kernel, list[str]],]: - kernels: list[tuple[Kernel, list[str]]] = [] - if any(space.graphs): - h = 2 - if optimizable: - layer_weights = torch.nn.Parameter(torch.ones(h + 1)) - else: - layer_weights = None - - kernels.append( - ( - WeisfilerLehman(h=2, layer_weights=layer_weights, oa=True), - list(space.graphs.keys()), - ) - ) - - if any(space.categoricals): - if optimizable: - lengthscales = torch.nn.Parameter(torch.ones(len(space.categoricals))) - else: - lengthscales = torch.ones(len(space.categoricals)) - - kernels.append( - ( - HammingKernel(lengthscale=lengthscales), - list(space.categoricals.keys()), - ) - ) - - if any(space.numerical): - if optimizable: - lengthscales = torch.nn.Parameter(torch.ones(len(space.numerical))) - else: - lengthscales = torch.ones(len(space.numerical)) - - kernels.append( - (Matern52Kernel(lengthscale=lengthscales), list(space.numerical.keys())) - ) - - return kernels diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py index f643dcc8..1b0b37d6 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py @@ -1,4 +1,5 @@ """The Edge Histogram kernel as defined in :cite:`sugiyama2015halting`.""" + from collections import Counter from collections.abc import Iterable from warnings import warn @@ -102,9 +103,7 @@ def parse_input(self, X: Iterable, **kwargs): # Initialise the feature matrix if self._method_calling in [1, 2]: if self.sparse == "auto": - self.sparse_ = ( - len(cols) / float(ni * len(labels)) <= 0.5 - ) + self.sparse_ = len(cols) / float(ni * len(labels)) <= 0.5 else: self.sparse_ = bool(self.sparse) @@ -119,8 +118,11 @@ def parse_input(self, X: Iterable, **kwargs): features[rows, cols] = data except MemoryError: warn("memory-error: switching to sparse") - self.sparse_, features = True, csr_matrix( - (data, (rows, cols)), shape=(ni, len(labels)), copy=False + self.sparse_, features = ( + True, + csr_matrix( + (data, (rows, cols)), shape=(ni, len(labels)), copy=False + ), ) if ni == 0: diff --git a/neps/optimizers/bayesian_optimization/kernels/kernel.py b/neps/optimizers/bayesian_optimization/kernels/kernel.py index 52db2751..57cd4895 100644 --- a/neps/optimizers/bayesian_optimization/kernels/kernel.py +++ b/neps/optimizers/bayesian_optimization/kernels/kernel.py @@ -1,29 +1,37 @@ from __future__ import annotations -import math -import inspect import copy -from typing import TypeVar, Generic, Any, Sequence, Mapping, Callable +import inspect +from abc import ABC, abstractmethod +import math +from typing import Any, ClassVar, Generic, Mapping, Sequence, TypeVar from typing_extensions import Self + import torch -import torch.nn as nn +from torch import nn from neps.utils.types import NotSet T = TypeVar("T") -class Kernel(nn.Module, Generic[T]): - def fit_transform(self, x: T) -> torch.Tensor: - raise NotImplementedError +class Kernel(ABC, nn.Module, Generic[T]): + suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] + + def __init__(self) -> None: + super().__init__() - def transform(self, x: T) -> torch.Tensor: + @abstractmethod + def as_optimizable(self) -> Self: ... 
+ + @abstractmethod + def forward(self, x: T, x2: T | None = None) -> torch.Tensor: raise NotImplementedError def clone(self) -> Self: return self.clone_with() - def clone_with(self, **params: dict[str, Any]) -> Self: + def clone_with(self, **params: Any) -> Self: # h ttps://github.com/scikit-learn/scikit-learn/blob/70fdc843a4b8182d97a3508c1a426acc5e87e980/sklearn/base.py#L197 sig = inspect.signature(self.__init__) @@ -46,65 +54,114 @@ def clone_with(self, **params: dict[str, Any]) -> Self: def grid_search( self, x: T, + y: torch.Tensor, *, grid: Sequence[Mapping[str, Any]], - to_minimize: Callable[[torch.Tensor], float], - ) -> tuple[Self, float]: + noise_variances: Sequence[float] = (1e-6,), + ) -> tuple[Self, float] | Exception: + # Returns: (Kernel[T], float) | None if failed if len(grid) == 0: raise ValueError("Grid must have at least one element.") - def _fit_and_eval(_params: Mapping[str, Any]) -> tuple[Kernel[T], float]: + def _fit_and_eval( + _params: Mapping[str, Any], + ) -> tuple[Kernel[T], float] | Exception: cloned_kernel = self.clone_with(**_params) - K = cloned_kernel.fit_transform(x) - metric = to_minimize(K) - return cloned_kernel, metric + K = cloned_kernel.forward(x) + + best_lml = -float("inf") + exception: Exception | None = None + for noise_variance in noise_variances: + K.diag().add_(noise_variance) - return min( - (_fit_and_eval(params) for params in grid), - key=lambda x: x[1], - ) + K_inv, logDetK = compute_pd_inverse(K) + lml = log_marginal_likelihood(K_inv, logDetK, y).item() + if lml > best_lml: + best_lml = lml + + K.diag().sub_(noise_variance) + + if exception is None: + return cloned_kernel, best_lml + + return exception + + evals = [_fit_and_eval(params) for params in grid] + evals_with_score = [e for e in evals if not isinstance(e, Exception)] + if not any(evals_with_score): + raise evals[-1] # type: ignore + + best_eval = max(evals_with_score, key=lambda e: e[1]) # type: ignore + return best_eval class NumericKernel(Kernel[torch.Tensor]): ... -PI = torch.tensor(math.pi) +TWO_LOG_2_PI = 2 * torch.log(torch.tensor(2 * math.pi)) -def compute_normalized_log_marginal_likelihood( - K_i: torch.Tensor, +def log_marginal_likelihood( + K_inv: torch.Tensor, logDetK: torch.Tensor, y: torch.Tensor, ) -> torch.Tensor: - """Compute the zero mean Gaussian process log marginal likelihood - given the inverse of Gram matrix K(x2,x2), its log determinant, - and the training label vector y. 
- """ - lml = -0.5 * (y.t() @ K_i @ y) + 0.5 * logDetK - y.shape[0] / 2.0 * torch.log(2 * PI) - return lml / y.shape[0] + # y.T @ K_inv @ y --- Benchmarked to be twice as fast + quad_form = torch.matmul(y, torch.matmul(K_inv, y)) + n = y.shape[0] + # TODO: We can drop the `n / 2 * TWO_LOG_2_PI` term for the grid + # search above as it's constant between the different kernel grids + # as it's purely data dependant with the `n` + return -0.5 * quad_form + 0.5 * logDetK - n / TWO_LOG_2_PI -def compute_pd_inverse( + +class _CholeskyError(RuntimeError): + """Raised when the Cholesky decomposition fails.""" + + +# https://github.com/cornellius-gp/linear_operator/blob/eec70f9e1cd9106c32b05a3e774ea29d00d71cea/linear_operator/utils/cholesky.py#L12 +def _cholesky_routine( K: torch.Tensor, - *, - jitter: float | torch.Tensor = 1e-9, - attempts: int = 3, -) -> tuple[torch.Tensor, torch.Tensor]: - """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" - n = K.shape[0] - assert ( - isinstance(jitter, float) or jitter.ndim == 0 - ), "only homoscedastic noise variance is allowed here!" - for i in range(attempts): - try: - jitter_diag = jitter * torch.eye(n, device=K.device) * 10**i - Kc = torch.linalg.cholesky(K + jitter_diag) - break - except RuntimeError: - pass - else: - raise RuntimeError(f"Gram matrix not positive definite despite of jitter:\n{K}") - - logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) - K_i = torch.cholesky_inverse(Kc) - return K_i.to(dtype=torch.float64), logDetK.to(dtype=torch.float64) + jitter: float | torch.Tensor = 1e-6, + max_tries: int = 4, +) -> torch.Tensor: + L, info = torch.linalg.cholesky_ex(K) + if not torch.any(info): + return L + + # Clone as we will modify in place, still cheaper + # than creating a new full tensor for identity. + K_prime = K.clone() + jitter_prev = 0 + for i in range(max_tries): + jitter_new = jitter * (10**i) + K_prime.diagonal().add_(jitter_new - jitter_prev) + L, info = torch.linalg.cholesky_ex(K_prime) + if not torch.any(info): + return L + + jitter_prev = jitter_new + + raise _CholeskyError("Failed to compute Cholesky decomposition.") + + +def compute_pd_inverse(K: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + # Adding noise to the diagonal of K helps with numerical stability + # when K is singular or near-singular, (i.e. it helps K be more "positive") which + # is required for the decomposition. + + try: + # L @ L.T = K_inv --- solves for L + L = _cholesky_routine(K) + logDetK = 2 * torch.sum(torch.log(torch.diag(L))) + + # K_inv = L_inv @ L_inv.T --- Efficiently solve for K_inv using just L + K_inv = torch.cholesky_inverse(L) + except _CholeskyError: + # If we fail to compute the Cholesky decomposition, + # then just compute the inverse directly. 
+ K_inv = torch.linalg.inv(K) + logDetK = torch.linalg.slogdet(K)[1] + + return K_inv, logDetK diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index b25446d4..8bcbd45b 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,150 +1,103 @@ from __future__ import annotations from math import sqrt -from typing_extensions import override -from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel +from typing import Any, Mapping, Sequence, ClassVar +from typing_extensions import override, Self -import numpy as np +from itertools import product import torch +import torch.nn as nn + +from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel -DEFAULT_LENGTHSCALE_BOUNDS = np.exp(-6.754111155189306), np.exp(0.0858637988771976) +# TODO: +# We should try some variations of singular length scales +# (1 scale shared across all dimensions) +# and individual ARD lengthscales (1 for each dimension) +# ARD can overfit if not properly tuned... +LENGTHSCALE_GRID = (1e-2, 1e-1, 1, 1e1, 1e2) +STD_ENCODED_OUTPUT_SCALE = (1e-2, 1e-1, 1, 1e1, 1e2) class Stationary(Kernel[torch.Tensor]): - """Here we follow the structure of GPy to build a sub class of stationary kernel. - - All the classes (i.e. the class of stationary kernel_operators) derived from this - class use the scaled distance to compute the Gram matrix. - """ + suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ + {"lengthscale": l, "output_scale": o} + for l, o in product(LENGTHSCALE_GRID, STD_ENCODED_OUTPUT_SCALE) + ] def __init__( self, *, - lengthscale: torch.Tensor, - outputscale: float | torch.Tensor = 1.0, - lengthscale_bounds: tuple[float, float] = DEFAULT_LENGTHSCALE_BOUNDS, + lengthscale: torch.Tensor | None = None, + outputscale: torch.Tensor | None = None, + lengthscale_bounds: tuple[float, float] | None = (1e-2, 1e2), + outputscale_bounds: tuple[float, float] | None = (1e-2, 1e2), + device: torch.device | None = None, ): - self.lengthscale = lengthscale - self.outputscale = outputscale + super().__init__() + self.lengthscale = ( + torch.as_tensor(lengthscale, dtype=torch.float64, device=device) + if lengthscale is not None + else torch.tensor(1, dtype=torch.float64, device=device) + ) + self.outputscale = ( + torch.as_tensor(outputscale, dtype=torch.float64, device=device) + if outputscale is not None + else torch.tensor(1, dtype=torch.float64, device=device) + ) self.lengthscale_bounds = lengthscale_bounds + self.outputscale_bounds = outputscale_bounds + self.device = device - self.gram_: torch.Tensor | None = None self.train_: torch.Tensor | None = None - def fit_transform(self, x: torch.Tensor) -> torch.Tensor: - K = self._forward(x) - self.train_ = x.clone().detach() - return K + def as_optimizable(self) -> Self: + return self.clone_with( + lengthscale=nn.Parameter(self.lengthscale), + outputscale=nn.Parameter(self.outputscale), + ) + + def forward(self, x: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: + # NOTE: I don't think this is the right way to do this... + with torch.no_grad(): + self.lengthscale.data.clamp_(*self.lengthscale_bounds) + self.outputscale.data.clamp_(*self.outputscale_bounds) - def transform(self, x: torch.Tensor) -> torch.Tensor: - if self.train_ is None: - raise ValueError("The kernel has not been fitted. 
Run fit_transform first") - return self._forward(self.train_, x) + x2 = x if x2 is None else x2 + return self._forward(x, x2) - def _forward(self, x1: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: - return _scaled_distance(self.lengthscale, x1, x2) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + return self.outputscale * torch.cdist(x1, x2, p=2) class RBFKernel(Stationary): @override - def _forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - ) -> torch.Tensor: - dist_sq = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) - return self.outputscale * torch.exp(-0.5 * dist_sq) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + dist_sq = torch.cdist(x1, x2, p=2) ** 2 + return self.outputscale * torch.exp(-dist_sq / (2 * self.lengthscale**2)) class Matern32Kernel(Stationary): @override - def _forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - ) -> torch.Tensor: - dist = _scaled_distance(self.lengthscale, x1, x2) - return self.outputscale * (1 + sqrt(3.0) * dist) * torch.exp(-sqrt(3.0) * dist) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + dist = torch.cdist(x1, x2, p=2) / self.lengthscale + factor = sqrt(3.0) * dist + matern32 = (1 + factor) * torch.exp(-factor) + return self.outputscale * matern32 -class Matern52Kernel(Stationary): +class HammingKernel(Stationary): @override - def _forward( - self, - x1: torch.Tensor, - x2: torch.Tensor | None = None, - ) -> torch.Tensor: - dist = _scaled_distance(self.lengthscale, x1, x2, sq_dist=True) - return ( - self.outputscale - * (1 + sqrt(5.0) * dist + 5.0 / 3.0 * dist) - * torch.exp(-sqrt(5.0) * dist) - ) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + dists = (x1.unsqueeze(1) != x2.unsqueeze(0)).float().sum(-1) / x1.shape[-1] + scaled_dists = dists / self.lengthscale + return self.outputscale * torch.exp(-scaled_dists) -def _unscaled_square_distance( - X: torch.Tensor, - X2: torch.Tensor | None = None, -) -> torch.Tensor: - """The unscaled distance between X and X2.""" - assert X.ndim == 2 - X1sq = torch.sum(X**2, 1) - X2sq = X1sq if (X2 is None or X is X2) else torch.sum(X2**2, 1) - X2 = X if X2 is None else X2 - - r2 = -2 * X @ X2.T + X1sq[:, None] + X2sq[None, :] - r2 += 1e-15 - return torch.clamp_min(r2, 0.0) - - -def _scaled_distance( - lengthscale: torch.Tensor, - X: torch.Tensor, - X2: torch.Tensor | None = None, - *, - sq_dist: bool = False, -) -> torch.Tensor: - """Compute the *scaled* distance between X and x2 (or, if X2 is not supplied, - the distance between X and itself) by the lengthscale. if a scalar (float) or a - dim=1 lengthscale vector is supplied, then it is assumed that we use one - lengthscale for all dimensions. Otherwise, we have an ARD kernel and in which case - the length of the lengthscale vector must be the same as the dimensionality of the - problem.""" - if len(lengthscale) == 1: - if sq_dist is False: - return torch.sqrt(_unscaled_square_distance(X, X2)) / (lengthscale**2) - - return _unscaled_square_distance(X, X2) / lengthscale - - # ARD kernel - one lengthscale per dimension - assert len(lengthscale) == X.shape[1], ( - f"Lengthscale must have the same dimensionality as the input data." 
- f"Got {len(lengthscale)} and {X.shape[1]}" - ) - rescaled_X = X / lengthscale - if X2 is None: - dist = _unscaled_square_distance(rescaled_X) - else: - rescaled_X2 = X2 / lengthscale - dist = _unscaled_square_distance(rescaled_X, rescaled_X2) - - return dist if sq_dist else torch.sqrt(dist) - - -def _hamming_distance( - lengthscale: torch.Tensor, - X: torch.Tensor, - X2: torch.Tensor | None = None, -) -> torch.Tensor: - if X2 is None: - X2 = X - - indicator = X.unsqueeze(1) != X2 - C = -1 / (2 * lengthscale**2) - scaled_indicator = C * indicator - diffs = scaled_indicator.sum(dim=2) - - if len(lengthscale) == 1: - return torch.exp(diffs) / lengthscale - - return torch.exp(diffs) +class Matern52Kernel(Stationary): + @override + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + dist = torch.cdist(x1, x2, p=2) / self.lengthscale + factor = sqrt(5.0) * dist + matern52 = (1 + factor + (factor**2) / 3) * torch.exp(-factor) + return self.outputscale * matern52 diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py index b1d4cd7e..68d257b1 100644 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py @@ -1,8 +1,13 @@ from __future__ import annotations +from typing import Any, ClassVar, Mapping, Sequence +from typing_extensions import Self + import torch +import torch.nn as nn +from itertools import product -from typing import Sequence +import numpy as np from neps.optimizers.bayesian_optimization.kernels.grakel_replace import ( VertexHistogram, WeisfeilerLehman as _WL, @@ -11,9 +16,17 @@ from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary from neps.search_spaces.encoding import WLInput +GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)]) +GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) + class WeisfilerLehman(Kernel[Sequence[WLInput]]): - """Weisfiler Lehman kernel using grakel functions""" + """Weisfiler Lehman kernel using grakel functions.""" + + suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ + {"h": h, "se_kernel": Stationary(lengthscale=l)} + for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) + ] def __init__( self, @@ -44,7 +57,9 @@ def __init__( self.h = h self.se_kernel = se_kernel - self.layer_weights = layer_weights + self.layer_weights = ( + layer_weights if layer_weights is not None else torch.ones(h + 1) + ) self.oa = oa self.node_label = node_label if node_label != "op_name": @@ -52,7 +67,11 @@ def __init__( self.wl_kernel_: _WL | None = None + def as_optimizable(self) -> Self: + return self.clone_with(layer_weights=nn.Parameter(self.layer_weights)) + def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + self.layer_weights.clamp_(0, 1) self.wl_kernel_ = _WL( h=self.h, base_graph_kernel=( # type: ignore @@ -68,14 +87,12 @@ def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: normalize=True, ) - # TODO: This could probably be lifted to the caller K = self.wl_kernel_.fit_transform(gr) - K = torch.as_tensor(K, dtype=torch.float64) - self.layer_weights_ = self.wl_kernel_.layer_weights return torch.as_tensor(K, dtype=torch.float64) def transform(self, gr: Sequence[WLInput]) -> torch.Tensor: assert self.wl_kernel_ is not None + self.layer_weights.clamp_(0, 1) K = self.wl_kernel_.transform(gr) return torch.as_tensor(K, dtype=torch.float64) diff --git 
a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 6279e973..6ce65b61 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,7 +1,6 @@ from neps.utils.common import MissingDependencyError -from .gp import ComprehensiveGP -from .gp_hierarchy import ComprehensiveGPHierarchy +from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP try: from neps.optimizers.models.deepGP import DeepGP @@ -16,6 +15,5 @@ SurrogateModelMapping = { "deep_gp": DeepGP, "gp": ComprehensiveGP, - "gp_hierarchy": ComprehensiveGPHierarchy, "pfn": PFN_SURROGATE, } diff --git a/neps/optimizers/bayesian_optimization/models/deepGP.py b/neps/optimizers/bayesian_optimization/models/deepGP.py index ffc3606f..a98242a1 100644 --- a/neps/optimizers/bayesian_optimization/models/deepGP.py +++ b/neps/optimizers/bayesian_optimization/models/deepGP.py @@ -1,18 +1,17 @@ from __future__ import annotations -from dataclasses import dataclass, field import logging from copy import deepcopy +from dataclasses import dataclass, field from pathlib import Path import gpytorch import numpy as np import torch -import torch.nn as nn -from neps.search_spaces.architecture.graph_grammar import GraphParameter +from torch import nn from neps.exceptions import SurrogateFailedToFit - +from neps.search_spaces.architecture.graph_grammar import GraphParameter from neps.search_spaces.search_space import ( CategoricalParameter, FloatParameter, @@ -50,9 +49,7 @@ def count_non_improvement_steps(root_directory: Path | str) -> int: class NeuralFeatureExtractor(nn.Module): - """ - Neural network to be used in the DeepGP - """ + """Neural network to be used in the DeepGP.""" def __init__(self, input_size: int, **kwargs): super().__init__() @@ -121,15 +118,11 @@ def forward(self, x, budgets, learning_curves): # put learning curve features into the last layer along with the higher level features. x = torch.cat((x, lc_features), dim=1) - x = self.activation(getattr(self, f"fc{self.n_layers}")(x)) - - return x + return self.activation(getattr(self, f"fc{self.n_layers}")(x)) class GPRegressionModel(gpytorch.models.ExactGP): - """ - A simple GP model. - """ + """A simple GP model.""" def __init__( self, @@ -137,8 +130,7 @@ def __init__( train_y: torch.Tensor, likelihood: gpytorch.likelihoods.GaussianLikelihood, ): - """ - Constructor of the GPRegressionModel. + """Constructor of the GPRegressionModel. Args: train_x: The initial train examples for the GP. 
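
GPRegressionModel above is a plain gpytorch ExactGP, so it can be exercised on its own with the usual exact-GP training loop. The sketch below uses made-up data and is only for orientation, not part of the patch; in the patch itself the GP is fit jointly with the neural feature extractor inside _train_model further down:

import gpytorch
import torch

# Path taken from this file's diff header; it may move elsewhere in the series.
from neps.optimizers.bayesian_optimization.models.deepGP import GPRegressionModel

train_x = torch.randn(16, 8)   # placeholder features
train_y = torch.randn(16)      # placeholder targets

likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GPRegressionModel(train_x=train_x, train_y=train_y, likelihood=likelihood)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

model.train()
likelihood.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)  # also covers the likelihood noise
for _ in range(50):
    optimizer.zero_grad()
    loss = -mll(model(train_x), train_y)
    loss.backward()
    optimizer.step()

model.eval()
likelihood.eval()
with torch.no_grad():
    pred = likelihood(model(torch.randn(4, 8)))
    mean, var = pred.mean, pred.variance
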
@@ -237,7 +229,7 @@ def encode_configs( def encode_learning_curves(self, learning_curves: list[list[float]]) -> torch.Tensor: lc_height = len(learning_curves) lc_width = max( - max(len(lc) for lc in learning_curves), self.min_learning_curve_length + *(len(lc) for lc in learning_curves), self.min_learning_curve_length ) lc_buffer = torch.full( (lc_width, lc_height), @@ -285,7 +277,7 @@ def _train_model( optimizer = torch.optim.Adam( [ - dict({"params": model.parameters()}, **optimizer_args), + dict({"model_params": model.parameters()}, **optimizer_args), dict({"params": nn.parameters()}, **optimizer_args), ] ) @@ -294,7 +286,7 @@ def _train_model( min_avg_loss_val = np.inf average_loss: float = 0.0 - for epoch_nr in range(0, n_epochs): + for epoch_nr in range(n_epochs): if early_stopping and count_down == 0: logger.info( f"Epoch: {epoch_nr - 1} surrogate training stops due to early " diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 8173aaac..f46e89b9 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,160 +1,132 @@ from __future__ import annotations import logging -import torch -import numpy as np -from typing import Literal, Sequence, Any, Mapping -from typing_extensions import Literal from dataclasses import dataclass, field -from itertools import product +from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence +from typing_extensions import Literal +import torch.nn as nn + +import numpy as np +import torch from neps.optimizers.bayesian_optimization.kernels.kernel import ( Kernel, - NumericKernel, - compute_normalized_log_marginal_likelihood, + log_marginal_likelihood, compute_pd_inverse, ) - -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( + HammingKernel, + Matern52Kernel, +) from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import ( WeisfilerLehman, ) -from neps.search_spaces.encoding import TensorEncodedConfigs -from neps.search_spaces.search_space import SearchSpace - -logger = logging.getLogger(__name__) - -f64 = torch.float64 +from neps.search_spaces import SearchSpace +from neps.search_spaces.encoding import ( + IntegerCategoricalTransformer, + JointTransformer, + MinMaxNormalizer, + OneHotEncoder, + TensorTransformer, + Transformer, + WLInputTransformer, +) +from neps.search_spaces.hyperparameters.float import FloatParameter +from neps.search_spaces.hyperparameters.integer import IntegerParameter +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace -GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)], dtype=f64) -GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) +logger = logging.getLogger(__name__) -def _default_param_grid() -> dict[type[Kernel], list[dict[str, Any]]]: - return { - WeisfilerLehman: [ - {"h": h, "se_kernel": Stationary(lengthscale=l)} - for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) - ] - } +# The optimization we do for the noise is relatively cheap while the matrices +NOISE_VARIANCE_GRID = (1e-6, 1e-4, 1e-2, 1, 1e1, 1e2) @dataclass class ComprehensiveGP: space: SearchSpace - kernels: Sequence[tuple[Kernel, Sequence[str]]] + kernels: dict[str, tuple[Kernel, Transformer]] combined_kernel: Literal["sum", "product"] = "sum" - initial_likelihood: float = 1e-3 - optimize_likelihood: bool = True - max_likelihood: float = 
0.01 + + noise_variance: Sequence[float] = NOISE_VARIANCE_GRID + kernel_parameter_grid: Mapping[str, Sequence[Mapping[str, Any]]] | bool = True + optimizer: Literal["adam", "sgd"] = "adam" - optimizer_iters: int = 20 - optimize_wl_layer_weights: bool = False - surrogate_model_fit_args: Mapping[str, Any] = field(default_factory=dict) optimizer_kwargs: Mapping[str, Any] = field(default_factory=lambda: {"lr": 0.1}) - kernel_hp_grids: Mapping[type[Kernel], Sequence[Mapping[str, Any]]] = field( - default_factory=_default_param_grid - ) + optimizer_iters: int = 20 + device: torch.device | None = None # Post fit attributes - K_i_: torch.Tensor | None = None + K_inv_: torch.Tensor | None = None n_train_: int | None = None likelihood_: float | None = None y_: torch.Tensor | None = None y_normalized_: torch.Tensor | None = None y_mean_: float | None = None y_std_: float | None = None - optimized_kernels_: ( - list[tuple[NumericKernel | WeisfilerLehman, Sequence[str]]] | None - ) = None - kernel_weights_: torch.Tensor | None = None + optimized_kernels_: dict[str, Kernel] | None = None + train_data_: dict[str, Any] | None = None def __post_init__(self): # TODO: Remove when search space is just definition and does not hold values. self.space = self.space.clone() - def fit(self, x: TensorEncodedConfigs, train_y: torch.Tensor) -> None: + def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: # Preprocessing - y_ = torch.as_tensor(train_y, dtype=f64) + y_ = torch.as_tensor(train_y, device=self.device, dtype=torch.float64) # TODO: Dunno if I like this silent hack, setting std to 1 if no std self.y_std_ = s if (s := torch.std(y_).item()) != 0 else 1 self.y_mean_ = torch.mean(y_).item() self.y_normalized_ = (y_ - self.y_mean_) / self.y_std_ - - optimized_kernels: list[ - tuple[NumericKernel | WeisfilerLehman, Sequence[str]] - ] = [] - _grids = self.kernel_hp_grids - - def _eval_kernel(_K: torch.Tensor) -> float: - assert y_ is not None - K_i, logDetK = compute_pd_inverse(_K) - nlml = -compute_normalized_log_marginal_likelihood(K_i, logDetK, y_) - return float(nlml) - - for kernel, hps in self.kernels: - if isinstance(kernel, WeisfilerLehman): - assert len(hps) == 1, "Only support single kernel per graph." 
- _xs = x.wl_graph_input(hps[0]) - elif isinstance(kernel, NumericKernel): - _xs = x.tensor(hps) - else: - raise ValueError(f"Unsupported kernel type {type(kernel)}") - - grid = next((g for t, g in _grids.items() if isinstance(kernel, t)), None) - if grid is None: - optimized_kernel = kernel.clone() - _ = optimized_kernel.fit_transform(_xs) # type: ignore - optimized_kernels.append((kernel, hps)) - continue - - optimized_kernel, _ = kernel.grid_search( - x=_xs, # type: ignore + self.y_ = y_ + + _data = { + key: transformer.encode(x, self.space) + for key, (_, transformer) in self.kernels.items() + } + + # optimized kernel parameters + noise variance + optim_vars: list[nn.Parameter] = [] + + grids = { + name: k.suggested_grid + for name, (k, _) in self.kernels.items() + if k.suggested_grid is not None + } + + kernels: dict[str, Kernel] = {} + for kernel_name, (kernel, _) in self.kernels.items(): + xs = _data[kernel_name] + grid = grids[kernel_name] + + maybe_optimized_kernel = kernel.grid_search( + x=xs, + y=self.y_normalized_, grid=grid, - to_minimize=_eval_kernel, ) - optimized_kernels.append((optimized_kernel, hps)) - - # Optimization weights - likelihood = torch.tensor( - self.initial_likelihood, - requires_grad=self.optimize_likelihood, + if isinstance(maybe_optimized_kernel, Exception): + raise ValueError( + f"Failed to optimize kernel {kernel_name} with grid {grid}." + ) from maybe_optimized_kernel + + opt_kernel, _ = maybe_optimized_kernel + gradient_enabled_kernel = opt_kernel.as_optimizable() + kernels[kernel_name] = gradient_enabled_kernel + + optim_vars.extend(gradient_enabled_kernel.parameters()) + + # Now that we've optimized the kernels, we convert go convert their + # parameters into a tensor we can further refine with some optimizer iterations + # - Optimize kernel-lengthscales, kernel-outputscale, noise-variance + # and any additional parameters they wish to advertise. 
+ noise_variance = nn.Parameter( + torch.tensor(1e-3, device=self.device, dtype=torch.float64) ) + optim_vars.append(noise_variance) - kernel_weights = torch.ones( - len(optimized_kernels), - requires_grad=len(optimized_kernels) > 1, - dtype=f64, - ) - should_optimize = lambda p: p.is_leaf and p.requires_grad - - # Linking the optimizer variables to the sum kernel - optim_vars: list[torch.Tensor] = [ - a - for a in (kernel_weights, likelihood) - if a is not None and should_optimize(a) - ] - layer_weights = [ - kernel.layer_weights_ - for kernel, _ in optimized_kernels - if isinstance(kernel, WeisfilerLehman) - and kernel.layer_weights_ is not None - and should_optimize(kernel.layer_weights_) - ] - lengthscales = [ - kernel.layer_weights_ - for kernel, _ in optimized_kernels - if isinstance(kernel, NumericKernel) and should_optimize(kernel.lengthscale) - ] - lengthscalebounds = [ - kernel.lengthscale_bounds - for kernel, _ in optimized_kernels - if isinstance(kernel, NumericKernel) and should_optimize(kernel.lengthscale) - ] - - # Select the optimizer if self.optimizer == "adam": optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore elif self.optimizer == "sgd": @@ -162,106 +134,153 @@ def _eval_kernel(_K: torch.Tensor) -> float: else: raise ValueError(f"Invalid optimizer {self.optimizer}") - K: torch.Tensor | None = None + K_inv: torch.Tensor | None = None N = len(x) - for _ in range(self.optimizer_iters): + for i in range(self.optimizer_iters): optim.zero_grad() - # Now we iterate over kernels to build up K _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - K = _init(N, N, dtype=f64) - for (kernel, hps), weight in zip(self.kernels, kernel_weights): - if isinstance(kernel, WeisfilerLehman): - assert len(hps) == 1, "Only support single kernel per graph." - _xs = x.wl_graph_input(hps[0]) - gram = kernel.fit_transform(_xs) - elif isinstance(kernel, NumericKernel): - _xs = x.tensor(hps) - gram = kernel.fit_transform(_xs) - else: - raise ValueError(f"Unsupported kernel type {type(kernel)}") + K = _init(N, N, device=self.device, dtype=torch.float64) + for kernel_name, kernel in kernels.items(): + data = _data[kernel_name] + gram = kernel.forward(data, data) if self.combined_kernel == "sum": - K.add_(weight * gram) - elif self.combined_kernel == "product": - K.mul_(weight * gram) + K.add_(gram) else: - raise ValueError(f"Invalid combined_kernel {self.combined_kernel}") + K.mul_(gram) - # Normalize - K_diag = torch.sqrt(torch.diag(K)) - K /= torch.ger(K_diag, K_diag) - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + K.diag().add_(noise_variance) - # If there's nothing to optimize, break out early - if len(optim_vars) == 0: - break + K_inv, logDetK = compute_pd_inverse(K) + nlml = -log_marginal_likelihood(K_inv, logDetK, y=self.y_normalized_) - nlml = -compute_normalized_log_marginal_likelihood( - K_i, logDetK, y=self.y_normalized_ - ) + # TODO: Could early stop here... 
nlml.backward() optim.step() with torch.no_grad(): - kernel_weights.clamp_(0.0, 1.0) - if likelihood.is_leaf: - likelihood.clamp_(1e-9, self.max_likelihood) - - for ls, ls_bounds in zip(lengthscales, lengthscalebounds): - ls.clamp_(*ls_bounds) - - for lw in layer_weights: - lw.clamp_(0.0, 1.0) - - optim.zero_grad() - - assert K is not None - K_i, logDetK = compute_pd_inverse(K, jitter=likelihood) + noise_variance.clamp_(1e-6, np.inf) # Apply the optimal hyperparameters - self.K_i_ = K_i.clone() - self.likelihood_ = likelihood.item() - self.optimized_kernels_ = optimized_kernels - self.kernel_weights_ = kernel_weights + assert K_inv is not None + self.K_inv_ = K_inv.clone() + self.noise_variance_ = noise_variance.item() + self.optimized_kernels_ = kernels self.n_train_ = N - - def predict(self, x: TensorEncodedConfigs) -> tuple[torch.Tensor, torch.Tensor]: - """Kriging predictions""" - if self.K_i_ is None or self.n_train_ is None or self.kernel_weights_ is None: + self.train_data_ = _data + + def predict(self, x: list[dict[str, Any]]) -> tuple[torch.Tensor, torch.Tensor]: + """Kriging predictions.""" + if ( + self.K_inv_ is None + or self.n_train_ is None + or self.optimized_kernels_ is None + or self.train_data_ is None + or self.y_normalized_ is None + or self.y_std_ is None + ): raise ValueError( "Inverse of Gram matrix is not instantiated. Please call the optimize " "function to fit on the training data first!" ) + _data = { + key: transformer.encode(x, self.space) + for key, (_, transformer) in self.kernels.items() + } _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - N = self.n_train_ + len(x) - K = _init(N, N, dtype=f64) - for (kernel, hps), weight in zip(self.kernels, self.kernel_weights_): - if isinstance(kernel, WeisfilerLehman): - assert len(hps) == 1, "Only support single kernel per graph." 
- _x_test = x.wl_graph_input(hps[0]) - gram = kernel.transform(_x_test) - elif isinstance(kernel, NumericKernel): - _x_test = x.tensor(hps) - gram = kernel.fit_transform(_x_test) + n_test = len(x) + + K_train_test = _init( + self.n_train_, n_test, device=self.device, dtype=torch.float64 + ) + K_test_test = _init(n_test, n_test, device=self.device, dtype=torch.float64) + + for kernel_name, kernel in self.optimized_kernels_.items(): + train_x = self.train_data_[kernel_name] + test_x = _data[kernel_name] + + gram = kernel.forward(train_x, test_x) + if self.combined_kernel == "sum": + K_train_test.add_(gram) else: - raise ValueError(f"Unsupported kernel type {type(kernel)}") + K_train_test.mul_(gram) + gram = kernel.forward(test_x, test_x) if self.combined_kernel == "sum": - K.add_(weight * gram) - elif self.combined_kernel == "product": - K.mul_(weight * gram) + K_test_test.add_(gram) else: + K_test_test.mul_(gram) - K_s = K[: self.n_train_ :, self.n_train_ :] - K_ss = K[self.n_train_ :, self.n_train_ :] + self.likelihood_ * torch.eye(len(x)) + # Compute the predictive mean - mu_s = K_s.t() @ self.K_i_ @ self.y_normalized_ + # Scale by the standard deviation and mean + mu_s = K_train_test.t() @ self.K_inv_ @ self.y_normalized_ mu_s = mu_s * self.y_std_ + self.y_mean_ - cov_s = K_ss - K_s.t() @ self.K_i_ @ K_s - cov_s = torch.clamp(cov_s, self.likelihood_, np.inf) - cov_s = (torch.sqrt(cov_s) * self.y_std_) ** 2 + cov_s = K_test_test - K_train_test.t() @ self.K_inv_ @ K_train_test + cov_s.diagonal().clamp_(self.noise_variance_, np.inf) + cov_s *= self.y_std_**2 return mu_s, cov_s + + @classmethod + def get_default( + cls, space: SearchSpace, *, include_fidelities: bool = False + ) -> ComprehensiveGP: + kernels = get_default_kernels(space=space, include_fidelities=include_fidelities) + return cls(space=space, kernels=kernels) + + +def get_default_kernels( + *, + space: SearchSpace, + include_fidelities: bool = False, +) -> dict[str, tuple[Kernel, Transformer]]: + kernels: dict[str, tuple[Kernel, Transformer]] = {} + + # We will always need to use a graph kernel for graphs and there's no + # possibility to embed them into a tensor. 
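The predictive mean and covariance computed above are the usual GP posterior (kriging) equations, mu_s = K_s^T K^{-1} y and cov_s = K_ss - K_s^T K^{-1} K_s, followed by rescaling with the stored target mean and standard deviation. A compact sketch of the core computation on plain tensors, with illustrative names:

from __future__ import annotations

import torch


def gp_posterior(
    K_inv: torch.Tensor,         # (n_train, n_train), inverse of the noised train Gram matrix
    K_train_test: torch.Tensor,  # (n_train, n_test)
    K_test_test: torch.Tensor,   # (n_test, n_test)
    y_train: torch.Tensor,       # (n_train,), normalised training targets
) -> tuple[torch.Tensor, torch.Tensor]:
    mean = K_train_test.t() @ K_inv @ y_train
    cov = K_test_test - K_train_test.t() @ K_inv @ K_train_test
    return mean, cov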
+ if any(space.graphs): + for hp_name in space.graphs: + kernels[f"graph_{hp_name}"] = ( + WeisfilerLehman(h=2, oa=True), + WLInputTransformer((hp_name,)), + ) + + assert all( + isinstance(f, (IntegerParameter, FloatParameter)) for f in space.fidelities + ), "Assumption for numeric represetnation of fidelity broken" + + any_numerical = any(space.numerical) or (include_fidelities and any(space.fidelities)) + if any_numerical: + # At least one numerical, fuse numeric + categoricals into one tensor encoding + transformers: list[TensorTransformer] = [] + if any(space.categoricals): + transformers.append(OneHotEncoder(tuple(space.categoricals))) + + if include_fidelities: + min_max_normalizer = MinMaxNormalizer( + tuple(space.numerical) + tuple(space.fidelities) + ) + else: + min_max_normalizer = MinMaxNormalizer(tuple(space.numerical)) + + transformers.append(min_max_normalizer) + kernels["vectorial"] = (Matern52Kernel(), JointTransformer.join(*transformers)) + else: + # At this point, we assume only categoricals and maybe fidelities + assert any(space.categoricals) + + if include_fidelities and any(space.fidelities): + fid_normalizer = MinMaxNormalizer(tuple(space.fidelities)) + one_hot_encoder = OneHotEncoder(tuple(space.categoricals)) + + transformer = JointTransformer.join(one_hot_encoder, fid_normalizer) + kernels["vectorial"] = (Matern52Kernel(), transformer) + else: + transformer = IntegerCategoricalTransformer(tuple(space.categoricals)) + kernels["categorical"] = (HammingKernel(), transformer) + + return kernels diff --git a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py b/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py deleted file mode 100644 index 2c9993be..00000000 --- a/neps/optimizers/bayesian_optimization/models/gp_hierarchy.py +++ /dev/null @@ -1,957 +0,0 @@ -import itertools -import logging -import warnings -from copy import deepcopy -from typing import Iterable, Union - -import numpy as np -import torch - -from ..kernels.combine_kernels_hierarchy import ProductKernel, SumKernel - -# GP model as a weighted average between the vanilla vectorial GP and the graph GP -from ..kernels.graph_kernel import GraphKernels -from ..kernels.utils import extract_configs_hierarchy -from ..kernels.vectorial_kernels import Stationary -from ..kernels.weisfilerlehman import WeisfilerLehman - -import logging - -logger = logging.getLogger(__name__) - - -# Code for psd_safe_cholesky from gypytorch -class _value_context: - _global_value = None - - @classmethod - def value(cls): - return cls._global_value - - @classmethod - def _set_value(cls, value): - cls._global_value = value - - def __init__(self, value): - self._orig_value = self.__class__.value() - self._instance_value = value - - def __enter__( - self, - ): - self.__class__._set_value(self._instance_value) - - def __exit__(self, *args): - self.__class__._set_value(self._orig_value) - return False - - -class _dtype_value_context: - _global_float_value = None - _global_double_value = None - _global_half_value = None - - @classmethod - def value(cls, dtype): - if torch.is_tensor(dtype): - dtype = dtype.dtype - if dtype == torch.float: - return cls._global_float_value - elif dtype == torch.double: - return cls._global_double_value - elif dtype == torch.half: - return cls._global_half_value - else: - raise RuntimeError(f"Unsupported dtype for {cls.__name__}.") - - @classmethod - def _set_value(cls, float_value, double_value, half_value): - if float_value is not None: - cls._global_float_value = float_value - if 
double_value is not None: - cls._global_double_value = double_value - if half_value is not None: - cls._global_half_value = half_value - - def __init__(self, float=None, double=None, half=None): - self._orig_float_value = self.__class__.value() - self._instance_float_value = float - self._orig_double_value = self.__class__.value() - self._instance_double_value = double - self._orig_half_value = self.__class__.value() - self._instance_half_value = half - - def __enter__( - self, - ): - self.__class__._set_value( - self._instance_float_value, - self._instance_double_value, - self._instance_half_value, - ) - - def __exit__(self, *args): - self.__class__._set_value( - self._orig_float_value, self._orig_double_value, self._orig_half_value - ) - return False - - -class cholesky_jitter(_dtype_value_context): - """ - The jitter value used by `psd_safe_cholesky` when using cholesky solves. - - Default for `float`: 1e-6 - - Default for `double`: 1e-8 - """ - - _global_float_value = 1e-6 # type: ignore[assignment] - _global_double_value = 1e-8 # type: ignore[assignment] - - @classmethod - def value(cls, dtype=None): - if dtype is None: - # Deprecated in 1.4: remove in 1.5 - warnings.warn( - "cholesky_jitter is now a _dtype_value_context and should be called with a dtype argument", - DeprecationWarning, - ) - return cls._global_float_value - return super().value(dtype=dtype) - - -class _feature_flag: - r"""Base class for feature flag settings with global scope. - The default is set via the `_default` class attribute. - """ - - _default = False - _state = None - - @classmethod - def is_default(cls): - return cls._state is None - - @classmethod - def on(cls): - if cls.is_default(): - return cls._default - return cls._state - - @classmethod - def off(cls): - return not cls.on() - - @classmethod - def _set_state(cls, state): - cls._state = state - - def __init__(self, state=True): - self.prev = self.__class__._state - self.state = state - - def __enter__(self): - self.__class__._set_state(self.state) - - def __exit__(self, *args): - self.__class__._set_state(self.prev) - return False - - -class verbose_linalg(_feature_flag): - """ - Print out information whenever running an expensive linear algebra routine (e.g. Cholesky, CG, Lanczos, CIQ, etc.) - (Default: False) - """ - - _default = False - - # Create a global logger - logger = logging.getLogger("LinAlg (Verbose)") - logger.setLevel(logging.DEBUG) - - # Output logging results to the stdout stream - ch = logging.StreamHandler() - ch.setLevel(logging.DEBUG) - formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s") - ch.setFormatter(formatter) - logger.addHandler(ch) - - -class cholesky_max_tries(_value_context): - """ - The max_tries value used by `psd_safe_cholesky` when using cholesky solves. - (Default: 3) - """ - - _global_value = 3 # type: ignore[assignment] - - -class NumericalWarning(RuntimeWarning): - """ - Warning thrown when convergence criteria are not met, or when comptuations require extra stability. 
- """ - - pass - - -class NanError(RuntimeError): - pass - - -class NotPSDError(RuntimeError): - pass - - -def _psd_safe_cholesky(A, out=None, jitter=None, max_tries=None): - # Maybe log - if verbose_linalg.on(): - verbose_linalg.logger.debug(f"Running Cholesky on a matrix of size {A.shape}.") - - if out is not None: - out = (out, torch.empty(A.shape[:-2], dtype=torch.int32, device=out.device)) - - L, info = torch.linalg.cholesky_ex(A, out=out) - if not torch.any(info): - return L - - isnan = torch.isnan(A) - if isnan.any(): - raise NanError( - f"cholesky_cpu: {isnan.sum().item()} of {A.numel()} elements of the {A.shape} tensor are NaN." - ) - - if jitter is None: - jitter = cholesky_jitter.value(A.dtype) - if max_tries is None: - max_tries = cholesky_max_tries.value() - Aprime = A.clone() - jitter_prev = 0 - for i in range(max_tries): - jitter_new = jitter * (10**i) - # add jitter only where needed - diag_add = ( - ((info > 0) * (jitter_new - jitter_prev)) - .unsqueeze(-1) - .expand(*Aprime.shape[:-1]) - ) - Aprime.diagonal(dim1=-1, dim2=-2).add_(diag_add) - jitter_prev = jitter_new - warnings.warn( - f"A not p.d., added jitter of {jitter_new:.1e} to the diagonal", - NumericalWarning, - ) - L, info = torch.linalg.cholesky_ex(Aprime, out=out) - if not torch.any(info): - return L - raise NotPSDError( - f"Matrix not positive definite after repeatedly adding jitter up to {jitter_new:.1e}." - ) - - -def psd_safe_cholesky(A, upper=False, out=None, jitter=None, max_tries=None): - """Compute the Cholesky decomposition of A. If A is only p.s.d, add a small jitter to the diagonal. - Args: - A (Tensor): - The tensor to compute the Cholesky decomposition of - upper (bool, optional): - See torch.cholesky - out (Tensor, optional): - See torch.cholesky - jitter (float, optional): - The jitter to add to the diagonal of A in case A is only p.s.d. If omitted, - uses settings.cholesky_jitter.value() - max_tries (int, optional): - Number of attempts (with successively increasing jitter) to make before raising an error. - """ - L = _psd_safe_cholesky(A, out=out, jitter=jitter, max_tries=max_tries) - if upper: - if out is not None: - out = out.transpose_(-1, -2) - else: - L = L.transpose(-1, -2) - return L - - -# Code for psd_safe_cholesky from gypytorch - - -class ComprehensiveGPHierarchy: - def __init__( - self, - graph_kernels: Iterable, - hp_kernels: Iterable, - likelihood: float = 1e-3, - weights=None, - learn_all_h=False, - graph_feature_ard=True, - d_graph_features: int = 0, - normalize_combined_kernel=True, - hierarchy_consider: list = None, # or a list of integers e.g. 
[0,1,2,3] - vectorial_features: list = None, - combined_kernel: str = "sum", - verbose: bool = False, - surrogate_model_fit_args: dict = None, - gpytorch_kinv: bool = False, - ): - self.likelihood = likelihood - self.surrogate_model_fit_args = surrogate_model_fit_args or {} - self.learn_all_h = learn_all_h - self.hierarchy_consider = hierarchy_consider - self.normalize_combined_kernel = normalize_combined_kernel - if self.hierarchy_consider is None: - self.learn_all_h = False - self.domain_kernels: list = [] - if bool(graph_kernels): - self.domain_kernels += list(graph_kernels) - if bool(hp_kernels): - self.domain_kernels += list(hp_kernels) - - self.hp_kernels = hp_kernels # impose on scalar graph features - self.n_kernels: int = len(self.domain_kernels) - self.n_graph_kernels: int = len( - [i for i in self.domain_kernels if isinstance(i, GraphKernels)] - ) - self.n_vector_kernels: int = self.n_kernels - self.n_graph_kernels - self.graph_feature_ard = graph_feature_ard - self.vectorial_features = vectorial_features - self.d_graph_features = d_graph_features - - if weights is not None: - self.fixed_weights = True - if weights is not None: - assert len(weights) == self.n_kernels, ( - "the weights vector, if supplied, needs to have the same length as " - "the number of kernel_operators!" - ) - self.init_weights = ( - weights - if isinstance(weights, torch.Tensor) - else torch.tensor(weights).flatten() - ) - else: - self.fixed_weights = False - # Initialise the domain kernel weights to uniform - self.init_weights = torch.tensor( - [1.0 / self.n_kernels] * self.n_kernels, - ) - - self.weights = self.init_weights.clone() - - if combined_kernel == "product": - self.combined_kernel = ProductKernel( - *self.domain_kernels, - weights=self.weights, - hierarchy_consider=self.hierarchy_consider, - d_graph_features=self.d_graph_features, - ) - elif combined_kernel == "sum": - self.combined_kernel = SumKernel( - *self.domain_kernels, - weights=self.weights, - hierarchy_consider=self.hierarchy_consider, - d_graph_features=self.d_graph_features, - ) - else: - raise NotImplementedError( - f'Combining kernel {combined_kernel} is not yet implemented! Only "sum" ' - f'or "product" are currently supported. ' - ) - # Verbose mode - self.verbose = verbose - # Cache the Gram matrix inverse and its log-determinant - self.K, self.K_i, self.logDetK = [None] * 3 - self.layer_weights = None - self.nlml = None - - self.x_configs: list = None # type: ignore[assignment] - self.y: torch.Tensor = None - self.y_: torch.Tensor = None - self.y_mean: torch.Tensor = None - self.y_std: torch.Tensor = None - self.n: int = None # type: ignore[assignment] - - self.gpytorch_kinv = gpytorch_kinv - - def _optimize_graph_kernels(self, h_: int, lengthscale_): - weights = self.init_weights.clone() - if self.hierarchy_consider is None: - graphs, _ = extract_configs_hierarchy( - self.x_configs, - d_graph_features=self.d_graph_features, - hierarchy_consider=self.hierarchy_consider, - ) - for i, k in enumerate(self.combined_kernel.kernels): - if not isinstance(k, GraphKernels): - continue - elif isinstance(k, WeisfilerLehman): - _grid_search_wl_kernel( - k, - h_, - [x[i] for x in graphs] - if isinstance(graphs[0], list) - else [c for c in graphs], - self.y, - self.likelihood, - lengthscales=lengthscale_, - gpytorch_kinv=self.gpytorch_kinv, - ) - else: - logging.warning( - "(Graph) kernel optimisation for " - + type(k).__name__ - + " not implemented yet." 
- ) - else: - if self.learn_all_h: - best_nlml = torch.tensor(np.inf) - best_subtree_depth_combo = None - best_K = None - train_y = self.y - h_combo_candidates = generate_h_combo_candidates(self.hierarchy_consider) - - for h_combo in h_combo_candidates: - for i, k in enumerate(self.combined_kernel.kernels): - if isinstance(k, WeisfilerLehman): - k.change_kernel_params({"h": h_combo[i]}) - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - layer_weights=None, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse( - K, self.likelihood, self.gpytorch_kinv - ) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - if nlml < best_nlml: - best_nlml = nlml - best_subtree_depth_combo = h_combo - best_K = torch.clone(K) - for i, k in enumerate(self.combined_kernel.kernels): - if isinstance(k, WeisfilerLehman): - k.change_kernel_params({"h": best_subtree_depth_combo[i]}) # type: ignore[index] - self.combined_kernel._gram = best_K - else: - best_nlml = torch.tensor(np.inf) - best_subtree_depth = None - best_K = None - train_y = self.y - - for h_i in list(h_): # type: ignore[call-overload] - # only optimize h in wl kernel - if isinstance(self.combined_kernel.kernels[0], WeisfilerLehman): - self.combined_kernel.kernels[0].change_kernel_params({"h": h_i}) - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - layer_weights=None, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse( - K, self.likelihood, self.gpytorch_kinv - ) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - if nlml < best_nlml: - best_nlml = nlml - best_subtree_depth = h_i - best_K = torch.clone(K) - if isinstance(self.combined_kernel.kernels[0], WeisfilerLehman): - self.combined_kernel.kernels[0].change_kernel_params( - {"h": best_subtree_depth} - ) - self.combined_kernel._gram = best_K - - def fit(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): - self._fit(train_x, train_y, **self.surrogate_model_fit_args) - - def _fit( - self, - train_x: Iterable, - train_y: Union[Iterable, torch.Tensor], - iters: int = 20, - optimizer: str = "adam", - wl_subtree_candidates: tuple = tuple(range(5)), - wl_lengthscales: tuple = tuple( - np.e**i - for i in range(-2, 3) # type: ignore[name-defined] - ), - optimize_lik: bool = True, - max_lik: float = 0.5, - optimize_wl_layer_weights: bool = False, - optimizer_kwargs: dict = None, - ): - # Called by self._fit - self._reset_XY(train_x, train_y) - - # Get the node weights, if needed - if optimizer_kwargs is None: - optimizer_kwargs = {"lr": 0.1} - if len(wl_subtree_candidates) > 0: - self._optimize_graph_kernels( - wl_subtree_candidates, # type: ignore[arg-type] - wl_lengthscales, - ) - - weights = self.init_weights.clone() - - if (not self.fixed_weights) and len(self.domain_kernels) > 1: - weights.requires_grad_(True) - - # set the prior values for the lengthscales of the two global features of the final architecture graph - if self.graph_feature_ard: - theta_vector = torch.log(torch.tensor([0.6, 0.6])) - else: - theta_vector = torch.log(torch.tensor([0.6])) - - # if use continuous graph properties and we set to use stationary kernels - if self.d_graph_features > 0 and len(self.hp_kernels) > 0: # type: ignore[arg-type] - # TODO modify the code on theta_vector betlow to be compatibale with HPO - # theta in this case are the lengthscales for the two global property of - # 
the final architecture graph - # theta_vector = get_theta_vector(vectorial_features=self.vectorial_features) - theta_vector.requires_grad_(True) - - # Whether to include the likelihood (jitter or noise variance) as a hyperparameter - likelihood = torch.tensor( - self.likelihood, - ) - if optimize_lik: - likelihood.requires_grad_(True) - - layer_weights = None - if optimize_wl_layer_weights: - for k in self.domain_kernels: - if isinstance(k, WeisfilerLehman): - layer_weights = torch.ones(k.h + 1).requires_grad_(True) - if layer_weights.shape[0] <= 1: - layer_weights = None - else: - break - - # Linking the optimizer variables to the sum kernel - optim_vars = [] - # if theta_vector is not None: # TODO used for HPO - # for a in theta_vector.values(): - # if a is not None and a.requires_grad: - # optim_vars.append(a) - # if we use graph features, we will optimize the corresponding stationary kernel lengthscales - if self.d_graph_features > 0 and theta_vector.requires_grad: - optim_vars.append(theta_vector) - - for a in [weights, likelihood, layer_weights]: - if a is not None and a.is_leaf and a.requires_grad: - optim_vars.append(a) - - nlml = None - if len(optim_vars) == 0: # Skip optimisation - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - feature_lengthscale=torch.exp(theta_vector), - layer_weights=layer_weights, - rebuild_model=True, - ) - K_i, logDetK = compute_pd_inverse(K, likelihood, self.gpytorch_kinv) - else: - # Select the optimizer - assert optimizer.lower() in ["adam", "sgd"] - if optimizer.lower() == "adam": - optim = torch.optim.Adam(optim_vars, **optimizer_kwargs) - else: - optim = torch.optim.SGD(optim_vars, **optimizer_kwargs) - - K = None - optim_vars_list = [] - nlml_list = [] - for i in range(iters): - optim.zero_grad() - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - feature_lengthscale=torch.exp(theta_vector), - layer_weights=layer_weights, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse(K, likelihood, self.gpytorch_kinv) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, self.y) - nlml.backward(create_graph=True) - if self.verbose and i % 10 == 0: - logger.info( - "Iteration:", - i, - "/", - iters, - "Negative log-marginal likelihood:", - nlml.item(), - theta_vector, - weights, - likelihood, - ) - optim.step() - - with torch.no_grad(): - likelihood.clamp_( - 1e-5, max_lik - ) if likelihood is not None and likelihood.is_leaf else None - - optim_vars_list.append( - [ - theta_vector.clone().detach(), - weights.clone().detach(), - likelihood.clone().detach(), - ] - ) - nlml_list.append(nlml.item()) - - optim.zero_grad(set_to_none=True) - - theta_vector, weights, likelihood = optim_vars_list[np.argmin(nlml_list)] - K = self.combined_kernel.fit_transform( - weights, - self.x_configs, - normalize=self.normalize_combined_kernel, - feature_lengthscale=torch.exp(theta_vector), - layer_weights=layer_weights, - rebuild_model=True, - save_gram_matrix=True, - ) - K_i, logDetK = compute_pd_inverse(K, likelihood, self.gpytorch_kinv) - - # Apply the optimal hyperparameters - # transform the weights in the combine_kernel function - self.weights = weights - self.K_i = K_i.clone() - self.K = K.clone() - self.logDetK = logDetK.clone() - self.likelihood = likelihood.item() - self.theta_vector = theta_vector - self.layer_weights = layer_weights - self.nlml = nlml.detach().cpu() if nlml is not None else None - - for 
k in self.combined_kernel.kernels: - if isinstance(k, Stationary): - k.update_lengthscales(lengthscale=torch.exp(theta_vector)) - - self.combined_kernel.weights = weights.clone() - - def predict(self, x_configs, preserve_comp_graph: bool = False): - """Kriging predictions""" - - if not isinstance(x_configs, list): - # Convert a single input X_s to a singleton list - x_configs = [x_configs] - - if self.K_i is None or self.logDetK is None: - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize " - "function to fit on the training data first!" - ) - - # Concatenate the full list - X_configs_all = self.x_configs + x_configs - - # Make a copy of the sum_kernels for this step, to avoid breaking the autodiff - # if grad guided mutation is used - if preserve_comp_graph: - combined_kernel_copy = deepcopy(self.combined_kernel) - else: - combined_kernel_copy = self.combined_kernel - - K_full = combined_kernel_copy.fit_transform( - self.weights, - X_configs_all, - layer_weights=self.layer_weights, - normalize=self.normalize_combined_kernel, - feature_lengthscale=torch.exp(self.theta_vector), - rebuild_model=True, - save_gram_matrix=False, - gp_fit=False, - ) - - K_s = K_full[: self.n :, self.n :] - - K_ss = K_full[self.n :, self.n :] + self.likelihood * torch.eye( - len(x_configs), - ) - - mu_s = K_s.t() @ self.K_i @ self.y - cov_s = K_ss - K_s.t() @ self.K_i @ K_s - # TODO not taking the diag? - cov_s = torch.clamp(cov_s, self.likelihood, np.inf) - mu_s = unnormalize_y(mu_s, self.y_mean, self.y_std) - std_s = torch.sqrt(cov_s) - std_s = unnormalize_y(std_s, None, self.y_std, True) - cov_s = std_s**2 - if preserve_comp_graph: - del combined_kernel_copy - return mu_s, cov_s - - @property - def x(self): - return self.x_configs - - def _reset_XY(self, train_x: Iterable, train_y: Union[Iterable, torch.Tensor]): - self.x_configs = train_x # type: ignore[assignment] - self.n = len(self.x_configs) - train_y_tensor = ( - train_y - if isinstance(train_y, torch.Tensor) - else torch.tensor(train_y, dtype=torch.get_default_dtype()) - ) - self.y_ = train_y_tensor - self.y, self.y_mean, self.y_std = normalize_y(train_y_tensor) - # The Gram matrix of the training data - self.K_i, self.logDetK = None, None - - -def get_grad(grad_matrix, feature_matrix, average_occurrences=False): - r""" - Average across the samples via a Monte Carlo sampling scheme. Also estimates the - empirical variance. :param average_occurrences: if True, do a weighted summation - based on the frequency distribution of the occurrence to compute a gradient *per - each feature*. Otherwise, each different occurrence (\phi_i = k) will get a - different gradient estimate. 
- """ - assert grad_matrix.shape == feature_matrix.shape - # Prune out the all-zero columns that pop up sometimes - valid_cols = [] - for col_idx in range(feature_matrix.size(1)): - if not torch.all(feature_matrix[:, col_idx] == 0): - valid_cols.append(col_idx) - feature_matrix = feature_matrix[:, valid_cols] - grad_matrix = grad_matrix[:, valid_cols] - - _, D = feature_matrix.shape - if average_occurrences: - avg_grad = torch.zeros(D) - avg_grad_var = torch.zeros(D) - for d in range(D): - current_feature = feature_matrix[:, d].clone().detach() - instances, indices, counts = torch.unique( - current_feature, return_inverse=True, return_counts=True - ) - weight_vector = torch.tensor([counts[i] for i in indices]).type(torch.float) - weight_vector /= weight_vector.sum() - mean = torch.sum(weight_vector * grad_matrix[:, d]) - # Compute the empirical variance of gradients - variance = torch.sum(weight_vector * grad_matrix[:, d] ** 2) - mean**2 - avg_grad[d] = mean - avg_grad_var[d] = variance - return avg_grad, avg_grad_var, feature_matrix.sum(dim=0) - else: - # The maximum number possible occurrences -- 7 is an example, if problem occurs, maybe we can increase this - # number. But for now, for both NAS-Bench datasets, this should be more than enough! - max_occur = 7 - avg_grad = torch.zeros(D, max_occur) - avg_grad_var = torch.zeros(D, max_occur) - incidences = torch.zeros(D, max_occur) - for d in range(D): - current_feature = feature_matrix[:, d].clone().detach() - instances, indices, counts = torch.unique( - current_feature, return_inverse=True, return_counts=True - ) - for i, val in enumerate(instances): - # Find index of all feature counts that are equal to the current val - feature_at_val = grad_matrix[current_feature == val] - avg_grad[d, int(val)] = torch.mean(feature_at_val) - avg_grad_var[d, int(val)] = torch.var(feature_at_val) - incidences[d, int(val)] = counts[i] - return avg_grad, avg_grad_var, incidences - - -# Optimize Graph kernel -def getBack(var_grad_fn, logger): - logger.debug(var_grad_fn) - for n in var_grad_fn.next_functions: - if n[0]: - try: - tensor = getattr(n[0], "variable") - logger.debug(n[0]) - logger.debug(f"Tensor with grad found: {tensor}") - logger.debug(f" - gradient: {tensor.grad}") - except AttributeError: - getBack(n[0], logger) - - -def _grid_search_wl_kernel( - k: WeisfilerLehman, - subtree_candidates, - train_x: list, - train_y: torch.Tensor, - lik: float, - subtree_prior=None, - lengthscales=None, - lengthscales_prior=None, - gpytorch_kinv: bool = False, -): - """Optimize the *discrete hyperparameters* of Weisfeiler Lehman kernel. 
- k: a Weisfeiler-Lehman kernel instance - hyperparameter_candidate: list of candidate hyperparameter to try - train_x: the train data - train_y: the train label - lik: likelihood - lengthscale: if using RBF kernel for successive embedding, the list of lengthscale to be grid searched over - """ - # lik = 1e-6 - assert len(train_x) == len(train_y) - best_nlml = torch.tensor(np.inf) - best_subtree_depth = None - best_lengthscale = None - best_K = None - if lengthscales is not None and k.se is not None: - candidates = [(h_, l_) for h_ in subtree_candidates for l_ in lengthscales] - else: - candidates = [(h_, None) for h_ in subtree_candidates] - - for i in candidates: - if k.se is not None: - k.change_se_params({"lengthscale": i[1]}) - k.change_kernel_params({"h": i[0]}) - K = k.fit_transform(train_x, rebuild_model=True, save_gram_matrix=True) - K_i, logDetK = compute_pd_inverse(K, lik, gpytorch_kinv) - nlml = -compute_log_marginal_likelihood(K_i, logDetK, train_y) - if nlml < best_nlml: - best_nlml = nlml - best_subtree_depth, best_lengthscale = i - best_K = torch.clone(K) - k.change_kernel_params({"h": best_subtree_depth}) - if k.se is not None: - k.change_se_params({"lengthscale": best_lengthscale}) - k.gram_ = best_K - - -def get_theta_vector(vectorial_features): - if vectorial_features is None: - return None - theta_vector = {} - for key, dim in vectorial_features.items(): - t = torch.ones(dim) - if t.shape[0] > 1: - t.requires_grad_(True) - theta_vector[key] = t - return theta_vector - - -def normalize_y(y: torch.Tensor): - y_mean = torch.mean(y) if isinstance(y, torch.Tensor) else np.mean(y) - y_std = torch.std(y) if isinstance(y, torch.Tensor) else np.std(y) - if y_std == 0: - y_std = 1 - y = (y - y_mean) / y_std - return y, y_mean, y_std - - -def unnormalize_y(y, y_mean, y_std, scale_std=False): - """Similar to the undoing of the pre-processing step above, but on the output predictions""" - if not scale_std: - y = y * y_std + y_mean - else: - y *= y_std - return y - - -def standardize_x( - x: torch.Tensor, x_min: torch.Tensor = None, x_max: torch.Tensor = None -): - """Standardize the vectorial input into a d-dimensional hypercube [0, 1]^d, where d is the number of features. - if x_min ond x_max are supplied, x2 will be standardised using these instead. This is used when standardising the - validation/test inputs. - """ - if (x_min is not None and x_max is None) or (x_min is None and x_max is not None): - raise ValueError( - "Either *both* or *neither* of x_min, x_max need to be supplied!" - ) - if x_min is None: - x_min = torch.min(x, 0)[0] - x_max = torch.max(x, 0)[0] - x = (x - x_min) / (x_max - x_min) - return x, x_min, x_max - - -def compute_log_marginal_likelihood( - K_i: torch.Tensor, - logDetK: torch.Tensor, - y: torch.Tensor, - normalize: bool = True, - log_prior_dist=None, -): - """Compute the zero mean Gaussian process log marginal likelihood given the inverse of Gram matrix K(x2,x2), its - log determinant, and the training label vector y. - Option: - - normalize: normalize the log marginal likelihood by the length of the label vector, as per the gpytorch - routine. - - prior: A pytorch distribution object. 
If specified, the hyperparameter prior will be taken into consideration and - we use Type-II MAP instead of Type-II MLE (compute log_posterior instead of log_evidence) - """ - lml = ( - -0.5 * y.t() @ K_i @ y - + 0.5 * logDetK - - y.shape[0] - / 2.0 - * torch.log( - 2 - * torch.tensor( - np.pi, - ) - ) - ) - if log_prior_dist is not None: - lml -= log_prior_dist - return lml / y.shape[0] if normalize else lml - - -def generate_h_combo_candidates(hierarchy_consider): - h_range_all_hierarchy = [range(min(hier + 2, 4)) for hier in hierarchy_consider] - h_range_all_hierarchy = [range(5)] + h_range_all_hierarchy - h_combo_all = list(itertools.product(*h_range_all_hierarchy)) - h_combo_sub = [] - for h_combo in h_combo_all: - sorted_h_combo = sorted(h_combo) - if sorted_h_combo not in h_combo_sub: - h_combo_sub.append(sorted_h_combo) - return h_combo_sub - - -def compute_pd_inverse( - K: torch.tensor, jitter: float = 1e-5, gpytorch_kinv: bool = False -): - """Compute the inverse of a postive-(semi)definite matrix K using Cholesky inversion.""" - if gpytorch_kinv: - Kc = psd_safe_cholesky(K) - try: - Kc.required_grad = True - except Exception: - Kc = torch.Tensor(Kc) - else: - n = K.shape[0] - assert ( - isinstance(jitter, float) or jitter.ndim == 0 - ), "only homoscedastic noise variance is allowed here!" - is_successful = False - fail_count = 0 - max_fail = 3 - while fail_count < max_fail and not is_successful: - try: - jitter_diag = jitter * torch.eye(n, device=K.device) * 10**fail_count - K_ = K + jitter_diag - Kc = torch.linalg.cholesky(K_) - is_successful = True - except RuntimeError: - fail_count += 1 - if not is_successful: - raise RuntimeError( - f"Gram matrix not positive definite despite of jitter:\n{K}" - ) - - logDetK = -2 * torch.sum(torch.log(torch.diag(Kc))) - K_i = torch.cholesky_inverse(Kc) - return K_i.to(torch.get_default_dtype()), logDetK.to(torch.get_default_dtype()) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 2002aeab..c5c47332 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -3,8 +3,9 @@ import random from typing import Any, TYPE_CHECKING, Literal from typing_extensions import override +from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP -from neps.state.optimizer import BudgetInfo, OptimizationState +from neps.state.optimizer import BudgetInfo from neps.utils.types import ConfigResult, RawConfig from neps.utils.common import instance_from_map from neps.search_spaces import ( @@ -25,7 +26,6 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping if TYPE_CHECKING: @@ -51,11 +51,6 @@ def __init__( pipeline_space: SearchSpace, initial_design_size: int = 10, surrogate_model: str | Any = "gp", - surrogate_model_args: dict = None, - optimal_assignment: bool = False, - domain_se_kernel: str = None, - graph_kernels: list = None, - hp_kernels: list = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "mutation", @@ -77,12 +72,6 @@ def __init__( initial_design_size: Number of 'x' samples that need to be evaluated before selecting a sample using a strategy instead of randomly. 
surrogate_model: Surrogate model - surrogate_model_args: Arguments that will be given to the surrogate model - (the Gaussian processes model). - optimal_assignment: whether the optimal assignment kernel should be used. - domain_se_kernel: Stationary kernel name - graph_kernels: Kernels for NAS - hp_kernels: Kernels for HPO acquisition: Acquisition strategy log_prior_weighted: if to use log for prior acquisition_sampler: Acquisition function fetching strategy @@ -141,36 +130,21 @@ def __init__( self._model_update_failed: bool = False self.sample_default_first = sample_default_first - surrogate_model_args = surrogate_model_args or {} - graph_kernels, hp_kernels = get_default_kernels( - self.pipeline_space, - domain_se_kernel, - graph_kernels, - hp_kernels, - optimal_assignment, - ) - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = graph_kernels - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - - if ( - not surrogate_model_args["graph_kernels"] - and not surrogate_model_args["hp_kernels"] - ): - raise ValueError("No kernels are provided!") - - if "vectorial_features" not in surrogate_model_args: - surrogate_model_args["vectorial_features"] = ( - self.pipeline_space.get_vectorial_dim() - ) - - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) + if isinstance(surrogate_model, str): + if surrogate_model == "gp": + self.surrogate_model = ComprehensiveGP.get_default( + space=pipeline_space, + include_fidelities=False, + ) + else: + self.surrogate_model = instance_from_map( + SurrogateModelMapping, + surrogate_model, + name="surrogate model", + kwargs=surrogate_model_args, + ) + else: + self.surrogate_model = surrogate_model self.acquisition = instance_from_map( AcquisitionMapping, diff --git a/neps/optimizers/multi_fidelity/dyhpo.py b/neps/optimizers/multi_fidelity/dyhpo.py index bb4879b9..1a063e39 100755 --- a/neps/optimizers/multi_fidelity/dyhpo.py +++ b/neps/optimizers/multi_fidelity/dyhpo.py @@ -20,7 +20,6 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_default_kernels from neps.optimizers.multi_fidelity.mf_bo import FreezeThawModel, PFNSurrogate from neps.optimizers.multi_fidelity.utils import MFObservedData diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index fceb44e9..8626f7ab 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -22,7 +22,6 @@ from ..bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) -from ..bayesian_optimization.kernels.get_kernels import get_default_kernels from ..bayesian_optimization.models import SurrogateModelMapping from ..multi_fidelity_prior.utils import ( compute_config_dist, @@ -269,9 +268,6 @@ def __init__( self, pipeline_space: SearchSpace, surrogate_model: str | Any = "gp", - domain_se_kernel: str = None, - graph_kernels: list = None, - hp_kernels: list = None, surrogate_model_args: dict = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, @@ -282,25 +278,6 @@ def __init__( super().__init__(pipeline_space=pipeline_space, logger=logger) surrogate_model_args = surrogate_model_args or {} - - graph_kernels, hp_kernels = 
get_default_kernels( - pipeline_space=pipeline_space, - domain_se_kernel=domain_se_kernel, - graph_kernels=graph_kernels, - hp_kernels=hp_kernels, - optimal_assignment=False, - ) - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = None - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - if not surrogate_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - if "vectorial_features" not in surrogate_model_args: - surrogate_model_args["vectorial_features"] = ( - pipeline_space.get_vectorial_dim() - ) - self.surrogate_model = instance_from_map( SurrogateModelMapping, surrogate_model, diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 1ab1f92a..592cfa88 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,132 +1,360 @@ from __future__ import annotations -from collections.abc import Sized - -from dataclasses import dataclass +from dataclasses import dataclass, field from grakel.utils import graph_from_networkx -from typing import Sequence, Iterable, TypeAlias -from typing_extensions import Self -from more_itertools import split_when +from typing import Any, TypeAlias, TypeVar, Generic +from typing_extensions import Self, override, Self from itertools import chain import torch +from neps.search_spaces import ( + CategoricalParameter, + IntegerParameter, + FloatParameter, +) -from neps.search_spaces.search_space import SearchSpace +from neps.search_spaces.search_space import SearchSpace, Parameter WLInput: TypeAlias = tuple[dict, dict | None, dict | None] @dataclass -class TensorEncodedConfigs(Sized): - _tensor_pack: torch.Tensor | None - """Layout such that _tensor_pack[0] is the first config. - - In the case that there are no numeric/categorical hyperparameters, - this is None. - - index config_row_id | fidelities... | numericals... | one_hot_categoricals... - 0 - 1 - 2 - ... - - NOTE: A slight memory innefficiency here is that we store the one-hot encoded - as a float tensor, rather than a byte tensor. This makes joint numerical/categorical - kernels more efficient, as well as entire config row access at the cost of memory. - This should not be a problem if we do not have a large number of categorical - hyperparameters with a high number of choices. - """ - _graphs: dict[str, Sequence[WLInput]] - _col_lookup: dict[str, tuple[int, int]] # range(inclusive, exclusive) - - def __len__(self) -> int: - return self._tensor_pack.shape[0] if self._tensor_pack is not None else 0 - - def wl_graph_input(self, hp: str) -> Sequence[WLInput]: - return self._graphs[hp] - - def tensor(self, hps: Iterable[str]) -> torch.Tensor: - if self._tensor_pack is None: - raise ValueError("No numerical/categorical hyperparameters were encoded.") - - cols: list[tuple[int, int]] = [] - for hp in hps: - _cols = self._col_lookup.get(hp) - if _cols is None: - raise ValueError(f"Hyperparameter {hp} not found in the lookup table.") - cols.append(_cols) - - # OPTIM: This code with `split_when` and `chunks` makes sure to grab - # consecutive chunks of memory where possible. For example, - # if we want all categoricals, this will just return the entire - # categorical tensor, rather than subselecting each part and then concatenating. - # Also works for numericals. 
- sorted_indices = sorted(cols) - non_consecutive_tuple = lambda x, y: x[1] != y[0] - chunks = list(split_when(sorted_indices, non_consecutive_tuple)) - slices = [slice(chunk[0][0], chunk[-1][1]) for chunk in chunks] - tensors = [self._tensor_pack[:, s] for s in slices] - - if len(tensors) == 1: - return tensors[0].clone() - - return torch.cat(tensors, dim=1) +class GraphEncoder: + hps: tuple[str] - @classmethod def encode( - cls, + self, + x: list[dict[str, Any]], space: SearchSpace, - configs: list[SearchSpace], - *, - node_label: str = "op_name", - device: torch.device, - ) -> Self: - assert node_label == "op_name", "Only 'op_name' is supported for node_label" + ) -> dict[str, list[WLInput]]: + return {hp: [config[hp].value for config in x] for hp in self.hps} + + +T = TypeVar("T") + + +@dataclass +class Transformer(Generic[T]): + hps: tuple[str] + + def encode(self, x: list[dict[str, Any]], space: SearchSpace) -> T: ... + + def value_decode(self, x: T, space: SearchSpace) -> dict[str, list[Any]]: ... + + def decode(self, x: T, space: SearchSpace) -> list[dict[str, Any]]: + values = self.value_decode(x, space) + return [(dict(zip(values, t))) for t in zip(*values.values())] + - _graphs: dict[str, Sequence[WLInput]] = {} +@dataclass +class WLInputTransformer(Transformer[WLInput]): + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + ) -> dict[str, list[WLInput]]: + _graphs: dict[str, list[WLInput]] = {} for hp_name in space.graphs.keys(): - gs = [conf.graphs[hp_name].value for conf in configs] - if ( - len(gs) > 0 - and isinstance(gs[0], list) - and len(gs[0]) > 0 - and isinstance(gs[0][0], list) - ): - gs = [_list for list_of_list in gs for _list in list_of_list] + gs = [conf[hp_name].value for conf in x] _graphs[hp_name] = graph_from_networkx(gs) # type: ignore - _lookup: dict[str, tuple[int, int]] = {} + return _graphs - n_fids = len(space.fidelities) - n_nums = len(space.numerical) - n_cats = sum(len(hp.choices) for hp in space.categoricals.values()) + def value_decode( + self, + x: dict[str, list[WLInput]], + space: SearchSpace, + ) -> dict[str, list[Any]]: + raise NotImplementedError("Cannot decode WLInput to values.") - width = n_fids + n_nums + n_cats - if width == 0: - return cls(_graphs=_graphs, _tensor_pack=None, _col_lookup={}) - _tensor_pack = torch.empty(size=(len(configs), width), dtype=torch.float64) +@dataclass +class TensorTransformer(Transformer[torch.Tensor]): + def output_cols(self, space: SearchSpace) -> int: ... 
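These Transformer classes define the encode/decode contract that the concrete transformers further down in this file (OneHotEncoder, MinMaxNormalizer, JointTransformer) implement. A rough usage sketch of the intended API; the SearchSpace construction and the sample() call are assumptions about the surrounding library, and this encoding module is still being reworked later in the patch series:

import torch

from neps.search_spaces import CategoricalParameter, FloatParameter
from neps.search_spaces.encoding import JointTransformer, MinMaxNormalizer, OneHotEncoder
from neps.search_spaces.search_space import SearchSpace

# Hypothetical two-hyperparameter space.
space = SearchSpace(
    lr=FloatParameter(lower=1e-4, upper=1e-1, log=True),
    opt=CategoricalParameter(choices=["adam", "sgd"]),
)

# One-hot encode the categorical, min-max normalise the float, side by side.
transformer = JointTransformer.join(
    OneHotEncoder(("opt",)),
    MinMaxNormalizer(("lr",)),
)

configs = [space.sample() for _ in range(8)]                 # sampled config-like objects
X = transformer.encode(configs, space, dtype=torch.float64)  # tensor of shape (8, 3)
values = transformer.decode(X, space)                        # list of {"opt": ..., "lr": ...}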
- offset = 0 - for hp_name in chain(space.fidelities, space.numerical): - _lookup[hp_name] = (offset, offset + 1) - _xs = [config.fidelities[hp_name].normalized_value for config in configs] - values = torch.tensor(_xs, torch.float64, device=device) + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ) -> torch.Tensor: + width = len(self.hps) + buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, CategoricalParameter) + values = torch.tensor( + [config[name]._value_index for config in x], dtype=dtype, device=device + ) + + return buffer + + +@dataclass +class IntegerCategoricalTransformer(TensorTransformer): + def output_cols(self, space: SearchSpace) -> int: + return len(self.hps) + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if dtype is None: + dtype = torch.int + + buffer = torch.empty(size=(len(x), len(self.hps)), dtype=dtype, device=device) + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, CategoricalParameter) + values = torch.tensor( + [config[name].value for config in x], dtype=dtype, device=device + ) + buffer[:, i] = values + + return buffer + + @override + def value_decode(self, x: torch.Tensor, space: SearchSpace) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, CategoricalParameter) + enc = x[:, i] + values[name] = [hp.choices[i] for i in enc.tolist()] + + return values + + +@dataclass +class MinMaxNormalizer(TensorTransformer): + def output_cols(self, space: SearchSpace) -> int: + return len(self.hps) + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if dtype is None: + dtype = torch.float64 + + width = len(self.hps) + buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, (FloatParameter, IntegerParameter)) + values = torch.tensor( + [config[name].value for config in x], dtype=dtype, device=device + ) + if hp.log_bounds: + lower, upper = hp.log_bounds + buffer[:, i] = (torch.log(values) - lower) / (upper - lower) + else: + lower, upper = hp.lower, hp.upper + buffer[:, i] = (values - lower) / (upper - lower) + + return buffer + + @override + def value_decode( + self, + x: torch.Tensor, + space: SearchSpace, + ) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, (FloatParameter, IntegerParameter)) + enc = x[:, i] + if hp.log_bounds: + lower, upper = hp.log_bounds + enc = torch.exp(enc * (upper - lower) + lower) + else: + lower, upper = hp.lower, hp.upper + enc = enc * (upper - lower) + lower - _tensor_pack[:, offset] = values + if isinstance(hp, IntegerParameter): + enc = torch.round(enc).to(torch.int) - offset += 1 + values[name] = enc.tolist() - for hp_name, cat in space.categoricals.items(): - n_choices = len(cat.choices) - _lookup[hp_name] = (offset, offset + n_choices) + return values - # .. 
and insert one-hot encoding (ChatGPT solution, verified locally) - _xs = [config[hp_name].normalized_value for config in configs] - cat_tensor = torch.tensor(_xs, torch.float64, device=device).unsqueeze(1) - _tensor_pack[:, offset : offset + n_choices].scatter_(1, cat_tensor, 1) +@dataclass +class StandardNormalizer(TensorTransformer): + std_means: dict[str, tuple[float, float]] = field(default_factory=dict) + + def output_cols(self, space: SearchSpace) -> int: + return len(self.hps) + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if dtype is None: + dtype = torch.float64 + + width = len(self.hps) + buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) + std_means: dict[str, tuple[float, float]] = {} + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, (FloatParameter, IntegerParameter)) + values = torch.tensor( + [config[name].value for config in x], dtype=dtype, device=device + ) + if hp.log_bounds: + values = torch.log(values) + + mean, std = values.mean(), values.std() + std_means[name] = (mean.item(), std.item()) + + buffer[:, i] = (values - mean) / std + + self.std_means = std_means + return buffer + + @override + def value_decode(self, x: torch.Tensor, space: SearchSpace) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + + for i, name in enumerate(self.hps): + hp = space[name] + assert isinstance(hp, Parameter) + enc = x[:, i] + if isinstance(hp, (FloatParameter, IntegerParameter)): + std, mean = self.std_means[name] + if hp.log_bounds: + enc = torch.exp(enc * std + mean) + else: + enc = enc * std + mean + + if isinstance(hp, IntegerParameter): + enc = torch.round(enc).to(torch.int) + + values[name] = enc.tolist() + else: + raise ValueError(f"Invalid hyperparameter type: {type(hp)}") + + return values + + +@dataclass +class OneHotEncoder(TensorTransformer): + def output_cols(self, space: SearchSpace) -> int: + return sum(len(hp.choices) for hp in (space[name] for name in self.hps)) # type: ignore + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if dtype is None: + dtype = torch.bool + + categoricals: dict[str, CategoricalParameter] = {} + for name in self.hps: + hp = space[name] + assert isinstance(hp, CategoricalParameter) + categoricals[name] = hp + width = sum(len(hp.choices) for hp in categoricals.values()) + buffer = torch.zeros(size=(len(x), width), dtype=dtype, device=device) + + offset = 0 + for name, hp in categoricals.items(): + n_choices = len(hp.choices) + _xs = [config[name]._value_index for config in x] + cat_tensor = torch.tensor(_xs, dtype=torch.int64, device=device).unsqueeze(1) + buffer[:, offset : offset + n_choices].scatter_(1, cat_tensor, 1) + offset += n_choices + + return buffer + + @override + def value_decode( + self, + x: torch.Tensor, + space: SearchSpace, + ) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + + offset = 0 + for name in self.hps: + hp = space[name] + assert isinstance(hp, CategoricalParameter) + n_choices = len(hp.choices) + enc = x[:, offset : offset + n_choices].argmax(dim=1) + + values[name] = [hp.choices[i] for i in enc] offset += n_choices - return cls(_graphs=_graphs, _tensor_pack=_tensor_pack, _col_lookup=_lookup) + return values + + +@dataclass +class JointTransformer(TensorTransformer): + 
transforms: tuple[TensorTransformer, ...] + + def output_cols(self, space: SearchSpace) -> int: + return sum(t.output_cols(space) for t in self.transforms) + + @classmethod + def join(cls, *transforms: TensorTransformer) -> Self: + hps = tuple(chain.from_iterable(t.hps for t in transforms)) + return cls(hps, transforms) + + @override + def encode( + self, + x: list[dict[str, Any]], + space: SearchSpace, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + return torch.cat( + [t.encode(x, space, dtype=dtype, device=device) for t in self.transforms], + dim=1, + ) + + @override + def value_decode( + self, + x: torch.Tensor, + space: SearchSpace, + ) -> dict[str, list[Any]]: + values: dict[str, list[Any]] = {} + offset = 0 + for t in self.transforms: + width = t.output_cols(space) + t_values = t.value_decode(x[:, offset : offset + width], space) + values.update(t_values) + offset += width + + return values diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index 164b49cb..2a20399d 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -26,5 +26,5 @@ def run_pipeline(float1, float2, categorical, integer1, integer2): pipeline_space=pipeline_space, root_directory="results/hyperparameters_example", post_run_summary=True, - max_evaluations_total=15, + max_evaluations_total=50, ) diff --git a/pyproject.toml b/pyproject.toml index 06b4baa4..b5be06c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,7 +103,6 @@ src = ["neps"] # TODO(eddiebergman): Include more of these as we go on in migration exclude = [ - "neps/optimizers/**/*.py", "neps/search_spaces/architecture/**/*.py", "neps/search_spaces/yaml_search_space_utils.py", "neps/utils/run_args_from_yaml.py", From 27b31196ace834750e26c05e87fdd2da4efbfbe8 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 20 Aug 2024 14:24:46 +0200 Subject: [PATCH 10/63] checkpoint: not working yet --- .../acquisition_functions/aq_functions.py | 88 +++ .../acquisition_functions/ei.py | 20 +- .../acquisition_sampler_2/__init__.py | 0 .../acquisition_sampler_2/aq_samplers.py | 22 + .../acquisition_sampler_2/mutation_sampler.py | 163 +++++ .../acquisition_sampler_2/random_sampler.py | 15 + .../acquisition_samplers/mutation_sampler.py | 38 +- .../grakel_replace/vertex_histogram.py | 64 +- .../grakel_replace/weisfeiler_lehman.py | 294 ++++----- .../bayesian_optimization/kernels/kernel.py | 18 +- .../kernels/vectorial_kernels.py | 39 +- .../kernels/weisfilerlehman.py | 40 +- .../bayesian_optimization/models/gp.py | 143 ++--- .../bayesian_optimization/optimizer.py | 222 +++---- neps/search_spaces/distributions/__init__.py | 16 + .../distributions/distribution.py | 21 + neps/search_spaces/distributions/truncnorm.py | 112 ++++ .../distributions/uniform_float.py | 47 ++ .../distributions/uniform_int.py | 46 ++ .../distributions/weighted_ints.py | 91 +++ neps/search_spaces/domain.py | 316 +++++++++ neps/search_spaces/encoding.py | 604 ++++++++++-------- neps/search_spaces/neighborhoods.py | 281 ++++++++ neps/search_spaces/samplers/__init__.py | 9 + neps/search_spaces/samplers/model.py | 186 ++++++ neps/search_spaces/samplers/prior.py | 110 ++++ neps/search_spaces/samplers/sampler.py | 22 + neps/search_spaces/samplers/uniform.py | 79 +++ .../samplers/weighted_sampler.py | 51 ++ neps/state/__init__.py | 4 + neps/state/optimizer.py | 1 - neps/state/trial.py | 6 +- neps/utils/types.py | 2 +- 33 files changed, 2434 
insertions(+), 736 deletions(-) create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py create mode 100644 neps/search_spaces/distributions/__init__.py create mode 100644 neps/search_spaces/distributions/distribution.py create mode 100644 neps/search_spaces/distributions/truncnorm.py create mode 100644 neps/search_spaces/distributions/uniform_float.py create mode 100644 neps/search_spaces/distributions/uniform_int.py create mode 100644 neps/search_spaces/distributions/weighted_ints.py create mode 100644 neps/search_spaces/domain.py create mode 100644 neps/search_spaces/neighborhoods.py create mode 100644 neps/search_spaces/samplers/__init__.py create mode 100644 neps/search_spaces/samplers/model.py create mode 100644 neps/search_spaces/samplers/prior.py create mode 100644 neps/search_spaces/samplers/sampler.py create mode 100644 neps/search_spaces/samplers/uniform.py create mode 100644 neps/search_spaces/samplers/weighted_sampler.py diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py b/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py new file mode 100644 index 00000000..70b6b4e6 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import math + +import torch + + +def ei( + mu: torch.Tensor, + cov: torch.Tensor, + optimum: float | torch.Tensor, + *, + augmented_ei_regularizer: float | None = None, # 0.01 + xi: float = 0.0, + log_ei: bool = False, + log_ei_epsilon: float = 1e-6, +) -> torch.Tensor: + improvement = optimum - mu - xi + + sigma_sq = torch.diag(cov) + sigma = torch.sqrt(sigma_sq) + + Z = improvement / sigma + + # If we calculate it ourselves, we save some computation as mu = 0 + # and sigma = 1 cancel a few terms out + # https://en.wikipedia.org/wiki/Normal_distribution + Z_cdf = 0.5 * (1 + torch.erf(Z / math.sqrt(2))) + Z_pdf = 1 / (math.sqrt(2 * math.pi)) * torch.exp(-0.5 * Z**2) + ei = improvement * Z_cdf + sigma * Z_pdf + + if augmented_ei_regularizer is not None: + regularization_term = 1 + sigma_sq / augmented_ei_regularizer + ei = ei / regularization_term + + if log_ei: + ei = torch.log(ei + log_ei_epsilon) + + return ei + + +def acq_by_confidence( + mu: torch.Tensor, + cov: torch.Tensor, + *, + confidence_scale: float = 1.0, +) -> torch.Tensor: + # Assumes we are trying to minimize our objective but + # this acquisition function will be maximized, i.e. optimize + # this function to find the point which is most likely to be + # the minimum of the objective. + + # **** + # * / \** + # ***** / \- **** + # * / \ *** + # * / \ | * *** + # ---/ \ | +** + # -/ \ | / \ + # \|/ --- + # - <- lcb = mu - c * sigma + # ______________________________ + lcb = mu - confidence_scale * torch.sqrt(torch.diag(cov)) + + return -lcb # Negate to make maximization + + +def weight_by_cost( + acquisition_scores: torch.Tensor, +) -> torch.Tensor: + # Assumes we are trying to minimize our objective but + # this acquisition function will be maximized, i.e. 
optimize + # this function to find the point which is most likely to be + # the minimum of the objective. + + # **** + # * / \** + # ***** / \- **** + # * / \ *** + # * / \ | * *** + # ---/ \ | +** + # -/ \ | / \ + # \|/ --- + # - <- lcb = mu - c * sigma + # ______________________________ + lcb = mu - cost_scale * torch.sqrt(torch.diag(cov)) + + return -lcb # Negate to make maximization diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py index ba5eb38b..cc13cc8e 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py @@ -1,15 +1,18 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence, Union -import numpy as np +from typing import TYPE_CHECKING, Sequence + import torch from torch.distributions import Normal from .base_acquisition import BaseAcquisition if TYPE_CHECKING: + import numpy as np + from neps.search_spaces import SearchSpace + class ComprehensiveExpectedImprovement(BaseAcquisition): def __init__( self, @@ -51,11 +54,11 @@ def __init__( self.optimize_on_max_fidelity = optimize_on_max_fidelity def eval( - self, x: Sequence[SearchSpace], asscalar: bool = False, - ) -> Union[np.ndarray, torch.Tensor, float]: - """ - Return the negative expected improvement at the query point x2 - """ + self, + x: Sequence[SearchSpace], + asscalar: bool = False, + ) -> np.ndarray | torch.Tensor | float: + """Return the negative expected improvement at the query point x2.""" assert self.incumbent is not None, "EI function not fitted on model" if x[0].has_fidelity and self.optimize_on_max_fidelity: @@ -70,6 +73,7 @@ def eval( except ValueError as e: raise e # return -1.0 # in case of error. return ei of -1 + std = torch.sqrt(torch.diag(cov)) mu_star = self.incumbent gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device)) @@ -103,11 +107,9 @@ def set_state(self, surrogate_model, **kwargs): # Compute incumbent if self.in_fill == "best": - # return torch.max(surrogate_model.y_) self.incumbent = torch.min(self.surrogate_model.y_) else: x = self.surrogate_model.x mu_train, _ = self.surrogate_model.predict(x) - # incumbent_idx = torch.argmax(mu_train) incumbent_idx = torch.argmin(mu_train) self.incumbent = self.surrogate_model.y_[incumbent_idx] diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py new file mode 100644 index 00000000..f799252b --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import torch + + from neps.search_spaces import SearchSpace + + +def random_sample(search_space: SearchSpace, *, seed: torch.Generator) -> SearchSpace: + """Sample a random value from a search space. + + Args: + search_space: The search space to sample from. + user_priors: Whether to sample from user priors. + seed: The seed to use for sampling. + + Returns: + A search space with a sampled value. 
+ """ + return search_space.sample_value(user_priors=user_priors) diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py new file mode 100644 index 00000000..972ad6c3 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable, Sequence + +import numpy as np +import torch +from more_itertools import first +from typing_extensions import override + +from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, +) +from neps.optimizers.bayesian_optimization.acquisition_samplers.random_sampler import ( + RandomSampler, +) + +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + + +def _propose_location( + acquisition_function: Callable, + candidates: list[SearchSpace], + top_n: int = 5, + return_distinct: bool = True, +) -> tuple[list[SearchSpace], np.ndarray | torch.Tensor, np.ndarray]: + """top_n: return the top n candidates wrt the acquisition function.""" + if return_distinct: + eis = acquisition_function(candidates, asscalar=True) # faster + eis_, unique_idx = np.unique(eis, return_index=True) + try: + i = np.argpartition(eis_, -top_n)[-top_n:] + indices = np.array([unique_idx[j] for j in i]) + except ValueError: + eis = torch.tensor([acquisition_function(c) for c in candidates]) + _, indices = eis.topk(top_n) + else: + eis = torch.tensor([acquisition_function(c) for c in candidates]) + _, indices = eis.topk(top_n) + + xs = [candidates[int(i)] for i in indices] + return xs, eis, indices + + +class MutationSampler(AcquisitionSampler): + def __init__( + self, + pipeline_space, + pool_size: int = 250, + n_best: int = 10, + mutate_size: float | int = 0.5, + allow_isomorphism: bool = False, + check_isomorphism_history: bool = True, + patience: int = 50, + ): + super().__init__(pipeline_space=pipeline_space, patience=patience) + self.pool_size = pool_size + self.n_best = n_best + self.mutate_size = mutate_size + if isinstance(mutate_size, int): + assert ( + pool_size >= mutate_size + ), " pool_size must be larger or equal to mutate_size" + + self.allow_isomorphism = allow_isomorphism + self.check_isomorphism_history = ( + check_isomorphism_history # check for isomorphisms also in previous graphs + ) + self.random_sampling = RandomSampler( + pipeline_space=pipeline_space, patience=patience + ) + + @override + def set_state( + self, x: list[SearchSpace], y: Sequence[float] | np.ndarray | torch.Tensor + ) -> None: + super().set_state(x, y) + self.random_sampling.set_state(x, y) + + @override + def sample(self, acquisition_function: Callable) -> SearchSpace: + return first(self.sample_batch(acquisition_function, batch=1)) + + @override + def sample_batch( + self, + acquisition_function: Callable, + batch: int, + ) -> list[SearchSpace]: + pool = self.create_pool( + x=self.x, + y=self.y, + acquisition_function=acquisition_function, + pool_size=self.pool_size, + ) + + samples, _, _ = _propose_location( + acquisition_function=acquisition_function, + top_n=batch, + candidates=pool, + ) + return samples + + def create_pool( + self, + x: list[SearchSpace], + y: Sequence[float] | np.ndarray | torch.Tensor, + acquisition_function: Callable, + pool_size: int, + ) -> list[SearchSpace]: + if len(x) == 0: + return self.random_sampling.sample_batch(acquisition_function, pool_size) + + 
if isinstance(self.mutate_size, int): + mutate_size = self.mutate_size + else: + mutate_size = int(self.mutate_size * pool_size) + + n_best = len(x) if len(x) < self.n_best else self.n_best + best_configs = [x for (_, x) in sorted(zip(y, x), key=lambda pair: pair[0])][ + :n_best + ] + + seen: set[int] = set() + + def _hash(_config: SearchSpace) -> int: + return hash(_config.hp_values().values()) + + evaluation_pool = [] + per_arch = mutate_size // n_best + + for config in best_configs: + remaining_patience = self.patience + for _ in range(per_arch): + while remaining_patience: + try: + # needs to throw an Exception if config is not valid, e.g., empty graph etc.! + child = config.mutate() + except Exception: + remaining_patience -= 1 + continue + hash_child = _hash(child) + + if not self.allow_isomorphism: + # if disallow isomorphism, we enforce that each time, we mutate n distinct graphs. + # For now we do not check the isomorphism in all of the previous graphs though + if child == config or hash_child in seen: + remaining_patience -= 1 + continue + + evaluation_pool.append(child) + seen.add(hash_child) + break + + # Fill missing pool with random samples + nrandom_archs = max(pool_size - len(evaluation_pool), 0) + if nrandom_archs: + random_evaluation_pool = self.random_sampling.sample_batch( + acquisition_function, nrandom_archs + ) + evaluation_pool += random_evaluation_pool + + return evaluation_pool diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py new file mode 100644 index 00000000..f7a4da76 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py @@ -0,0 +1,15 @@ +from __future__ import annotations + +import torch +from neps.search_spaces import SearchSpace +from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, +) + + +class RandomSampler(AcquisitionSampler): + + def sample(self, n: int, space: SearchSpace) -> torch.Tensor: + return self.pipeline_space.sample( + patience=self.patience, user_priors=False, ignore_fidelity=False + ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py index 4c6b17df..cafc05dd 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py @@ -48,7 +48,7 @@ def __init__( pipeline_space, pool_size: int = 250, n_best: int = 10, - mutate_size: int | None = None, + mutate_size: float | int = 0.5, allow_isomorphism: bool = False, check_isomorphism_history: bool = True, patience: int = 50, @@ -57,6 +57,11 @@ def __init__( self.pool_size = pool_size self.n_best = n_best self.mutate_size = mutate_size + if isinstance(mutate_size, int): + assert ( + pool_size >= mutate_size + ), " pool_size must be larger or equal to mutate_size" + self.allow_isomorphism = allow_isomorphism self.check_isomorphism_history = ( check_isomorphism_history # check for isomorphisms also in previous graphs @@ -83,7 +88,12 @@ def sample_batch( acquisition_function: Callable, batch: int, ) -> list[SearchSpace]: - pool = self.create_pool(acquisition_function, self.pool_size) + pool = self.create_pool( + x=self.x, + y=self.y, + acquisition_function=acquisition_function, + pool_size=self.pool_size, + ) samples, _, _ = _propose_location( 
acquisition_function=acquisition_function, @@ -94,23 +104,23 @@ def sample_batch( def create_pool( self, + x: list[SearchSpace], + y: Sequence[float] | np.ndarray | torch.Tensor, acquisition_function: Callable, pool_size: int, ) -> list[SearchSpace]: - if len(self.x) == 0: + if len(x) == 0: return self.random_sampling.sample_batch(acquisition_function, pool_size) - mutate_size = ( - int(0.5 * pool_size) if self.mutate_size is None else self.mutate_size - ) - assert ( - pool_size >= mutate_size - ), " pool_size must be larger or equal to mutate_size" - - n_best = len(self.x) if len(self.x) < self.n_best else self.n_best - best_configs = [ - x for (_, x) in sorted(zip(self.y, self.x), key=lambda pair: pair[0]) - ][:n_best] + if isinstance(self.mutate_size, int): + mutate_size = self.mutate_size + else: + mutate_size = int(self.mutate_size * pool_size) + + n_best = len(x) if len(x) < self.n_best else self.n_best + best_configs = [x for (_, x) in sorted(zip(y, x), key=lambda pair: pair[0])][ + :n_best + ] seen: set[int] = set() diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py index e59b5433..4a4dfc79 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py @@ -1,8 +1,11 @@ """The vertex kernel as defined in :cite:`sugiyama2015halting`.""" +from __future__ import annotations + import logging from collections import Counter from collections.abc import Iterable +from typing import TYPE_CHECKING from warnings import warn import numpy as np @@ -14,7 +17,10 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.validation import check_is_fitted -from ..vectorial_kernels import Stationary +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( + NumericKernel, + ) class VertexHistogram(Kernel): @@ -42,7 +48,7 @@ class VertexHistogram(Kernel): If supplied, the Malahanobis distance with the precision matrix as supplied will be computed in the dot product, instead of the vanilla dot product. - Attributes + Attributes: ---------- None. @@ -55,7 +61,7 @@ def __init__( sparse="auto", oa=False, mahalanobis_precision=None, - se_kernel: Stationary | None = None, + se_kernel: NumericKernel | None = None, requires_ordered_features: bool = False, as_tensor: bool = True, ): @@ -120,7 +126,7 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None): - Returns + Returns: ------- out : np.array, shape=(len(X), n_labels) A np.array for frequency (cols) histograms for all Graphs (rows). @@ -139,9 +145,9 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None): if not isinstance(X, Iterable): raise TypeError("input must be an iterable\n") else: - rows, cols, data = list(), list(), list() + rows, cols, data = [], [], [] if self._method_calling in [0, 1, 2]: - labels = dict() + labels = {} self._labels = labels elif self._method_calling == 3: labels = dict(self._labels) @@ -200,7 +206,7 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None): ni += 1 if self.require_ordered_features: - label_length = max(label_end_idx - label_start_idx, max(cols)) + 1 + label_length = max(label_end_idx - label_start_idx, *cols) + 1 else: label_length = len(labels) @@ -244,7 +250,7 @@ def _calculate_kernel_matrix(self, Y=None): Y : np.array, default=None The array between samples and features. 
- Returns + Returns: ------- K : numpy array, shape = [n_targets, n_inputs] The kernel matrix: a calculation between all pairs of graphs @@ -260,24 +266,19 @@ def _calculate_kernel_matrix(self, Y=None): for j in range(i, self.X.shape[0]): K[i, j] = np.sum(np.minimum(self.X[i, :], self.X[j, :])) K[j, i] = K[i, j] + elif self.se_kernel is not None: + K = self.se_kernel._forward(self.X, self.X) else: - if self.se_kernel is not None: - K = self.se_kernel._forward(self.X, self.X) - else: - K = self.X @ self.X.T + K = self.X @ self.X.T + elif self.oa: + K = np.zeros((Y.shape[0], self.X.shape[0])) + for i in range(Y.shape[0]): + for j in range(self.X.shape[0]): + K[i, j] = np.sum(np.minimum(self.X[j, :], Y[i, : self.X.shape[1]])) + elif self.se_kernel is not None: + K = self.se_kernel._forward(self.X, Y) else: - if self.oa: - K = np.zeros((Y.shape[0], self.X.shape[0])) - for i in range(Y.shape[0]): - for j in range(self.X.shape[0]): - K[i, j] = np.sum( - np.minimum(self.X[j, :], Y[i, : self.X.shape[1]]) - ) - else: - if self.se_kernel is not None: - K = self.se_kernel._forward(self.X, Y) - else: - K = Y[:, : self.X.shape[1]] @ self.X.T + K = Y[:, : self.X.shape[1]] @ self.X.T if self.sparse_: return K.toarray() @@ -291,7 +292,7 @@ def diagonal(self, use_tensor=False): ---------- None. - Returns + Returns: ------- X_diag : np.array The diagonal of the kernel matrix, of the fitted. This consists @@ -309,11 +310,10 @@ def diagonal(self, use_tensor=False): # Calculate diagonal of X if use_tensor: self._X_diag = torch.einsum("ij,ij->i", [self.X_tensor, self.X_tensor]) + elif self.sparse_: + self._X_diag = squeeze(array(self.X.multiply(self.X).sum(axis=1))) else: - if self.sparse_: - self._X_diag = squeeze(array(self.X.multiply(self.X).sum(axis=1))) - else: - self._X_diag = einsum("ij,ij->i", self.X, self.X) + self._X_diag = einsum("ij,ij->i", self.X, self.X) try: check_is_fitted(self, ["_Y"]) if use_tensor: @@ -346,7 +346,7 @@ def transform(self, X, return_embedding_only=False, **kwargs): computing the kernel function). This is used when computing the derivative of the kernel w.r.t. the test points/ - Returns + Returns: ------- K : numpy array, shape = [n_targets, n_input_graphs] corresponding to the kernel matrix, a calculation between @@ -395,7 +395,7 @@ def fit_transform(self, X, **kwargs): There is no need of a target in a transformer, yet the pipeline API requires this parameter. - Returns + Returns: ------- K : numpy array, shape = [n_targets, n_input_graphs] corresponding to the kernel matrix, a calculation between @@ -431,7 +431,7 @@ def fit(self, X, y=None, **kwargs): There is no need of a target in a transformer, yet the pipeline API requires this parameter. - Returns + Returns: ------- self : object Returns self. 
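For intuition, the kernel values produced by `_calculate_kernel_matrix` above come down to a per-pair operation on the two graphs' label-count histograms: a plain dot product by default, an element-wise minimum (histogram intersection) when `oa=True`, or `se_kernel._forward` applied to the histograms when a stationary kernel is supplied. A toy sketch of the first two cases, using only NumPy and illustrative names rather than the grakel-based classes in this patch:

from collections import Counter

import numpy as np


def vertex_histogram_kernel(labels_a, labels_b, *, oa: bool = False) -> float:
    # labels_a / labels_b: node labels of each graph; order does not matter.
    vocab = sorted(set(labels_a) | set(labels_b))
    count_a, count_b = Counter(labels_a), Counter(labels_b)
    hist_a = np.array([count_a[v] for v in vocab], dtype=float)
    hist_b = np.array([count_b[v] for v in vocab], dtype=float)
    if oa:
        # optimal-assignment variant: sum of element-wise minima
        return float(np.minimum(hist_a, hist_b).sum())
    return float(hist_a @ hist_b)


# e.g. two small cells described only by their op labels
k = vertex_histogram_kernel(["conv3x3", "conv3x3", "relu"], ["conv3x3", "relu", "avg_pool"])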
diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py index dd5dd829..f10e406f 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py @@ -70,7 +70,7 @@ def __init__( h: int = 5, base_graph_kernel=VertexHistogram, node_weights=None, - layer_weights: torch.Tensor | None = None, + layer_weights=None, as_tensor: bool = True, ): """Initialise a `weisfeiler_lehman` kernel.""" @@ -121,27 +121,16 @@ def initialize(self): try: base_graph_kernel, params = base_graph_kernel except Exception as _error: - raise TypeError( - "Base kernel was not formulated in " - "the correct way. " - "Check documentation." - ) from _error + NOT_YET_IMPLEMENTED_StmtRaise if not ( - type(base_graph_kernel) is type # pylint: disable=C0123 + type(base_graph_kernel) + is type # pylint: disable=C0123 and issubclass(base_graph_kernel, Kernel) ): - raise TypeError( - "The first argument must be a valid " - "grakel.kernel.kernel Object" - ) + NOT_YET_IMPLEMENTED_StmtRaise if not isinstance(params, dict): - raise ValueError( - "If the second argument of base " - "kernel exists, it must be a diction" - "ary between parameters names and " - "values" - ) + NOT_YET_IMPLEMENTED_StmtRaise params.pop("normalize", None) params["normalize"] = False @@ -152,16 +141,14 @@ def initialize(self): if not self._initialized["h"]: if not isinstance(self.h, int) or self.h < 0: - raise TypeError( - "'h' must be a non-negative integer. Got h:" + str(self.h) - ) + NOT_YET_IMPLEMENTED_StmtRaise self._h = self.h + 1 self._initialized["h"] = True - if self.layer_weights is None: - self.layer_weights = torch.ones((self._h,)) - else: - assert len(self.layer_weights) == self._h + if self.layer_weights is None or self.layer_weights.shape[0] != self._h: + self.layer_weights = np.ones((self._h,)) + if self.as_tensor and not isinstance(self.layer_weights, torch.Tensor): + self.layer_weights = torch.tensor(self.layer_weights) self._initialized["h"] = True self._initialized["layer_weights"] = True @@ -204,9 +191,7 @@ def parse_input( """ if self._method_calling not in [1, 2]: - raise ValueError( - "method call must be called either from fit " + "or fit-transform" - ) + NOT_YET_IMPLEMENTED_StmtRaise elif hasattr(self, "_X_diag"): # Clean _X_diag value delattr(self, "_X_diag") @@ -218,7 +203,7 @@ def parse_input( else: # Input validation and parsing if not isinstance(X, collections.abc.Iterable): - raise TypeError("input must be an iterable\n") + NOT_YET_IMPLEMENTED_StmtRaise else: nx = 0 Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict() @@ -228,26 +213,25 @@ def parse_input( x = list(x) if is_iter and (len(x) == 0 or len(x) >= 2): if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(idx)) + warnings.warn( + "Ignoring empty element on index: " + str(idx) + ) continue + elif len(x) > 2: + extra = tuple() + if len(x) > 3: + extra = tuple(x[3:]) + x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) + extra = ( + x.get_labels( + purpose=self._graph_format, + label_type="edge", + return_none=True, + ), + ) + extra else: - if len(x) > 2: - extra = tuple() - if len(x) > 3: - extra = tuple(x[3:]) - x = Graph( - x[0], x[1], x[2], graph_format=self._graph_format - ) - extra = ( - x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, - ), - ) + extra 
- else: - x = Graph(x[0], x[1], {}, graph_format=self._graph_format) - extra = tuple() + x = Graph(x[0], x[1], {}, graph_format=self._graph_format) + extra = tuple() elif isinstance(x, Graph): x.desired_format(self._graph_format) @@ -262,19 +246,14 @@ def parse_input( extra = (el,) else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) + NOT_YET_IMPLEMENTED_StmtRaise Gs_ed[nx] = x.get_edge_dictionary() L[nx] = x.get_labels(purpose="dictionary") extras[nx] = extra - distinct_values |= set(L[nx].values()) - nx += 1 + NOT_YET_IMPLEMENTED_StmtAugAssign + NOT_YET_IMPLEMENTED_StmtAugAssign if nx == 0: - raise ValueError("parsed input is empty") + NOT_YET_IMPLEMENTED_StmtRaise # Save the number of "fitted" graphs. self._nx = nx @@ -284,33 +263,28 @@ def parse_input( label_count = 0 for dv in sorted(list(distinct_values)): WL_labels_inverse[dv] = label_count - label_count += 1 + NOT_YET_IMPLEMENTED_StmtAugAssign # Initalize an inverse dictionary of labels for all iterations - self._inv_labels = ( - OrderedDict() - ) # Inverse dictionary of labels, in term of the *previous layer* + self._inv_labels = OrderedDict() # Inverse dictionary of labels, in term of the *previous layer* self._inv_labels[0] = deepcopy(WL_labels_inverse) - self.feature_dims.append( - len(WL_labels_inverse) - ) # Update the zeroth iteration feature dim - - self._inv_label_node_attr = ( - OrderedDict() - ) # Inverse dictionary of labels, in term of the *node attribute* - self._label_node_attr = ( - OrderedDict() - ) # Same as above, but with key and value inverted - self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label( - WL_labels_inverse, 0 - ) + self.feature_dims.append(len(WL_labels_inverse)) # Update the zeroth iteration feature dim + + self._inv_label_node_attr = OrderedDict() # Inverse dictionary of labels, in term of the *node attribute* + self._label_node_attr = OrderedDict() # Same as above, but with key and value inverted + ( + self._label_node_attr[0], + self._inv_label_node_attr[0], + ) = self.translate_label(WL_labels_inverse, 0) if self.node_weights is not None: self._feature_weight = OrderedDict() # Ensure the order is the same self._feature_weight[0] = self._compute_feature_weight( self.node_weights, 0, WL_labels_inverse - )[1] + )[ + 1 + ] else: self._feature_weight = None @@ -323,7 +297,7 @@ def generate_graphs(label_count: int, WL_labels_inverse): L[j] = new_labels # add new labels new_graphs.append((Gs_ed[j], new_labels) + extras[j]) - yield new_graphs + NOT_YET_IMPLEMENTED_ExprYield for i in range(1, self._h): label_set, WL_labels_inverse, L_temp = set(), dict(), dict() @@ -333,10 +307,10 @@ def generate_graphs(label_count: int, WL_labels_inverse): # Keep for each node the temporary L_temp[j] = dict() for v in Gs_ed[j].keys(): - credential = ( - str(L[j][v]) - + "," - + str(sorted(L[j][n] for n in Gs_ed[j][v].keys())) + credential = str(L[j][v]) + "," + str( + sorted( + (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []) + ) ) L_temp[j][v] = credential label_set.add(credential) @@ -344,7 +318,7 @@ def generate_graphs(label_count: int, WL_labels_inverse): label_list = sorted(list(label_set)) for dv in label_list: WL_labels_inverse[dv] = label_count - label_count += 1 + NOT_YET_IMPLEMENTED_StmtAugAssign # Recalculate labels new_graphs = list() @@ -370,9 +344,11 @@ def generate_graphs(label_count: int, WL_labels_inverse): if self.node_weights is 
not None: self._feature_weight[i] = self._compute_feature_weight( self.node_weights, i, self._inv_label_node_attr[i] - )[1] + )[ + 1 + ] # assert len(self._feature_weight[i] == len(WL_labels_inverse)) - yield new_graphs + NOT_YET_IMPLEMENTED_ExprYield # Initialise the base graph kernel. base_graph_kernel = {} @@ -391,22 +367,21 @@ def generate_graphs(label_count: int, WL_labels_inverse): label_end_idx=self.feature_dims[i + 1], ) ) + elif self._method_calling == 1: + base_graph_kernel[i].fit( + g, + label_start_idx=self.feature_dims[i], + label_end_idx=self.feature_dims[i + 1], + ) else: - if self._method_calling == 1: - base_graph_kernel[i].fit( + K.append( + self.layer_weights[i] + * base_graph_kernel[i].fit_transform( g, label_start_idx=self.feature_dims[i], label_end_idx=self.feature_dims[i + 1], ) - else: - K.append( - self.layer_weights[i] - * base_graph_kernel[i].fit_transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - ) + ) if gp_fit: self.X_fit[self._h] = X @@ -453,7 +428,7 @@ def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: di 0, ] # Flush the feature dimensions if X is None: - raise ValueError("transform input cannot be None") + NOT_YET_IMPLEMENTED_StmtRaise else: km, self.X = self.parse_input(X, gp_fit=gp_fit) @@ -488,47 +463,39 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): # Input validation and parsing if X is None: - raise ValueError("transform input cannot be None") + NOT_YET_IMPLEMENTED_StmtRaise + elif not isinstance(X, collections.abc.Iterable): + NOT_YET_IMPLEMENTED_StmtRaise else: - if not isinstance(X, collections.abc.Iterable): - raise ValueError("input must be an iterable\n") - else: - nx = 0 - distinct_values = set() - Gs_ed, L = dict(), dict() - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(i)) - continue - - elif len(x) in [2, 3]: - x = Graph(x[0], x[1], {}, self._graph_format) - elif isinstance(x, Graph): - x.desired_format("dictionary") - else: - raise ValueError( - "each element of X must have at " - + "least one and at most 3 elements\n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") + nx = 0 + distinct_values = set() + Gs_ed, L = dict(), dict() + for i, x in enumerate(iter(X)): + is_iter = isinstance(x, collections.abc.Iterable) + if is_iter: + x = list(x) + if is_iter and len(x) in [0, 2, 3]: + if len(x) == 0: + warnings.warn("Ignoring empty element on index: " + str(i)) + continue + + elif len(x) in [2, 3]: + x = Graph(x[0], x[1], {}, self._graph_format) + elif isinstance(x, Graph): + x.desired_format("dictionary") + else: + NOT_YET_IMPLEMENTED_StmtRaise + Gs_ed[nx] = x.get_edge_dictionary() + L[nx] = x.get_labels(purpose="dictionary") - # Hold all the distinct values - distinct_values |= { - v for v in L[nx].values() if v not in self._inv_labels[0] - } - nx += 1 - if nx == 0: - raise ValueError("parsed input is empty") + # Hold all the distinct values + NOT_YET_IMPLEMENTED_StmtAugAssign + NOT_YET_IMPLEMENTED_StmtAugAssign + if nx == 0: + NOT_YET_IMPLEMENTED_StmtRaise nl = len(self._inv_labels[0]) - WL_labels_inverse = { - dv: idx for (idx, dv) in enumerate(sorted(list(distinct_values)), nl) - } + WL_labels_inverse = {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict} WL_labels_inverse = 
OrderedDict(WL_labels_inverse) def generate_graphs_transform(WL_labels_inverse, nl): @@ -544,21 +511,21 @@ def generate_graphs_transform(WL_labels_inverse, nl): L[j] = new_labels # produce the new graphs new_graphs.append([Gs_ed[j], new_labels]) - yield new_graphs + NOT_YET_IMPLEMENTED_ExprYield for i in range(1, self._h): new_graphs = list() L_temp, label_set = dict(), set() - nl += len(self._inv_labels[i]) + NOT_YET_IMPLEMENTED_StmtAugAssign for j in range(nx): # Find unique labels and sort them for both graphs # Keep for each node the temporary L_temp[j] = dict() for v in Gs_ed[j].keys(): - credential = ( - str(L[j][v]) - + "," - + str(sorted(L[j][n] for n in Gs_ed[j][v].keys())) + credential = str(L[j][v]) + "," + str( + sorted( + (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []) + ) ) L_temp[j][v] = credential if credential not in self._inv_labels[i]: @@ -583,7 +550,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): L[j] = new_labels # Create the new graphs with the new labels. new_graphs.append([Gs_ed[j], new_labels]) - yield new_graphs + NOT_YET_IMPLEMENTED_ExprYield if return_embedding_only: K = [] @@ -600,29 +567,11 @@ def generate_graphs_transform(WL_labels_inverse, nl): # Calculate the kernel matrix without parallelization if self.as_tensor: - summand = [ - self.layer_weights[i] - * self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - for i, g in enumerate(generate_graphs_transform(WL_labels_inverse, nl)) - ] + summand = [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] K = torch.stack(summand, dim=0).sum(dim=0) else: K = np.sum( - ( - self.layer_weights[i] - * self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - for (i, g) in enumerate( - generate_graphs_transform(WL_labels_inverse, nl) - ) - ), + (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []), axis=0, ) @@ -631,7 +580,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): X_diag, Y_diag = self.diagonal() if self.as_tensor: div_ = torch.sqrt(torch.ger(Y_diag, X_diag)) - K /= div_ + NOT_YET_IMPLEMENTED_StmtAugAssign else: old_settings = np.seterr(divide="ignore") K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag)))) @@ -667,7 +616,7 @@ def diagonal(self): if self._is_transformed: Y_diag = self.X[0].diagonal()[1] for i in range(1, self._h): - Y_diag += self.X[i].diagonal()[1] + NOT_YET_IMPLEMENTED_StmtAugAssign except NotFittedError: # Calculate diagonal of X if self._is_transformed: @@ -676,8 +625,8 @@ def diagonal(self): X_diag.flags.writeable = True for i in range(1, self._h): x, y = self.X[i].diagonal() - X_diag += x - Y_diag += y + NOT_YET_IMPLEMENTED_StmtAugAssign + NOT_YET_IMPLEMENTED_StmtAugAssign self._X_diag = X_diag # case sub kernel is only fitted @@ -686,7 +635,7 @@ def diagonal(self): X_diag.flags.writeable = True for i in range(1, self._n_iter): x = self.X[i].diagonal() - X_diag += x + NOT_YET_IMPLEMENTED_StmtAugAssign self._X_diag = X_diag if self.as_tensor: @@ -710,15 +659,18 @@ def translate_label(curr_layer: dict, h: int, prev_layer: dict = None): """ if h == 0: - return {v: str(k) for k, v in curr_layer.items()}, curr_layer + return ( + {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict}, + curr_layer, + ) else: - assert prev_layer is not None + NOT_YET_IMPLEMENTED_StmtAssert label_in_node_attr, inv_label_in_node_attr = 
OrderedDict(), OrderedDict() for pattern, encoding in curr_layer.items(): # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer root, leaf = literal_eval(pattern) root_ = prev_layer[root] - leaf_ = [prev_layer[i] for i in leaf] + leaf_ = [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] label_in_node_attr.update({encoding: "~".join([root_] + leaf_)}) inv_label_in_node_attr.update({"~".join([root_] + leaf_): encoding}) return label_in_node_attr, inv_label_in_node_attr @@ -739,18 +691,22 @@ def _compute_feature_weight( feature_weights_flattened = [] if h == 0: feature_weight = OrderedDict( - {k: (node_weight[k]) ** 2 for k in inv_label_node_attr.keys()} + {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict} ) - feature_weights_flattened = np.array(list(feature_weight.values())).flatten() + feature_weights_flattened = np.array( + list(feature_weight.values()) + ).flatten() else: for k, _ in inv_label_node_attr.items(): # k is the pattern, v is the encoding k_sep = k.split("~") - average_weight = np.mean([(node_weight[i]) ** 2 for i in k_sep]) + average_weight = np.mean( + [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] + ) feature_weights.update({k: average_weight}) feature_weights_flattened.append(average_weight) feature_weights_flattened = np.array(feature_weights_flattened).flatten() - assert len(feature_weights_flattened) == len(inv_label_node_attr) + NOT_YET_IMPLEMENTED_StmtAssert return feature_weights, feature_weights_flattened def dK_dX(self, X_test: None): diff --git a/neps/optimizers/bayesian_optimization/kernels/kernel.py b/neps/optimizers/bayesian_optimization/kernels/kernel.py index 57cd4895..42382a51 100644 --- a/neps/optimizers/bayesian_optimization/kernels/kernel.py +++ b/neps/optimizers/bayesian_optimization/kernels/kernel.py @@ -2,8 +2,8 @@ import copy import inspect -from abc import ABC, abstractmethod import math +from abc import ABC, abstractmethod from typing import Any, ClassVar, Generic, Mapping, Sequence, TypeVar from typing_extensions import Self @@ -56,12 +56,11 @@ def grid_search( x: T, y: torch.Tensor, *, - grid: Sequence[Mapping[str, Any]], + grid: Sequence[Mapping[str, Any]] | None = None, noise_variances: Sequence[float] = (1e-6,), - ) -> tuple[Self, float] | Exception: + ) -> tuple[Self, float]: # Returns: (Kernel[T], float) | None if failed - if len(grid) == 0: - raise ValueError("Grid must have at least one element.") + grid = grid or self.suggested_grid def _fit_and_eval( _params: Mapping[str, Any], @@ -70,7 +69,6 @@ def _fit_and_eval( K = cloned_kernel.forward(x) best_lml = -float("inf") - exception: Exception | None = None for noise_variance in noise_variances: K.diag().add_(noise_variance) @@ -81,18 +79,14 @@ def _fit_and_eval( K.diag().sub_(noise_variance) - if exception is None: - return cloned_kernel, best_lml - - return exception + return cloned_kernel, best_lml evals = [_fit_and_eval(params) for params in grid] evals_with_score = [e for e in evals if not isinstance(e, Exception)] if not any(evals_with_score): raise evals[-1] # type: ignore - best_eval = max(evals_with_score, key=lambda e: e[1]) # type: ignore - return best_eval + return max(evals_with_score, key=lambda e: e[1]) # type: ignore class NumericKernel(Kernel[torch.Tensor]): ... 
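The `grid_search` cleanup above keeps the candidate whose Gram matrix, with a noise variance added to its diagonal, yields the highest Gaussian-process log marginal likelihood of the normalized training targets. For reference, a self-contained sketch of that scoring step under the standard GP model; the helper below is only illustrative and is not the implementation in `kernel.py`, which goes through `compute_pd_inverse` and `log_marginal_likelihood` instead:

import math

import torch


def gp_log_marginal_likelihood(
    K: torch.Tensor, y: torch.Tensor, noise_variance: float
) -> torch.Tensor:
    # log p(y | K, s) = -1/2 y^T (K + s I)^-1 y - 1/2 log|K + s I| - n/2 log(2 pi)
    n = y.shape[0]
    K_noisy = K + noise_variance * torch.eye(n, dtype=K.dtype)
    L = torch.linalg.cholesky(K_noisy)  # fails if K_noisy is not positive definite
    alpha = torch.cholesky_solve(y.unsqueeze(-1), L)  # (K + s I)^-1 y
    log_det = 2.0 * torch.log(torch.diagonal(L)).sum()
    quad = (y.unsqueeze(-1) * alpha).sum()
    return -0.5 * quad - 0.5 * log_det - 0.5 * n * math.log(2 * math.pi)


# Grid search then amounts to evaluating this for every (kernel parameters,
# noise variance) combination and keeping the argmax, mirroring the loop in
# Kernel.grid_search.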
diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py index 8bcbd45b..07b56333 100644 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py @@ -1,12 +1,13 @@ from __future__ import annotations +from abc import ABC +from itertools import product from math import sqrt -from typing import Any, Mapping, Sequence, ClassVar -from typing_extensions import override, Self +from typing import Any, ClassVar, Mapping, Sequence +from typing_extensions import Self, override -from itertools import product import torch -import torch.nn as nn +from torch import nn from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel @@ -19,10 +20,10 @@ STD_ENCODED_OUTPUT_SCALE = (1e-2, 1e-1, 1, 1e1, 1e2) -class Stationary(Kernel[torch.Tensor]): +class NumericKernel(Kernel[torch.Tensor], ABC): suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ - {"lengthscale": l, "output_scale": o} - for l, o in product(LENGTHSCALE_GRID, STD_ENCODED_OUTPUT_SCALE) + {"lengthscale": _l, "output_scale": o} + for _l, o in product(LENGTHSCALE_GRID, STD_ENCODED_OUTPUT_SCALE) ] def __init__( @@ -59,25 +60,33 @@ def as_optimizable(self) -> Self: def forward(self, x: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: # NOTE: I don't think this is the right way to do this... - with torch.no_grad(): - self.lengthscale.data.clamp_(*self.lengthscale_bounds) - self.outputscale.data.clamp_(*self.outputscale_bounds) + if self.lengthscale_bounds is not None or self.outputscale_bounds is not None: + with torch.no_grad(): + if self.lengthscale_bounds is not None: + self.lengthscale.data.clamp_(*self.lengthscale_bounds) + if self.outputscale_bounds is not None: + self.outputscale.data.clamp_(*self.outputscale_bounds) x2 = x if x2 is None else x2 return self._forward(x, x2) + def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: ... 
+ + +class Stationary(NumericKernel): + @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - return self.outputscale * torch.cdist(x1, x2, p=2) + return self.outputscale * torch.cdist(x1, x2, p=2) / self.lengthscale -class RBFKernel(Stationary): +class RBFKernel(NumericKernel): @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: dist_sq = torch.cdist(x1, x2, p=2) ** 2 return self.outputscale * torch.exp(-dist_sq / (2 * self.lengthscale**2)) -class Matern32Kernel(Stationary): +class Matern32Kernel(NumericKernel): @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: dist = torch.cdist(x1, x2, p=2) / self.lengthscale @@ -86,7 +95,7 @@ def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: return self.outputscale * matern32 -class HammingKernel(Stationary): +class HammingKernel(NumericKernel): @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: dists = (x1.unsqueeze(1) != x2.unsqueeze(0)).float().sum(-1) / x1.shape[-1] @@ -94,7 +103,7 @@ def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: return self.outputscale * torch.exp(-scaled_dists) -class Matern52Kernel(Stationary): +class Matern52Kernel(NumericKernel): @override def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: dist = torch.cdist(x1, x2, p=2) / self.lengthscale diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py index 68d257b1..8c1feb26 100644 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py @@ -1,37 +1,38 @@ from __future__ import annotations +from itertools import product from typing import Any, ClassVar, Mapping, Sequence from typing_extensions import Self +import numpy as np +import numpy.typing as npt import torch -import torch.nn as nn -from itertools import product +from torch import nn -import numpy as np from neps.optimizers.bayesian_optimization.kernels.grakel_replace import ( VertexHistogram, WeisfeilerLehman as _WL, ) from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import Stationary -from neps.search_spaces.encoding import WLInput +from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import NumericKernel GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)]) GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) -class WeisfilerLehman(Kernel[Sequence[WLInput]]): +class WeisfilerLehman(Kernel[npt.NDArray[np.object_]]): """Weisfiler Lehman kernel using grakel functions.""" suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ - {"h": h, "se_kernel": Stationary(lengthscale=l)} + {"h": h, "se_kernel": NumericKernel(lengthscale=l)} for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) ] def __init__( self, + *, h: int = 0, - se_kernel: Stationary | None = None, + se_kernel: NumericKernel | None = None, layer_weights: torch.Tensor | None = None, oa: bool = False, node_label: str = "op_name", @@ -70,7 +71,7 @@ def __init__( def as_optimizable(self) -> Self: return self.clone_with(layer_weights=nn.Parameter(self.layer_weights)) - def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: self.layer_weights.clamp_(0, 1) self.wl_kernel_ = _WL( h=self.h, @@ -87,12 
+88,27 @@ def fit_transform(self, gr: Sequence[WLInput]) -> torch.Tensor: normalize=True, ) - K = self.wl_kernel_.fit_transform(gr) + K = self.wl_kernel_.fit_transform(iter(gr)) return torch.as_tensor(K, dtype=torch.float64) - def transform(self, gr: Sequence[WLInput]) -> torch.Tensor: + def transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: assert self.wl_kernel_ is not None self.layer_weights.clamp_(0, 1) - K = self.wl_kernel_.transform(gr) + K = self.wl_kernel_.transform(iter(gr)) return torch.as_tensor(K, dtype=torch.float64) + + def forward( + self, + x: npt.NDArray[np.object_], + x2: npt.NDArray[np.object_] | None = None, + ) -> torch.Tensor: + if x2 is None: + K = self.fit_transform(x) + self.wl_kernel_ = None + return K + + self.fit_transform(x) + K = self.transform(x2) + self.wl_kernel_ = None + return K diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index f46e89b9..e63c033f 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -3,36 +3,36 @@ import logging from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence -from typing_extensions import Literal -import torch.nn as nn import numpy as np import torch +from torch import nn +from torch.optim import SGD, Adam # type: ignore from neps.optimizers.bayesian_optimization.kernels.kernel import ( Kernel, - log_marginal_likelihood, compute_pd_inverse, + log_marginal_likelihood, ) from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( HammingKernel, Matern52Kernel, + NumericKernel, ) from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import ( WeisfilerLehman, ) from neps.search_spaces import SearchSpace from neps.search_spaces.encoding import ( - IntegerCategoricalTransformer, - JointTransformer, + CategoricalToIntegerTransformer, + DataPack, MinMaxNormalizer, OneHotEncoder, TensorTransformer, Transformer, WLInputTransformer, ) -from neps.search_spaces.hyperparameters.float import FloatParameter -from neps.search_spaces.hyperparameters.integer import IntegerParameter +from neps.search_spaces.hyperparameters import FloatParameter, IntegerParameter if TYPE_CHECKING: from neps.search_spaces.search_space import SearchSpace @@ -47,9 +47,9 @@ @dataclass class ComprehensiveGP: space: SearchSpace - kernels: dict[str, tuple[Kernel, Transformer]] - combined_kernel: Literal["sum", "product"] = "sum" + kernels: dict[str, tuple[Sequence[str], Kernel]] + combined_kernel: Literal["sum", "product"] = "sum" noise_variance: Sequence[float] = NOISE_VARIANCE_GRID kernel_parameter_grid: Mapping[str, Sequence[Mapping[str, Any]]] | bool = True @@ -60,20 +60,24 @@ class ComprehensiveGP: # Post fit attributes K_inv_: torch.Tensor | None = None - n_train_: int | None = None likelihood_: float | None = None y_: torch.Tensor | None = None y_normalized_: torch.Tensor | None = None y_mean_: float | None = None y_std_: float | None = None - optimized_kernels_: dict[str, Kernel] | None = None - train_data_: dict[str, Any] | None = None + opt_kernels_: dict[str, tuple[Sequence[str], Kernel]] | None = None + train_x_: DataPack | None = None def __post_init__(self): # TODO: Remove when search space is just definition and does not hold values. 
self.space = self.space.clone() - def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: + def fit( + self, + *, + x: DataPack, + train_y: torch.Tensor, + ) -> None: # Preprocessing y_ = torch.as_tensor(train_y, device=self.device, dtype=torch.float64) @@ -83,40 +87,19 @@ def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: self.y_normalized_ = (y_ - self.y_mean_) / self.y_std_ self.y_ = y_ - _data = { - key: transformer.encode(x, self.space) - for key, (_, transformer) in self.kernels.items() - } - # optimized kernel parameters + noise variance optim_vars: list[nn.Parameter] = [] + opt_kernels: dict[str, tuple[Sequence[str], Kernel]] = {} - grids = { - name: k.suggested_grid - for name, (k, _) in self.kernels.items() - if k.suggested_grid is not None - } - - kernels: dict[str, Kernel] = {} - for kernel_name, (kernel, _) in self.kernels.items(): - xs = _data[kernel_name] - grid = grids[kernel_name] - - maybe_optimized_kernel = kernel.grid_search( - x=xs, + N: int + for _kernel_name, (hps, kernel) in self.kernels.items(): + data = x.select(hps) + opt_kernel, _ = kernel.grid_search( + x=data, # type: ignore y=self.y_normalized_, - grid=grid, ) - if isinstance(maybe_optimized_kernel, Exception): - raise ValueError( - f"Failed to optimize kernel {kernel_name} with grid {grid}." - ) from maybe_optimized_kernel - - opt_kernel, _ = maybe_optimized_kernel - gradient_enabled_kernel = opt_kernel.as_optimizable() - kernels[kernel_name] = gradient_enabled_kernel - - optim_vars.extend(gradient_enabled_kernel.parameters()) + optim_vars.extend(opt_kernel.parameters()) + opt_kernels[_kernel_name] = (hps, opt_kernel) # Now that we've optimized the kernels, we convert go convert their # parameters into a tensor we can further refine with some optimizer iterations @@ -128,30 +111,25 @@ def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: optim_vars.append(noise_variance) if self.optimizer == "adam": - optim = torch.optim.Adam(optim_vars, **self.optimizer_kwargs) # type: ignore + optim = Adam(optim_vars, **self.optimizer_kwargs) # type: ignore elif self.optimizer == "sgd": - optim = torch.optim.SGD(optim_vars, **self.optimizer_kwargs) # type: ignore + optim = SGD(optim_vars, **self.optimizer_kwargs) # type: ignore else: raise ValueError(f"Invalid optimizer {self.optimizer}") K_inv: torch.Tensor | None = None + _init = torch.zeros if self.combined_kernel == "sum" else torch.ones N = len(x) - for i in range(self.optimizer_iters): + K = _init((N, N), device=self.device, dtype=torch.float64) + for _i in range(self.optimizer_iters): optim.zero_grad() - # Now we iterate over kernels to build up K - _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - K = _init(N, N, device=self.device, dtype=torch.float64) - for kernel_name, kernel in kernels.items(): - data = _data[kernel_name] - gram = kernel.forward(data, data) - - if self.combined_kernel == "sum": - K.add_(gram) - else: - K.mul_(gram) - K.diag().add_(noise_variance) + for _kernel_name, (hps, opt_kernel) in opt_kernels.items(): + data = x.select(hps) + k = opt_kernel.forward(data) + K.add_(k) if self.combined_kernel == "sum" else K.mul_(k) + K.diag().add_(noise_variance) K_inv, logDetK = compute_pd_inverse(K) nlml = -log_marginal_likelihood(K_inv, logDetK, y=self.y_normalized_) @@ -166,52 +144,50 @@ def fit(self, x: list[dict[str, Any]], train_y: torch.Tensor) -> None: assert K_inv is not None self.K_inv_ = K_inv.clone() self.noise_variance_ = noise_variance.item() - self.optimized_kernels_ = 
kernels - self.n_train_ = N - self.train_data_ = _data - - def predict(self, x: list[dict[str, Any]]) -> tuple[torch.Tensor, torch.Tensor]: + self.opt_kernels_ = opt_kernels + self.train_x_ = x + + def predict( + self, + *, + x: DataPack, + ) -> tuple[torch.Tensor, torch.Tensor]: """Kriging predictions.""" if ( self.K_inv_ is None - or self.n_train_ is None - or self.optimized_kernels_ is None - or self.train_data_ is None + or self.train_x_ is None or self.y_normalized_ is None or self.y_std_ is None + or self.opt_kernels_ is None ): raise ValueError( "Inverse of Gram matrix is not instantiated. Please call the optimize " "function to fit on the training data first!" ) - _data = { - key: transformer.encode(x, self.space) - for key, (_, transformer) in self.kernels.items() - } _init = torch.zeros if self.combined_kernel == "sum" else torch.ones n_test = len(x) K_train_test = _init( - self.n_train_, n_test, device=self.device, dtype=torch.float64 + len(self.train_x_), n_test, device=self.device, dtype=torch.float64 ) - K_test_test = _init(n_test, n_test, device=self.device, dtype=torch.float64) - - for kernel_name, kernel in self.optimized_kernels_.items(): - train_x = self.train_data_[kernel_name] - test_x = _data[kernel_name] - - gram = kernel.forward(train_x, test_x) + for _kernel_name, (hps, opt_kernel) in self.opt_kernels_.items(): + train = self.train_x_.select(hps) + test = x.select(hps) + k = opt_kernel.forward(train, test) if self.combined_kernel == "sum": - K_train_test.add_(gram) + K_train_test.add_(k) else: - K_train_test.mul_(gram) + K_train_test.mul_(k) - gram = kernel.forward(test_x, test_x) + K_test_test = _init(n_test, n_test, device=self.device, dtype=torch.float64) + for _kernel_name, (hps, opt_kernel) in self.opt_kernels_.items(): + test = x.select(hps) + k = opt_kernel.forward(test, test) if self.combined_kernel == "sum": - K_test_test.add_(gram) + K_test_test.add_(k) else: - K_test_test.mul_(gram) + K_test_test.mul_(k) # Compute the predictive mean @@ -220,7 +196,6 @@ def predict(self, x: list[dict[str, Any]]) -> tuple[torch.Tensor, torch.Tensor]: mu_s = mu_s * self.y_std_ + self.y_mean_ cov_s = K_test_test - K_train_test.t() @ self.K_inv_ @ K_train_test - cov_s.diagonal().clamp_(self.noise_variance_, np.inf) cov_s *= self.y_std_**2 return mu_s, cov_s @@ -280,7 +255,7 @@ def get_default_kernels( transformer = JointTransformer.join(one_hot_encoder, fid_normalizer) kernels["vectorial"] = (Matern52Kernel(), transformer) else: - transformer = IntegerCategoricalTransformer(tuple(space.categoricals)) + transformer = CategoricalToIntegerTransformer(tuple(space.categoricals)) kernels["categorical"] = (HammingKernel(), transformer) return kernels diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index c5c47332..19efa6b6 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,21 +1,12 @@ from __future__ import annotations import random -from typing import Any, TYPE_CHECKING, Literal -from typing_extensions import override -from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP +from itertools import chain +from typing import TYPE_CHECKING, Any, Literal, Mapping -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.utils.common import instance_from_map -from neps.search_spaces import ( - CategoricalParameter, - ConstantParameter, - FloatParameter, - IntegerParameter, - 
SearchSpace, -) -from neps.optimizers.base_optimizer import BaseOptimizer +import torch + +from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.acquisition_functions import ( AcquisitionMapping, DecayingPriorWeightedAcquisition, @@ -23,15 +14,26 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping +from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP +from neps.search_spaces import ( + CategoricalParameter, + ConstantParameter, + FloatParameter, + IntegerParameter, + SearchSpace, +) +from neps.search_spaces.encoding import Encoder +from neps.utils.common import instance_from_map if TYPE_CHECKING: from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( BaseAcquisition, ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.state import BudgetInfo, Trial # TODO(eddiebergman): Why not just include in the definition of the parameters. CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) @@ -49,6 +51,7 @@ class BayesianOptimization(BaseOptimizer): def __init__( self, pipeline_space: SearchSpace, + *, initial_design_size: int = 10, surrogate_model: str | Any = "gp", acquisition: str | BaseAcquisition = "EI", @@ -62,7 +65,7 @@ def __init__( cost_value_on_error: None | float = None, logger=None, disable_priors: bool = False, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, sample_default_first: bool = False, ): """Initialise the BO loop. @@ -124,10 +127,7 @@ def __init__( self._initial_design_size = initial_design_size self._random_interleave_prob = random_interleave_prob - self._num_train_x: int = 0 self._num_error_evaluations: int = 0 - self._pending_evaluations: list = [] - self._model_update_failed: bool = False self.sample_default_first = sample_default_first if isinstance(surrogate_model, str): @@ -136,7 +136,11 @@ def __init__( space=pipeline_space, include_fidelities=False, ) + self._encoder = Encoder.default(self.pipeline_space) else: + raise NotImplementedError( + "Only 'gp' is supported as a surrogate model for now." + ) self.surrogate_model = instance_from_map( SurrogateModelMapping, surrogate_model, @@ -162,132 +166,100 @@ def __init__( name="acquisition sampler function", kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, ) - self._enhance_priors() - - def _enhance_priors(self, confidence_score: dict = None) -> None: - """Only applicable when priors are given along with a confidence. - - Args: - confidence_score: dict - The confidence scores for the 2 major variable types. 
- Example: {"categorical": 5.2, "numeric": 0.15} - """ - if self.prior_confidence is None: - return - if ( - hasattr(self.pipeline_space, "has_prior") - and not self.pipeline_space.has_prior - ): - return - for k, v in self.pipeline_space.items(): - if v.is_fidelity or isinstance(v, ConstantParameter): - continue - elif isinstance(v, (FloatParameter, IntegerParameter)): - if confidence_score is None: + if self.pipeline_space.has_prior: + for k, v in self.pipeline_space.items(): + if v.is_fidelity or isinstance(v, ConstantParameter): + continue + elif isinstance(v, (FloatParameter, IntegerParameter)): confidence = CUSTOM_FLOAT_CONFIDENCE_SCORES[self.prior_confidence] - else: - confidence = confidence_score["numeric"] - self.pipeline_space[k].default_confidence_score = confidence - elif isinstance(v, CategoricalParameter): - if confidence_score is None: + self.pipeline_space[k].default_confidence_score = confidence + elif isinstance(v, CategoricalParameter): confidence = CUSTOM_CATEGORICAL_CONFIDENCE_SCORES[ self.prior_confidence ] - else: - confidence = confidence_score["categorical"] - self.pipeline_space[k].default_confidence_score = confidence - return - - def is_init_phase(self) -> bool: - """Decides if optimization is still under the warmstart phase/model-based search.""" - if self._num_train_x >= self._initial_design_size: - return False - return True + self.pipeline_space[k].default_confidence_score = confidence - @override - def load_optimization_state( + def ask( self, - previous_results: dict[str, ConfigResult], - pending_evaluations: dict[str, SearchSpace], + trials: Mapping[str, Trial], budget_info: BudgetInfo | None, optimizer_state: dict[str, Any], - ) -> None: - train_x = [el.config for el in previous_results.values()] - train_y = [self.get_loss(el.result) for el in previous_results.values()] - if self.ignore_errors: - train_x = [x for x, y in zip(train_x, train_y) if y != "error"] - train_y_no_error = [y for y in train_y if y != "error"] - self._num_error_evaluations = len(train_y) - len(train_y_no_error) - train_y = train_y_no_error - self._num_train_x = len(train_x) - self._pending_evaluations = [el for el in pending_evaluations.values()] - if not self.is_init_phase(): + ) -> tuple[SampledConfig, dict[str, Any]]: + # TODO: Lift this into runtime, let the + # optimizer advertise the encoding wants... + completed = [ + t + for t in trials.values() + if t.report is not None and t.report.loss is not None + ] + train_x = [t.config for t in completed] + train_y: torch.Tensor = torch.as_tensor([t.report.loss for t in completed]) # type: ignore + + pending = [t.config for t in trials.values() if t.state.pending()] + + space = self.pipeline_space + + # TODO: This would be better if we could serialize these + # in their encoded form. later... 
+ for name, hp in space.categoricals.items(): + for config in chain(train_x, pending): + config[name] = hp.choices.index(config[name]) + for name, hp in space.graphs.items(): + for config in chain(train_x, pending): + config[name] = hp.clone().load_from(config[name]) + + if len(trials) == 0 and self.sample_default_first and space.has_prior: + config = space.sample_default_configuration( + patience=self.patience, ignore_fidelity=False + ) + elif len(trials) <= self._initial_design_size: + config = space.sample( + patience=self.patience, user_priors=True, ignore_fidelity=False + ) + elif random.random() < self._random_interleave_prob: + config = space.sample( + patience=self.patience, user_priors=False, ignore_fidelity=False + ) + else: try: - if len(self._pending_evaluations) > 0: + if len(pending) > 0: # We want to use hallucinated results for the evaluations that have # not finished yet. For this we fit a model on the finished # evaluations and add these to the other results to fit another model. self.surrogate_model.fit(train_x, train_y) - ys, _ = self.surrogate_model.predict(self._pending_evaluations) - train_x += self._pending_evaluations + ys, _ = self.surrogate_model.predict(pending) + train_x += pending train_y += list(ys.detach().numpy()) + # TODO: When using a GP, if we've already fit the + # model due to the if stamet above, we only + # need to update the model with the new points. + # fit on all the data again, only the new points... self.surrogate_model.fit(train_x, train_y) self.acquisition.set_state(self.surrogate_model) self.acquisition_sampler.set_state(x=train_x, y=train_y) + for _ in range(self.patience): + config = self.acquisition_sampler.sample(self.acquisition) + if config not in pending: + break + else: + config = space.sample( + patience=self.patience, user_priors=True, ignore_fidelity=False + ) - self._model_update_failed = False - except RuntimeError as runtime_error: + except RuntimeError as e: self.logger.exception( "Model could not be updated due to below error. Sampling will not use" - " the model." 
+ " the model.", + exc_info=e, ) - if self.loss_value_on_error is None or self.cost_value_on_error is None: - raise ValueError( - "A RuntimeError happened and " - "loss_value_on_error or cost_value_on_error " - "value is not provided, please fix the error or " - "provide the values to continue without " - "updating the model" - ) from runtime_error - self._model_update_failed = True - - def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: - if ( - self._num_train_x == 0 - and self.sample_default_first - and self.pipeline_space.has_prior - ): - config = self.pipeline_space.sample_default_configuration( - patience=self.patience, ignore_fidelity=False - ) - elif self._num_train_x == 0 and self._initial_design_size >= 1: - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - elif random.random() < self._random_interleave_prob: - config = self.pipeline_space.sample( - patience=self.patience, ignore_fidelity=False - ) - elif self.is_init_phase() or self._model_update_failed: - # initial design space - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - else: - for _ in range(self.patience): - config = self.acquisition_sampler.sample(self.acquisition) - if config not in self._pending_evaluations: - break - else: - config = self.pipeline_space.sample( + config = space.sample( patience=self.patience, user_priors=True, ignore_fidelity=False ) - config_id = str( - self._num_train_x - + self._num_error_evaluations - + len(self._pending_evaluations) - + 1 - ) - return config.hp_values(), config_id, None + config_id = str(len(trials) + 1) + return SampledConfig( + id=config_id, + config=config.hp_values(), + previous_config_id=None, + ), optimizer_state diff --git a/neps/search_spaces/distributions/__init__.py b/neps/search_spaces/distributions/__init__.py new file mode 100644 index 00000000..65151e66 --- /dev/null +++ b/neps/search_spaces/distributions/__init__.py @@ -0,0 +1,16 @@ +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.distributions.truncnorm import TruncNormDistribution +from neps.search_spaces.distributions.uniform_float import UniformFloatDistribution +from neps.search_spaces.distributions.uniform_int import UniformIntDistribution +from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution + +UNIT_UNIFORM = UniformFloatDistribution.new(0.0, 1.0) + +__all__ = [ + "Distribution", + "TruncNormDistribution", + "UniformFloatDistribution", + "UniformIntDistribution", + "UNIT_UNIFORM", + "WeightedIntsDistribution", +] diff --git a/neps/search_spaces/distributions/distribution.py b/neps/search_spaces/distributions/distribution.py new file mode 100644 index 00000000..7ab4dd6f --- /dev/null +++ b/neps/search_spaces/distributions/distribution.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, TypeVar +from typing_extensions import Protocol + +V = TypeVar("V", int, float) + + +if TYPE_CHECKING: + from torch import Generator, Tensor + + from neps.search_spaces.domain import Domain + + +class Distribution(Protocol[V]): + @property + def domain(self) -> Domain[V]: ... + + def sample(self, n: int, to: Domain, *, seed: Generator) -> Tensor: ... + + def likelihood(self, value: Tensor) -> Tensor: ... 
diff --git a/neps/search_spaces/distributions/truncnorm.py b/neps/search_spaces/distributions/truncnorm.py new file mode 100644 index 00000000..3938cf1c --- /dev/null +++ b/neps/search_spaces/distributions/truncnorm.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +from dataclasses import dataclass +from functools import lru_cache +from typing import TYPE_CHECKING, Any +from typing_extensions import override + +import torch +from torch import Tensor + +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.domain import Domain + +if TYPE_CHECKING: + from neps.utils.types import Number + +INT_HIGH = 1_000_000 + + +@lru_cache +def _truncnorm(a: float, b: float, loc: float, scale: float) -> Any: + from scipy.stats import truncnorm + + return truncnorm(a=a, b=b, loc=loc, scale=scale) + + +@dataclass(frozen=True) +class TruncNormDistribution(Distribution[float]): + domain: Domain[float] + center: float + std: float + truncnorm: Any + + @override + def sample(self, n: int, seed: torch.Generator) -> Tensor: + random_state = torch.randint(INT_HIGH, size=(1,), generator=seed) + rv = self.truncnorm.rvs(size=n, random_state=random_state.item()) + return torch.tensor(rv, dtype=self.domain.dtype) + + @override + def likelihood(self, value: Tensor) -> Tensor: + return self.truncnorm.pdf(value.numpy()) + + def normalize(self) -> TruncNormDistribution: + # Send to unit domain + center = float(self.domain.from_unit(torch.tensor(self.center)).item()) + std = self.std / self.domain.length + + return TruncNormDistribution( + domain=Domain.unit_float(), + center=center, + std=std, + truncnorm=_truncnorm( + a=(0 - center) / std, + b=(1 - center) / std, + loc=center, + scale=std, + ), + ) + + def with_center_and_confidence( + self, + center: Number, + confidence: float, + ) -> TruncNormDistribution: + assert 0 <= confidence <= 1 + assert self.domain.lower <= center <= self.domain.upper + std = 1 - confidence + center = float(center) + return TruncNormDistribution( + domain=self.domain, + center=center, + std=std, + truncnorm=_truncnorm( + a=(self.domain.lower - center) / std, + b=(self.domain.upper - center) / std, + loc=center, + scale=std, + ), + ) + + @classmethod + def new( + cls, + lower: Number, + center: Number, + upper: Number, + *, + std: Number, + std_is_normalized: bool, + ) -> TruncNormDistribution: + assert lower <= center <= upper, f"{lower} <= {center} <= {upper}" + center = float(center) + + if std_is_normalized: + assert 0 <= std <= 1 + std = float((upper - lower) * std) + else: + assert std > 0 + std = float(std) + + return cls( + domain=Domain.float(float(lower), float(upper)), + center=center, + std=std, + truncnorm=_truncnorm( + a=(lower - center) / std, + b=(upper - center) / std, + loc=center, + scale=std, + ), + ) diff --git a/neps/search_spaces/distributions/uniform_float.py b/neps/search_spaces/distributions/uniform_float.py new file mode 100644 index 00000000..bdb43ee8 --- /dev/null +++ b/neps/search_spaces/distributions/uniform_float.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing_extensions import override + +import torch +from torch import Tensor + +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain + +INT_HIGH = 1_000_000 + + +@dataclass(frozen=True) +class UniformFloatDistribution(Distribution[float]): + domain: Domain[float] + _pdf: float = field(repr=False) + + @override + def 
sample(self, n: int, to: Domain, seed: torch.Generator) -> Tensor: + # This creates samples in a unit float domain, rather than + # the `.domain` attribute of this distribution. Rather than scale + # up twice, we just scale directly form the UNIT_FLOAT_DOMAIN + # We still however need the `.domain` attribute for `likelihood` + unit_samples = torch.rand(n, generator=seed) + return to.cast(unit_samples, UNIT_FLOAT_DOMAIN) + + @override + def likelihood(self, value: Tensor) -> Tensor: + return torch.where( + (value >= self.domain.lower) & (value <= self.domain.upper), + self._pdf, + 0.0, + ) + + @classmethod + def new(cls, lower: int | float, upper: int | float) -> UniformFloatDistribution: + _pdf = 1.0 / (upper - lower) + return cls(Domain.float(lower, upper), _pdf=_pdf) + + @classmethod + def unit_distribution(cls) -> UniformFloatDistribution: + return UNIT_UNIFORM_FLOAT + + +UNIT_UNIFORM_FLOAT = UniformFloatDistribution.new(0.0, 1.0) diff --git a/neps/search_spaces/distributions/uniform_int.py b/neps/search_spaces/distributions/uniform_int.py new file mode 100644 index 00000000..8fd7b043 --- /dev/null +++ b/neps/search_spaces/distributions/uniform_int.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING +from typing_extensions import override + +import torch +from torch import Tensor + +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.domain import Domain + +if TYPE_CHECKING: + from neps.utils.types import Number + + +@dataclass(frozen=True) +class UniformIntDistribution(Distribution[int]): + domain: Domain[int] + _pdf: float = field(repr=False) + + @override + def sample(self, n: int, to: Domain, *, seed: torch.Generator) -> Tensor: + samples = torch.randint( + self.domain.lower, + self.domain.upper, + size=(n,), + generator=seed, + ) + return to.cast(samples, frm=self.domain) + + @override + def likelihood(self, value: Tensor) -> Tensor: + return torch.where( + (value >= self.domain.lower) & (value <= self.domain.upper), + self._pdf, + 0.0, + ) + + @classmethod + def indices(cls, n: int) -> UniformIntDistribution: + return cls(Domain.int(0, n - 1), _pdf=1.0 / n) + + @classmethod + def new(cls, lower: Number, upper: Number) -> UniformIntDistribution: + return cls(Domain.int(lower, upper), _pdf=1.0 / (upper - lower)) diff --git a/neps/search_spaces/distributions/weighted_ints.py b/neps/search_spaces/distributions/weighted_ints.py new file mode 100644 index 00000000..3c8c60c5 --- /dev/null +++ b/neps/search_spaces/distributions/weighted_ints.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar, Sequence +from typing_extensions import override + +import torch +from torch import Tensor + +from neps.search_spaces.distributions.distribution import Distribution +from neps.search_spaces.domain import Domain + +if TYPE_CHECKING: + from neps.utils.types import Number + + +@dataclass(frozen=True) +class WeightedIntsDistribution(Distribution[int]): + # NOTE: Having a Million weights is very resource intense and super slow + # for sampling, especially given our common use case is to have only one weight + # with the rest being uniform. 100 is well out of scope for what was intended, + # as this is mostly intended for categoricals. 
+ # If we need this, then we should make a more efficient implementation, + # such as one that uniform samples and then with probability `weight` + # replaces the value with the favoured value. + LIMIT_FOR_WEIGHTED_INTS: ClassVar[int] = 200 + + domain: Domain[int] + weights: Tensor + + @override + def sample(self, n: int, to: Domain, *, seed: torch.Generator) -> Tensor: + rand_tensor = torch.multinomial( + self.weights, + n, + replacement=True, + generator=seed, + ) + return to.cast(rand_tensor, frm=self.domain) + + @override + def likelihood(self, value: Tensor) -> Tensor: + valid_indices = torch.logical_and( + value >= self.domain.lower, value <= self.domain.upper + ) + psuedo_indices = torch.where(valid_indices, value, 0) + probs = self.weights[psuedo_indices] + return torch.where(valid_indices, probs, 0) + + @classmethod + def new(cls, weights: Sequence[Number] | Tensor) -> WeightedIntsDistribution: + if len(weights) > cls.LIMIT_FOR_WEIGHTED_INTS: + raise ValueError( + f"Having {len(weights)} weights is very resource intense and slow" + " for sampling. Consider using a more efficient implementation" + " if you need this many weights.", + ) + return cls( + weights=torch.as_tensor(weights, dtype=torch.float64), + domain=Domain.indices(len(weights)), + ) + + @classmethod + def with_favoured( + cls, + n: int, + favoured: int, + confidence: float, + ) -> WeightedIntsDistribution: + if n > cls.LIMIT_FOR_WEIGHTED_INTS: + raise ValueError( + f"Having {n} weights is very resource intense and slow" + " for sampling. Consider using a more efficient implementation" + " if you need this many weights.", + ) + + assert 0.0 <= confidence <= 1.0 + remaining = 1.0 - confidence + rest = remaining / (n - 1) + if confidence < rest: + warnings.warn( + f"Weight {confidence} is less than the rest {rest}." + " This will make the favoured value less likely to be sampled" + " than the rest of the values.", + UserWarning, + stacklevel=2, + ) + dist = torch.full(size=(n,), fill_value=rest, dtype=torch.float64) + dist[favoured] = confidence + return cls(weights=dist, domain=Domain.indices(n)) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py new file mode 100644 index 00000000..06814862 --- /dev/null +++ b/neps/search_spaces/domain.py @@ -0,0 +1,316 @@ +# TODO: Could theoretically implement dtype,device,out for all methods here but +# would need to be careful not to accidentally send to and from GPU. 
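The NOTE at the top of `WeightedIntsDistribution` sketches a cheaper scheme for its dominant use case (one favoured index, everything else uniform): draw uniform indices and then overwrite each draw with the favoured index with some fixed probability. The following standalone sketch illustrates that idea; `sample_with_favoured` is a hypothetical helper, not part of the patch, and the replacement probability is derived so the favoured index ends up with mass `confidence` while the rest share the remainder evenly (which requires `confidence >= 1 / n_choices`).

from __future__ import annotations

import torch


def sample_with_favoured(
    n_samples: int,
    n_choices: int,
    favoured: int,
    confidence: float,
    *,
    seed: torch.Generator,
) -> torch.Tensor:
    # Probability with which a uniform draw is overwritten by the favoured index,
    # derived from (1 - p) / n_choices == (1 - confidence) / (n_choices - 1).
    assert confidence >= 1.0 / n_choices
    replace_prob = 1.0 - n_choices * (1.0 - confidence) / (n_choices - 1)

    uniform = torch.randint(n_choices, (n_samples,), generator=seed)
    overwrite = torch.rand(n_samples, generator=seed) < replace_prob
    return torch.where(overwrite, torch.tensor(favoured), uniform)


gen = torch.Generator().manual_seed(0)
samples = sample_with_favoured(10_000, n_choices=5, favoured=2, confidence=0.6, seed=gen)
print(torch.bincount(samples, minlength=5) / 10_000)  # roughly [0.1, 0.1, 0.6, 0.1, 0.1]
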
+from __future__ import annotations + +import math +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Generic, TypeVar +from typing_extensions import TypeAlias + +import torch +from torch import Tensor + +if TYPE_CHECKING: + from neps.search_spaces.distributions.truncnorm import TruncNormDistribution + from neps.search_spaces.distributions.uniform_float import ( + UniformFloatDistribution, + ) + from neps.search_spaces.distributions.uniform_int import UniformIntDistribution + from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution + + +Number = int | float +V = TypeVar("V", int, float) +V2 = TypeVar("V2", int, float) + + +@dataclass(frozen=True) +class NumberDomain(Generic[V]): + lower: V + upper: V + round: bool + log_bounds: tuple[float, float] | None = None + bins: int | None = None + + dtype: torch.dtype = field(init=False, repr=False) + is_unit: bool = field(init=False, repr=False) + midpoint: V = field(init=False, repr=False) + is_log: bool = field(init=False, repr=False) + length: V = field(init=False, repr=False) + cardinality: int | None = field(init=False, repr=False) + + def __post_init__(self): + assert isinstance(self.lower, type(self.upper)) + object.__setattr__(self, "is_unit", self.lower == 0 and self.upper == 1) + object.__setattr__(self, "is_log", self.log_bounds is not None) + object.__setattr__( + self, "dtype", torch.int64 if isinstance(self.lower, int) else torch.float64 + ) + object.__setattr__(self, "length", self.upper - self.lower) + + if self.bins: + cardinality = self.bins + elif self.round: + cardinality = int(self.upper - self.lower + 1) + else: + cardinality = None + + object.__setattr__(self, "cardinality", cardinality) + mid = self.from_unit(torch.tensor(0.5)).item() + if self.dtype == torch.int64: + mid = int(round(mid)) + object.__setattr__(self, "midpoint", mid) + + @classmethod + def float( + cls, + lower: Number, + upper: Number, + *, + log: bool = False, + bins: int | None = None, + ) -> NumberDomain[float]: + return NumberDomain( + lower=float(lower), + upper=float(upper), + log_bounds=(math.log(lower), math.log(upper)) if log else None, + bins=bins, + round=False, + ) + + @classmethod + def int( + cls, + lower: Number, + upper: Number, + *, + log: bool = False, + bins: int | None = None, + ) -> NumberDomain[int]: + return NumberDomain( + lower=int(round(lower)), + upper=int(round(upper)), + log_bounds=(math.log(lower), math.log(upper)) if log else None, + round=True, + bins=bins, + ) + + @classmethod + def indices(cls, n: int) -> NumberDomain[int]: + """Create a domain for a range of indices. + + Like range based functions this domain is inclusive of the lower bound + and exclusive of the upper bound. 
+ + Use this method to create a domain for indices + """ + return NumberDomain.int(0, n - 1) + + def to_unit(self, x: Tensor) -> Tensor: + if self.is_unit: + return x # type: ignore + + if self.log_bounds is not None: + x = torch.log(x) + lower, upper = self.log_bounds + else: + lower, upper = self.lower, self.upper + + return (x - lower) / (upper - lower) + + def from_unit(self, x: Tensor) -> Tensor: + if self.is_unit: + return x + + bins = self.bins + if bins is not None: + quantization_levels = torch.floor(x * bins).clip(0, bins - 1) + x = quantization_levels / (bins - 1) + + # Now we scale to the new domain + if self.log_bounds is not None: + lower, upper = self.log_bounds + x = x * (upper - lower) + lower + x = torch.exp(x) + else: + lower, upper = self.lower, self.upper + x = x * (upper - lower) + lower + + if self.round: + x = torch.round(x) + + return x.type(self.dtype) + + def cast( + self, + x: Tensor, + frm: Domain, + ) -> Tensor: + if isinstance(frm, OneHotDomain): + x = torch.argmax(x, dim=1) + frm = frm.int_domain + + # NOTE: In general, we should always be able to go through the unit interval + # [0, 1] to be able to transform between domains. However sometimes we can + # bypass some steps, dependant on the domains, hence the ugliness... + + # Shortcut 1. (Same Domain) + # We can shortcut out going through normalized space if all the boundaries and + # they live on the same scale. However, if their bins don't line up, we will + # have to go through unit space to figure out the bins + same_bounds = self.lower == frm.lower and self.upper == frm.upper + same_log_bounds = self.log_bounds == frm.log_bounds + same_bins = self.bins == frm.bins + if same_bounds and same_log_bounds and (self.bins is None or same_bins): + if self.round: + x = torch.round(x) + return x.type(self.dtype) + + # Shortcut 2. (From normalized) + # The domain we are coming from is already normalized, we only need to lift + if frm.is_unit: + return self.from_unit(x) # type: ignore + + # Shortcut 3. (Log lift) + # We can also shortcut out if the only diffrence is that we are coming frm the + # log bounds of this domain. We dont care if where we came from was binned or not, + # we just lift it up with `np.exp` and round if needed + if (self.lower, self.upper) == frm.log_bounds and self.bins is None: + x = torch.exp(x) + if self.round: + x = torch.round(x) + return x.type(self.dtype) + + # Otherwise, through the unit interval we go + norm = frm.to_unit(x) + lift = self.from_unit(norm) + return lift # noqa: RET504 + + def uniform_distribution(self) -> UniformFloatDistribution | UniformIntDistribution: + from neps.search_spaces.distributions import ( + UNIT_UNIFORM, + UniformFloatDistribution, + UniformIntDistribution, + ) + + # (Log Lift) - sample on it's log domain + if self.log_bounds is not None: + return UniformFloatDistribution.new(*self.log_bounds) + + # (Same Domain) - Just sample integers + if self.dtype == torch.int64 and self.bins is None: + return UniformIntDistribution.new(self.lower, self.upper) + + # NOTE: There's a possibility where you could use an integer distribution for + # binned domains, however the cost of sampling integers and casting is likely + # higher than just casting from normalized domain. 
Would need to verify this + # In any case, Normalized Uniform Float is a safe choice + + # (From Normalized) + return UNIT_UNIFORM + + def unit_uniform_distribution(self) -> UniformFloatDistribution: + from neps.search_spaces.distributions import UNIT_UNIFORM + + return UNIT_UNIFORM + + def truncnorm_distribution( + self, + center: Number, + *, + confidence: float | None = None, + std: float | None = None, + ) -> TruncNormDistribution: + from neps.search_spaces.distributions import TruncNormDistribution + + # If you need a unit one, create this and then call `normalize()` on it. + if std is None and confidence is None: + raise ValueError( + "Must specify either `std` in (lower, upper) or `confidence` in (0, 1)" + ) + + if std is None: + assert 0 <= confidence <= 1 # type: ignore + _std = float(1 - confidence) # type: ignore + _is_normalized = True + else: + _std = float(std) + _is_normalized = False + + # (Log Lift) - sample on it's log domain + if self.log_bounds is not None: + return TruncNormDistribution.new( + lower=self.log_bounds[0], + center=math.log(center), + upper=self.log_bounds[1], + std=_std, + std_is_normalized=_is_normalized, + ) + + # NOTE: There's a possibility where you could use an integer distribution for + # binned domains, however the cost of sampling integers and casting is likely + # higher than just casting from normalized domain. Would need to verify this + # In any case, Normalized Uniform Float is a safe choice + + # (From Normalized) + truncnorm = TruncNormDistribution.new( + lower=self.lower, + center=math.log(center), + upper=self.upper, + std=_std, + std_is_normalized=_is_normalized, + ) + return truncnorm.normalize() + + def weighted_indices_distribution( + self, center_index: int, *, confidence: float + ) -> WeightedIntsDistribution: + from neps.search_spaces.distributions import WeightedIntsDistribution + + if self.cardinality is None: + raise ValueError( + "Cannot create a weighted distribution for a continuous domain!" + ) + if not isinstance(center_index, int): + raise ValueError( + f"Center index must be an integer of type {self.dtype} to" + " create a weighted distribution!" 
+ ) + assert 0 <= confidence <= 1 + + return WeightedIntsDistribution.with_favoured( + n=self.cardinality, + favoured=int(round(center_index)), + confidence=confidence, + ) + + @classmethod + def unit_float(cls) -> NumberDomain[float]: + return UNIT_FLOAT_DOMAIN + + +@dataclass(frozen=True) +class OneHotDomain: + cardinality: int + int_domain: NumberDomain[int] = field(init=False, repr=False) + + def __post_init__(self): + object.__setattr__( + self, + "int_domain", + NumberDomain.indices(self.cardinality), + ) + + def cast(self, x: Tensor, frm: NumberDomain[int]) -> Tensor: + # Convert to integers first + x = self.int_domain.cast(x, frm) + + # Then one hot encode + buffer = torch.zeros((len(x), self.cardinality)) + buffer.scatter_(1, x.unsqueeze(1), 1) + return buffer + + +UNIT_FLOAT_DOMAIN = NumberDomain.float(0.0, 1.0) + +Domain: TypeAlias = NumberDomain | OneHotDomain diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 592cfa88..adcaa121 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,360 +1,442 @@ from __future__ import annotations from dataclasses import dataclass, field -from grakel.utils import graph_from_networkx - -from typing import Any, TypeAlias, TypeVar, Generic -from typing_extensions import Self, override, Self -from itertools import chain -import torch -from neps.search_spaces import ( - CategoricalParameter, - IntegerParameter, - FloatParameter, +from typing import ( + TYPE_CHECKING, + Any, + Generic, + Sequence, + Sized, + TypeAlias, + TypeVar, + overload, ) +from typing_extensions import Protocol, override -from neps.search_spaces.search_space import SearchSpace, Parameter - -WLInput: TypeAlias = tuple[dict, dict | None, dict | None] - +import numpy as np +import numpy.typing as npt +import torch +from grakel.utils import graph_from_networkx -@dataclass -class GraphEncoder: - hps: tuple[str] +from neps.search_spaces.domain import ( + UNIT_FLOAT_DOMAIN, + Domain, + NumberDomain, + OneHotDomain, +) - def encode( - self, - x: list[dict[str, Any]], - space: SearchSpace, - ) -> dict[str, list[WLInput]]: - return {hp: [config[hp].value for config in x] for hp in self.hps} +if TYPE_CHECKING: + import networkx as nx + from neps.search_spaces.search_space import SearchSpace +WLInput: TypeAlias = tuple[dict, dict | None, dict | None] +V = TypeVar("V", int, float) T = TypeVar("T") -@dataclass -class Transformer(Generic[T]): - hps: tuple[str] - - def encode(self, x: list[dict[str, Any]], space: SearchSpace) -> T: ... +class Transformer(Protocol[T]): + def encode(self, x: Sequence[Any]) -> T: ... - def value_decode(self, x: T, space: SearchSpace) -> dict[str, list[Any]]: ... + def decode(self, x: T) -> list[Any]: ... - def decode(self, x: T, space: SearchSpace) -> list[dict[str, Any]]: - values = self.value_decode(x, space) - return [(dict(zip(values, t))) for t in zip(*values.values())] +class TensorTransformer(Transformer[torch.Tensor], Protocol): + domain: Domain + output_cols: int -@dataclass -class WLInputTransformer(Transformer[WLInput]): def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, - ) -> dict[str, list[WLInput]]: - _graphs: dict[str, list[WLInput]] = {} - for hp_name in space.graphs.keys(): - gs = [conf[hp_name].value for conf in x] - _graphs[hp_name] = graph_from_networkx(gs) # type: ignore + x: list[Any], + *, + out: torch.Tensor | None = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: ... 
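The optional `out` parameter in the protocol above is what allows a single pre-allocated `(n_configs, total_width)` tensor to be filled one column block at a time instead of concatenating a tensor per transformer; this is how `TensorEncoder`, further down in this file, calls `encode`. A plain-torch illustration of that buffer-slicing pattern, with made-up values standing in for a numeric column and a 3-wide one-hot block:

import torch

values = [0.25, 0.5, 1.0]   # pretend output of a numeric transformer
choices = [1, 0, 2]         # pretend integer codes for a 3-choice categorical

n, width = len(values), 1 + 3
buffer = torch.zeros((n, width), dtype=torch.float64)

# The numeric transformer writes into its single column of the shared buffer.
buffer[:, 0] = torch.tensor(values, dtype=torch.float64)

# The one-hot transformer scatters into its own slice, columns 1..3.
one_hot = buffer[:, 1:4]  # a view, so writes land directly in `buffer`
one_hot.scatter_(1, torch.tensor(choices).unsqueeze(1), 1)

print(buffer)
# rows: [0.25, 0, 1, 0], [0.5, 1, 0, 0], [1.0, 0, 0, 1]
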
- return _graphs - def value_decode( - self, - x: dict[str, list[WLInput]], - space: SearchSpace, - ) -> dict[str, list[Any]]: - raise NotImplementedError("Cannot decode WLInput to values.") +@dataclass +class CategoricalToIntegerTransformer(TensorTransformer): + choices: list[Any] + domain: NumberDomain = field(init=False) + output_cols: int = field(init=False) + _lookup: dict[Any, int] | None = field(init=False) -@dataclass -class TensorTransformer(Transformer[torch.Tensor]): - def output_cols(self, space: SearchSpace) -> int: ... + def __post_init__(self): + assert len(self.choices) > 0 + + self.domain = NumberDomain.indices(len(self.choices)) + self.output_cols = 1 + if len(self.choices) > 3: + try: + self._lookup = {c: i for i, c in enumerate(self.choices)} + except TypeError: + self._lookup = None + @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[Any], *, - device: torch.device | None = None, + out: torch.Tensor | None = None, dtype: torch.dtype | None = None, + device: torch.device | None = None, ) -> torch.Tensor: - width = len(self.hps) - buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) - - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, CategoricalParameter) - values = torch.tensor( - [config[name]._value_index for config in x], dtype=dtype, device=device - ) + if dtype is None: + dtype = torch.int if out is None else out.dtype - return buffer + values = ( + [self._lookup[c] for c in x] + if self._lookup + else [self.choices.index(c) for c in x] + ) + + if out is None: + return torch.tensor(values, dtype=dtype, device=device) + assert out.shape == (len(x),), f"{out.shape} != {(len(x),)}" + out[:] = torch.tensor(values, dtype=out.dtype, device=out.device) + return out + @override + def decode(self, x: torch.Tensor) -> list[Any]: + return [self.choices[i] for i in x] + + +# TODO: Maybe add a shift argument, could be useful to have `0` as midpoint +# and `-0.5` as lower bound with `0.5` as upper bound. 
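Assuming this patch is applied, `CategoricalToIntegerTransformer` round-trips raw categorical values to integer codes and back. The sketch below uses four made-up choices so the dict lookup built in `__post_init__` is exercised, and also shows the `out=` buffer path that `TensorEncoder` relies on:

import torch

from neps.search_spaces.encoding import CategoricalToIntegerTransformer

t = CategoricalToIntegerTransformer(choices=["adam", "sgd", "rmsprop", "adamw"])

encoded = t.encode(["sgd", "adamw", "adam"])  # tensor([1, 3, 0]) of integer codes
decoded = t.decode(encoded)                   # ["sgd", "adamw", "adam"]

# Encoding straight into a caller-owned buffer, the way TensorEncoder fills
# one column slice per transformer:
out = torch.empty(3, dtype=torch.float64)
t.encode(["sgd", "adamw", "adam"], out=out)
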
@dataclass -class IntegerCategoricalTransformer(TensorTransformer): - def output_cols(self, space: SearchSpace) -> int: - return len(self.hps) +class MinMaxNormalizer(TensorTransformer, Generic[V]): + original_domain: NumberDomain[V] + + domain: NumberDomain[float] = field(init=False) + output_cols: int = field(init=False) + + def __post_init__(self): + self.domain = UNIT_FLOAT_DOMAIN + self.output_cols = 1 @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[V], *, + out: torch.Tensor | None = None, dtype: torch.dtype | None = None, device: torch.device | None = None, ) -> torch.Tensor: - if dtype is None: - dtype = torch.int - - buffer = torch.empty(size=(len(x), len(self.hps)), dtype=dtype, device=device) - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, CategoricalParameter) - values = torch.tensor( - [config[name].value for config in x], dtype=dtype, device=device - ) - buffer[:, i] = values + if out is not None: + dtype = out.dtype + device = out.device + else: + dtype = torch.float64 if dtype is None else dtype - return buffer + values = torch.tensor(list(x), dtype=dtype, device=device) + values = self.domain.cast(values, frm=self.original_domain) + if out is None: + return values - @override - def value_decode(self, x: torch.Tensor, space: SearchSpace) -> dict[str, list[Any]]: - values: dict[str, list[Any]] = {} - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, CategoricalParameter) - enc = x[:, i] - values[name] = [hp.choices[i] for i in enc.tolist()] + assert out.shape == (len(x),), f"{out.shape} != {(len(x),)}" + out[:] = values + return out - return values + @override + def decode(self, x: torch.Tensor) -> list[V]: + values = self.original_domain.from_unit(x) + return values.tolist() @dataclass -class MinMaxNormalizer(TensorTransformer): - def output_cols(self, space: SearchSpace) -> int: - return len(self.hps) +class OneHotEncoder(TensorTransformer): + choices: list[Any] + + domain: OneHotDomain = field(init=False) + output_cols: int = field(init=False) + categorical_to_integer: CategoricalToIntegerTransformer = field(init=False) + + def __post_init__(self): + self.categorical_to_integer = CategoricalToIntegerTransformer(self.choices) + self.output_cols = len(self.choices) @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[Any], *, + out: torch.Tensor | None = None, dtype: torch.dtype | None = None, device: torch.device | None = None, ) -> torch.Tensor: - if dtype is None: - dtype = torch.float64 + if out is not None: + dtype = out.dtype + device = out.device + else: + dtype = torch.float64 if dtype is None else dtype + + ints = self.categorical_to_integer.encode(x, dtype=torch.int64, device=device) + shape = (len(x), self.output_cols) + if out is None: + buffer = torch.zeros(size=shape, dtype=dtype, device=device) + else: + assert out.shape == shape, f"{out.shape} != {shape}" + buffer = out + + cat_tensor = torch.tensor(ints, dtype=torch.int64, device=device).unsqueeze(1) + buffer.scatter_(1, cat_tensor, 1) + return buffer - width = len(self.hps) - buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) + @override + def decode(self, x: torch.Tensor) -> list[Any]: + ints = torch.argmax(x, dim=1) + return self.categorical_to_integer.decode(ints) - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, (FloatParameter, IntegerParameter)) - values = torch.tensor( - [config[name].value for config in x], 
dtype=dtype, device=device - ) - if hp.log_bounds: - lower, upper = hp.log_bounds - buffer[:, i] = (torch.log(values) - lower) / (upper - lower) - else: - lower, upper = hp.lower, hp.upper - buffer[:, i] = (values - lower) / (upper - lower) - return buffer +@dataclass +class WLInputTransformer(Transformer[WLInput]): + hp: str - @override - def value_decode( - self, - x: torch.Tensor, - space: SearchSpace, - ) -> dict[str, list[Any]]: - values: dict[str, list[Any]] = {} + def encode(self, x: Sequence[nx.Graph]) -> list[WLInput]: + return [graph_from_networkx(g) for g in x] # type: ignore - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, (FloatParameter, IntegerParameter)) - enc = x[:, i] - if hp.log_bounds: - lower, upper = hp.log_bounds - enc = torch.exp(enc * (upper - lower) + lower) - else: - lower, upper = hp.lower, hp.upper - enc = enc * (upper - lower) + lower + def decode(self, x: dict[str, list[WLInput]]) -> dict[str, list[Any]]: + raise NotImplementedError("Cannot decode WLInput to values.") - if isinstance(hp, IntegerParameter): - enc = torch.round(enc).to(torch.int) - values[name] = enc.tolist() +@dataclass +class GraphEncoder: + transformers: dict[str, WLInputTransformer] + column_lookup: dict[str, int] = field(init=False) + + def __post_init__(self): + transformers = sorted(self.transformers.items(), key=lambda t: t[0]) + self.transformers = dict(transformers) + self.column_lookup: dict[str, int] = { + name: i for i, (name, _) in enumerate(self.transformers.items()) + } + + def select( + self, x: npt.NDArray[np.object_], hp: str | Sequence[str] + ) -> npt.NDArray[np.object_]: + # Kind of a redundant function but made to be compatible with TensorPack + if isinstance(hp, str): + return x[:, self.column_lookup[hp]] + + return x[:, [self.column_lookup[h] for h in hp]] + + def encode(self, x: list[SearchSpace]) -> npt.NDArray[np.object_]: + buffer = np.empty((len(x), len(self.transformers)), dtype=np.object_) + for hp, transformer in self.transformers.items(): + values = [conf[hp].value for conf in x] + buffer[:, self.column_lookup[hp]] = transformer.encode(values) # type: ignore + return buffer - return values + def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: + raise NotImplementedError("Cannot decode graph embeddings.") @dataclass -class StandardNormalizer(TensorTransformer): - std_means: dict[str, tuple[float, float]] = field(default_factory=dict) +class TensorEncoder: + transformers: dict[str, TensorTransformer] + column_lookup: dict[str, tuple[int, int]] = field(init=False) - def output_cols(self, space: SearchSpace) -> int: - return len(self.hps) + def __post_init__(self): + transformers = sorted( + self.transformers.items(), key=lambda t: (t[1].output_cols, t[0]) + ) + self.transformers = dict(transformers) + self.column_lookup: dict[str, tuple[int, int]] = {} + offset = 0 + for name, transformer in self.transformers.items(): + self.column_lookup[name] = (offset, offset + transformer.output_cols) + offset += transformer.output_cols + + def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: + if isinstance(hp, str): + return x[:, slice(*self.column_lookup[hp])] + cols = torch.concatenate([torch.arange(*self.column_lookup[h]) for h in hp]) + return x[:, cols] - @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[SearchSpace], *, - dtype: torch.dtype | None = None, device: torch.device | None = None, ) -> torch.Tensor: - if dtype is None: - dtype = torch.float64 - - 
width = len(self.hps) - buffer = torch.empty(size=(len(x), width), dtype=dtype, device=device) - std_means: dict[str, tuple[float, float]] = {} - - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, (FloatParameter, IntegerParameter)) - values = torch.tensor( - [config[name].value for config in x], dtype=dtype, device=device + width = sum(t.output_cols for t in self.transformers.values()) + buffer = torch.empty((len(x), width), dtype=torch.float64, device=device) + + for hp_name, transformer in self.transformers.items(): + values = [conf[hp_name] for conf in x] + lookup = self.column_lookup[hp_name] + + # Encode directly into buffer + transformer.encode( + values, + out=buffer[:, slice(*lookup)], + dtype=torch.float64, + device=device, ) - if hp.log_bounds: - values = torch.log(values) - - mean, std = values.mean(), values.std() - std_means[name] = (mean.item(), std.item()) - - buffer[:, i] = (values - mean) / std - self.std_means = std_means return buffer - @override - def value_decode(self, x: torch.Tensor, space: SearchSpace) -> dict[str, list[Any]]: + def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} + for hp_name, transformer in self.transformers.items(): + lookup = self.column_lookup[hp_name] + values[hp_name] = transformer.decode(x[:, slice(*lookup)]) - for i, name in enumerate(self.hps): - hp = space[name] - assert isinstance(hp, Parameter) - enc = x[:, i] - if isinstance(hp, (FloatParameter, IntegerParameter)): - std, mean = self.std_means[name] - if hp.log_bounds: - enc = torch.exp(enc * std + mean) - else: - enc = enc * std + mean - - if isinstance(hp, IntegerParameter): - enc = torch.round(enc).to(torch.int) - - values[name] = enc.tolist() - else: - raise ValueError(f"Invalid hyperparameter type: {type(hp)}") - - return values + keys = list(values.keys()) + return [dict(zip(keys, vals)) for vals in zip(*values.values())] @dataclass -class OneHotEncoder(TensorTransformer): - def output_cols(self, space: SearchSpace) -> int: - return sum(len(hp.choices) for hp in (space[name] for name in self.hps)) # type: ignore +class DataEncoder: + tensors: TensorEncoder | None = None + graphs: GraphEncoder | None = None - @override def encode( self, - x: list[dict[str, Any]], - space: SearchSpace, + x: list[SearchSpace], *, - dtype: torch.dtype | None = None, device: torch.device | None = None, - ) -> torch.Tensor: - if dtype is None: - dtype = torch.bool + ) -> tuple[torch.Tensor | None, npt.NDArray[np.object_] | None]: + tensor = self.tensors.encode(x, device=device) if self.tensors else None + graphs = self.graphs.encode(x) if self.graphs else None + return tensor, graphs - categoricals: dict[str, CategoricalParameter] = {} - for name in self.hps: - hp = space[name] - assert isinstance(hp, CategoricalParameter) - categoricals[name] = hp + @overload + def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: ... - width = sum(len(hp.choices) for hp in categoricals.values()) - buffer = torch.zeros(size=(len(x), width), dtype=dtype, device=device) + @overload + def select( + self, x: npt.NDArray[np.object_], hp: str | Sequence[str] + ) -> npt.NDArray[np.object_]: ... 
- offset = 0 - for name, hp in categoricals.items(): - n_choices = len(hp.choices) - _xs = [config[name]._value_index for config in x] - cat_tensor = torch.tensor(_xs, dtype=torch.int64, device=device).unsqueeze(1) - buffer[:, offset : offset + n_choices].scatter_(1, cat_tensor, 1) - offset += n_choices - - return buffer - - @override - def value_decode( + def select( self, - x: torch.Tensor, - space: SearchSpace, - ) -> dict[str, list[Any]]: - values: dict[str, list[Any]] = {} + x: torch.Tensor | npt.NDArray[np.object_], + hp: str | Sequence[str], + ) -> torch.Tensor | npt.NDArray[np.object_]: + if isinstance(x, torch.Tensor): + assert self.tensors is not None + return self.tensors.select(x, hp) - offset = 0 - for name in self.hps: - hp = space[name] - assert isinstance(hp, CategoricalParameter) - n_choices = len(hp.choices) - enc = x[:, offset : offset + n_choices].argmax(dim=1) + assert self.graphs is not None + return self.graphs.select(x, hp) - values[name] = [hp.choices[i] for i in enc] - offset += n_choices - - return values + def decode_dicts( + self, + x: torch.Tensor + | npt.NDArray[np.object_] + | tuple[torch.Tensor | None, npt.NDArray[np.object_] | None], + ) -> list[dict[str, Any]]: + if isinstance(x, tuple): + tensors, graphs = x + elif isinstance(x, torch.Tensor): + tensors, graphs = x, None + else: + tensors, graphs = None, x + + tensor_values: list[dict[str, Any]] | None = None + if tensors is not None: + assert self.tensors is not None + tensor_values = self.tensors.decode_dicts(tensors) + + graph_values: list[dict[str, Any]] | None = None + if graphs is not None: + assert self.graphs is not None + graph_values = self.graphs.decode_dicts(graphs) + + if tensor_values is not None and graph_values is not None: + assert len(tensor_values) == len(graph_values) + return [{**t, **g} for t, g in zip(tensor_values, graph_values)] + + if tensor_values is not None: + return tensor_values + + assert graph_values is not None + return graph_values @dataclass -class JointTransformer(TensorTransformer): - transforms: tuple[TensorTransformer, ...] - - def output_cols(self, space: SearchSpace) -> int: - return sum(t.output_cols(space) for t in self.transforms) +class DataPack(Sized): + space: SearchSpace + encoder: DataEncoder + numerical: torch.Tensor | None = None + graphs: npt.NDArray[np.object_] | None = None + _len: int = field(init=False) + + def __post_init__(self): + if self.numerical is not None and self.graphs is not None: + assert len(self.numerical) == len(self.graphs) + self._len = len(self.numerical) + elif self.numerical is not None: + self._len = len(self.numerical) + elif self.graphs is not None: + self._len = len(self.graphs) + else: + raise ValueError("At least one of numerical or graphs must be provided") + + def __len__(self) -> int: + return self._len + + def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.object_]: + if isinstance(hp, str): + if self.encoder.tensors and hp in self.encoder.tensors.transformers: + assert self.numerical is not None + return self.encoder.tensors.select(self.numerical, hp) + + if self.encoder.graphs and hp in self.encoder.graphs.transformers: + assert self.graphs is not None + return self.encoder.graphs.select(self.graphs, hp) + + tkeys = ( + None + if self.encoder.tensors is None + else self.encoder.tensors.transformers.keys() + ) + gkeys = ( + None + if self.encoder.graphs is None + else self.encoder.graphs.transformers.keys() + ) + raise KeyError( + f"Unknown hyperparameter {hp}. 
Not in either tensors or graphs" + f"\nTensors: {tkeys}" + f"\nGraphs: {gkeys}" + ) - @classmethod - def join(cls, *transforms: TensorTransformer) -> Self: - hps = tuple(chain.from_iterable(t.hps for t in transforms)) - return cls(hps, transforms) + all_in_tensors = False + all_in_graphs = False + tkeys = None + gkeys = None + if self.encoder.tensors: + all_in_tensors = all(h in self.encoder.tensors.transformers for h in hp) + + if self.encoder.graphs: + all_in_graphs = all(h in self.encoder.graphs.transformers for h in hp) + gkeys = self.encoder.graphs.transformers.keys() + + if not all_in_tensors and not all_in_graphs: + raise ValueError( + "Cannot select from both tensors and graphs!" + f"Got keys: {hp}" + f"\nTensors: {tkeys}" + f"\nGraphs: {gkeys}" + ) - @override - def encode( - self, - x: list[dict[str, Any]], - space: SearchSpace, - *, - dtype: torch.dtype | None = None, - device: torch.device | None = None, - ) -> torch.Tensor: - return torch.cat( - [t.encode(x, space, dtype=dtype, device=device) for t in self.transforms], - dim=1, - ) + if all_in_tensors: + assert self.numerical is not None + assert self.encoder.tensors is not None + return self.encoder.tensors.select(self.numerical, hp) - @override - def value_decode( - self, - x: torch.Tensor, - space: SearchSpace, - ) -> dict[str, list[Any]]: - values: dict[str, list[Any]] = {} - offset = 0 - for t in self.transforms: - width = t.output_cols(space) - t_values = t.value_decode(x[:, offset : offset + width], space) - values.update(t_values) - offset += width + assert self.graphs is not None + assert self.encoder.graphs is not None + return self.encoder.graphs.select(self.graphs, hp) - return values + def decode(self) -> list[SearchSpace]: + return [ + self.space.from_dict(d) + for d in self.encoder.decode_dicts((self.numerical, self.graphs)) + ] diff --git a/neps/search_spaces/neighborhoods.py b/neps/search_spaces/neighborhoods.py new file mode 100644 index 00000000..91c34a6f --- /dev/null +++ b/neps/search_spaces/neighborhoods.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +from typing import TypeVar + +import numpy as np + +from neps.search_spaces.domain import Domain +from neps.utils.types import Arr, f64, i64 + +V = TypeVar("V", f64, i64) + +UNIQUE_NEIGHBOR_GENERATOR_N_RETRIES = 8 +UNIQUE_NEIGHBOR_GENERATOR_SAMPLE_MULTIPLIER = 4 + +NON_UNIQUE_NEIGHBORS_N_RETRIES = 8 +NON_UNIQUE_NEIGHBORS_SAMPLE_MULTIPLIER = 4 + +# Small enough but prevents needing to keep re-allocating temporary memory +# 50 * 8 = 400 bytes +_SMALL = 50 +_SMALL_CACHED_ARANGE = np.arange(_SMALL, dtype=i64) + + +def unorded_finite_neighbors( + pivot: V, + domain: Domain[V], + *, + n: int, + seed: np.random.Generator, +) -> Arr[V]: + N = domain.cardinality + assert N is not None, "Domain must be finite." + if N <= _SMALL: + full_range = _SMALL_CACHED_ARANGE[: domain.cardinality] + else: + full_range = np.arange(N, dtype=i64) + + range_domain = Domain.indices(N) + _pivot = range_domain.cast(pivot, frm=domain) + + left = full_range[:_pivot] + right = full_range[_pivot + 1 :] + _range = np.concatenate((left, right)) + + seed.shuffle(_range) + + return domain.cast(_range[:n], frm=range_domain) + + +def neighbors( + pivot: V, + domain: Domain[V], + *, + n: int, + std: float, + seed: np.random.Generator, + n_retries: int = NON_UNIQUE_NEIGHBORS_N_RETRIES, + sample_multiplier: int = NON_UNIQUE_NEIGHBORS_SAMPLE_MULTIPLIER, +) -> Arr[V]: + """Create a neighborhood of `n` neighbors around `pivot` with a normal distribution. 
+ + If you need unique neighbors, you should use + [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood]. + + !!! tip + + [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood] + is quite expensive in certain situations as it has to repeatedly sample and check + for uniqueness. If you can afford duplicates, use this function instead. + + If [`domain.cardinality == None`][neps.search_spaces.domain.Domain.cardinality], + and you can afford an infentesimally small percentage change of duplicates, + you should use this function instead. + + !!! warning + + It is up to the caller to ensure that the pivot lies within the domain, + including at one of the bins if the domain is quantized. + + Args: + pivot: The center of the neighborhood. + domain: The domain to get neighbors from. + n: The number of neighbors to generate. + std: The standard deviation of the normal distribution. + seed: The random seed to use. + n_retries: + The number of retries to attempt to generate unique neighbors. + Each retry increases the standard deviation of the normal distribution to + prevent rejection sampling from failing. + sample_multiplier: + A multiplier which multiplies by `n` to determine the number of samples to + generate for try. By oversampling, we prevent having to repeated calls to + sampling. This prevents having to do more rounds of sampling when too many + samples are out of bounds, useful for when the `pivot` is near the bounds. + + Tuning this may be beneficial in unique circumstances, however we advise + leaving this as a default. + + Returns: + An array of `n` neighbors around `pivot`. + """ + # Generate batches of n * BUFFER_MULTIPLIER candidates, filling the above + # buffer until we have enough valid candidates. + # We should not overflow as the buffer + offset = 0 + SAMPLE_SIZE = n * sample_multiplier + BUFFER_SIZE = (n + 1) * sample_multiplier + + # We extend the range of stds to try to find neighbors + neighbors: Arr[V] = np.empty(BUFFER_SIZE, dtype=domain.dtype) + stds = np.linspace(std, 1.0, n_retries + 1, endpoint=True) + + lower = domain.lower + upper = domain.upper + range_size = upper - lower + sample_domain = Domain.float(lower, upper) + + for _std in stds: + candidates = seed.normal(pivot, _std * range_size, size=(SAMPLE_SIZE,)) + + bounded_candidates = candidates[(candidates >= lower) & (candidates <= upper)] + maybe_valid = domain.cast(bounded_candidates, frm=sample_domain) + + # High chance of overlap with original point if there's a finite amount of + # possible elements + if domain.cardinality is not None: + valid = maybe_valid[maybe_valid != pivot] + else: + valid = maybe_valid + + n_candidates = len(valid) + neighbors[offset : offset + n_candidates] = valid + offset += n_candidates + + if offset >= n: + return neighbors[:n] + + raise ValueError( + f"Failed to find enough neighbors with {n_retries} retries." + f" Given {n} neighbors, we only found {offset}." 
+        f" The normal distributions used for sampling neighbors were"
+        f" Normal(mu={pivot}, sigma={list(stds)}),"
+        f" which were meant to find neighbors of {pivot},"
+        " which was expected to be in the range"
+        f" ({lower}, {upper}).",
+    )
+
+
+def unique_neighborhood(
+    pivot: V,
+    domain: Domain[V],
+    *,
+    n: int,
+    seed: np.random.Generator,
+    std: float,
+    n_retries: int = UNIQUE_NEIGHBOR_GENERATOR_N_RETRIES,
+    sample_multiplier: int = UNIQUE_NEIGHBOR_GENERATOR_SAMPLE_MULTIPLIER,
+) -> Arr[V]:
+    """Create a neighborhood of `n` unique neighbors around `pivot` with a normal distribution.
+
+    The neighborhood is created by sampling from a normal distribution centered around
+    `pivot` with a standard deviation of `std`. The samples are then quantized into the
+    range `[lower, upper]`, using the domain's bins if it has any. The number of samples
+    is `n`.
+
+    !!! tip
+
+        [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood]
+        is quite expensive in certain situations as it has to repeatedly sample and
+        check for uniqueness. If you can afford duplicates, use
+        [`neighbors`][neps.search_spaces.neighborhoods.neighbors] instead.
+
+        If [`domain.cardinality == None`][neps.search_spaces.domain.Domain.cardinality],
+        and you can afford an infinitesimally small chance of duplicates,
+        you should also use [`neighbors`][neps.search_spaces.neighborhoods.neighbors].
+
+    !!! warning
+
+        If there are not enough unique neighbors to sample from, the function will
+        return fewer than `n` neighbors.
+
+    !!! warning
+
+        It is up to the caller to ensure that the pivot lies within the domain,
+        including at one of the bins if the domain is quantized.
+
+    Args:
+        pivot: The center of the neighborhood.
+        domain: The domain to get neighbors from.
+        n: The number of neighbors to generate.
+        std: The standard deviation of the normal distribution.
+        seed: The random seed to use.
+        n_retries:
+            The number of retries to attempt to generate unique neighbors.
+            Each retry increases the standard deviation of the normal distribution
+            to prevent rejection sampling from failing.
+        sample_multiplier:
+            A multiplier applied to `n` to determine the number of samples to
+            generate per try. By oversampling, we avoid repeated calls to both
+            sampling and uniqueness checking.
+
+            However, oversampling is a tradeoff: when `std` is not high enough to
+            produce `n` unique neighbors, it effectively samples more of the same
+            duplicates.
+
+            Tuning this may be beneficial in unusual circumstances; however, we
+            advise leaving it at the default.
+
+    Returns:
+        An array of `n` neighbors around `pivot`, or fewer than `n` if not enough
+        unique neighbors could be generated.
+    """  # noqa: E501
+    # Different from other neighborhoods as it's unnormalized and
+    # the quantization is directly integers.
+    assert n < 1000000, "Can only generate less than 1 million neighbors."
+    assert 0 < std < 1.0, "Standard deviation must be in the range (0, 1)."
+    lower = domain.lower
+    upper = domain.upper
+
+    # In the easiest case, we have a domain with finite elements and we need
+    # more neighbors than are possible. We then generate all of them.
+    # We can do this simply with a range and removing the pivot.
+ if domain.cardinality is not None and n >= domain.cardinality - 1: + range_domain = Domain.indices(domain.cardinality) + int_pivot = range_domain.cast(pivot, frm=domain) + + if int_pivot == 0: + _range = np.arange(1, domain.cardinality, dtype=i64) + return domain.cast(_range, frm=range_domain) + + if int_pivot == domain.cardinality - 1: + _range = np.arange(0, domain.cardinality - 1, dtype=i64) + return domain.cast(_range, frm=range_domain) + + left = np.arange(0, int_pivot, dtype=i64) + right = np.arange(int_pivot + 1, domain.cardinality, dtype=i64) + _range = np.concatenate((left, right)) + + return domain.cast(_range, frm=range_domain) + + # Otherwise, we use a repeated sampling strategy where we slowly increase the + # std of a normal, centered on `center`, slowly expanding `std` such that + # rejection won't fail. + + # We set up a buffer that can hold the number of neighbors we need, plus some + # extra excess from sampling, preventing us from having to reallocate memory. + # We also include the initial value in the buffer, as we will remove it later. + SAMPLE_SIZE = n * sample_multiplier + BUFFER_SIZE = n * (sample_multiplier + 1) + neighbors = np.empty(BUFFER_SIZE + 1, dtype=domain.dtype) + neighbors[0] = pivot + offset = 1 # Indexes into current progress of filling buffer + stds = np.linspace(std, 1.0, n_retries + 1, endpoint=True) + sample_domain = Domain.float(lower, upper) + + range_size = upper - lower + for _std in stds: + # Generate candidates in vectorized space + candidates = seed.normal(pivot, _std * range_size, size=SAMPLE_SIZE) + valid = (candidates >= lower) & (candidates <= upper) + + candidates = domain.cast(x=candidates[valid], frm=sample_domain) + + # Find new unique neighbors + uniq = np.unique(candidates) + new_uniq = np.setdiff1d(uniq, neighbors[:offset], assume_unique=True) + + n_new_unique = len(new_uniq) + neighbors[offset : offset + n_new_unique] = new_uniq + offset += n_new_unique + + # We have enough neighbors, we can stop + if offset - 1 >= n: + # Ensure we don't include the initial value point + return neighbors[1 : n + 1] + + raise ValueError( + f"Failed to find enough neighbors with {n_retries} retries." + f" Given {n=} neighbors to generate, we only found {offset - 1}." + f" The normal's for sampling neighbors were Normal({pivot}, {list(stds)})" + f" which were meant to find neighbors of {pivot}. 
in the range" + f" ({lower}, {upper}).", + ) diff --git a/neps/search_spaces/samplers/__init__.py b/neps/search_spaces/samplers/__init__.py new file mode 100644 index 00000000..784b5aa4 --- /dev/null +++ b/neps/search_spaces/samplers/__init__.py @@ -0,0 +1,9 @@ +from neps.search_spaces.samplers.prior import PriorSampler +from neps.search_spaces.samplers.sampler import Sampler +from neps.search_spaces.samplers.uniform import UniformSampler + +__all__ = [ + "Sampler", + "UniformSampler", + "PriorSampler", +] diff --git a/neps/search_spaces/samplers/model.py b/neps/search_spaces/samplers/model.py new file mode 100644 index 00000000..c413b6bf --- /dev/null +++ b/neps/search_spaces/samplers/model.py @@ -0,0 +1,186 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any, Mapping + +import numpy as np + +from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping +from neps.optimizers.bayesian_optimization.acquisition_samplers import ( + AcquisitionSamplerMapping, +) +from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels +from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping +from neps.search_spaces.samplers.sampler import Sampler +from neps.search_spaces.samplers.uniform import UniformSampler +from neps.utils.common import instance_from_map + +logger = logging.getLogger(__name__) + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces import SearchSpace + from neps.utils.types import Number + + +class ModelPolicy(Sampler): + """A policy for sampling configuration, i.e. the default for SH / hyperband. + + Args: + SamplingPolicy ([type]): [description] + """ + + def __init__( + self, + *, + space: SearchSpace, + surrogate_model: str | Any = "gp", + surrogate_model_args: Mapping[str, Any] | None = None, + domain_se_kernel: str | None = None, + graph_kernels: list | None = None, + hp_kernels: list | None = None, + acquisition: str | BaseAcquisition | type[BaseAcquisition] = "EI", + acquisition_sampler: ( + str | AcquisitionSampler | type[AcquisitionSampler] + ) = "random", + patience: int = 100, + ): + surrogate_model_args = dict(surrogate_model_args) if surrogate_model_args else {} + + graph_kernels, hp_kernels = get_kernels( + pipeline_space=space, + domain_se_kernel=domain_se_kernel, + graph_kernels=graph_kernels, + hp_kernels=hp_kernels, + optimal_assignment=False, + ) + + if "graph_kernels" not in surrogate_model_args: + surrogate_model_args["graph_kernels"] = None + + if "hp_kernels" not in surrogate_model_args: + surrogate_model_args["hp_kernels"] = hp_kernels + + if not surrogate_model_args["hp_kernels"]: + raise ValueError("No kernels are provided!") + + if "vectorial_features" not in surrogate_model_args: + # TODO: Graph gets ignored? 
+ surrogate_model_args["vectorial_features"] = { + "continuous": len(space.numericals), + "categorical": len(space.categoricals), + } + + # TODO: What the hell type is this + self.surrogate_model: Any = instance_from_map( + SurrogateModelMapping, + surrogate_model, + name="surrogate model", + kwargs=surrogate_model_args, + ) + + self.acquisition: BaseAcquisition = instance_from_map( + AcquisitionMapping, + acquisition, # type: ignore + name="acquisition function", + ) + + self.acquisition_sampler: AcquisitionSampler = instance_from_map( + AcquisitionSamplerMapping, + acquisition_sampler, # type: ignore + name="acquisition sampler function", + kwargs={"patience": patience, "pipeline_space": space}, + ) + self.uniform_sampler = UniformSampler.new(space) + + def _fantasize_pending(self, train_x, train_y, pending_x): + if len(pending_x) == 0: + return train_x, train_y + + self.surrogate_model.fit(train_x, train_y) + # hallucinating: predict for the pending evaluations + _y, _ = self.surrogate_model.predict(pending_x) + _y = _y.detach().numpy().tolist() + # appending to training data + train_x.extend(pending_x) + train_y.extend(_y) + return train_x, train_y + + def update_model(self, train_x, train_y, pending_x, decay_t=None): + if decay_t is None: + decay_t = len(train_x) + train_x, train_y = self._fantasize_pending(train_x, train_y, pending_x) + self.surrogate_model.fit(train_x, train_y) + self.acquisition.set_state(self.surrogate_model, decay_t=decay_t) + # TODO: set_state should generalize to all options + # no needed to set state of sampler when using `random` + # self.acquisition_sampler.set_state(x=train_x, y=train_y) + + def sample( + self, + n: int, + *, + active_max_fidelity: Mapping[str, Number] | None = None, + fidelity: Mapping[str, Number] | None = None, + seed: np.random.Generator, + ) -> SearchSpace: + """Performs the equivalent of optimizing the acquisition function. + + Performs 2 strategies as per the arguments passed: + * If fidelity is not None, triggers the case when the surrogate has been + trained jointly with the fidelity dimension, i.e., all observations ever + recorded. In this case, the EI for random samples is evaluated at the + `fidelity` where the new sample will be evaluated. The top-10 are selected, + and the EI for them is evaluated at the target/mmax fidelity. + * If active_max_fidelity is not None, triggers the case when a surrogate is + trained per fidelity. In this case, all samples have their fidelity + variable set to the same value. This value is same as that of the fidelity + value of the configs in the training data. 
+ """ + logger.info("Acquiring...") + + # sampling random configurations + samples = [ + self.space.sample(user_priors=False, ignore_fidelity=True) + for _ in range(SAMPLE_THRESHOLD) + ] + + if fidelity is not None: + # w/o setting this flag, the AF eval will set all fidelities to max + self.acquisition.optimize_on_max_fidelity = False + _inc_copy = self.acquisition.incumbent + # TODO: better design required, for example, not import torch + # right now this case handles the 2-step acquisition in `sample` + if "incumbent" in kwargs: + # sets the incumbent to the best score at the required fidelity for + # correct computation of EI scores + self.acquisition.incumbent = torch.tensor(kwargs["incumbent"]) + # updating the fidelity of the sampled configurations + samples = list(map(update_fidelity, samples, [fidelity] * len(samples))) + # computing EI at the given `fidelity` + eis = self.acquisition.eval(x=samples, asscalar=True) + # extracting the 10 highest scores + _ids = np.argsort(eis)[-TOP_EI_SAMPLE_COUNT:] + samples = pd.Series(samples).iloc[_ids].values.tolist() + # setting the fidelity to the maximum fidelity + self.acquisition.optimize_on_max_fidelity = True + self.acquisition.incumbent = _inc_copy + + if active_max_fidelity is not None: + # w/o setting this flag, the AF eval will set all fidelities to max + self.acquisition.optimize_on_max_fidelity = False + fidelity = active_max_fidelity + samples = list(map(update_fidelity, samples, [fidelity] * len(samples))) + + # computes the EI for all `samples` + eis = self.acquisition.eval(x=samples, asscalar=True) + # extracting the highest scored sample + return samples[np.argmax(eis)] + # TODO: can generalize s.t. sampler works for all types, currently, + # random sampler in NePS does not do what is required here + # return self.acquisition_sampler.sample(self.acquisition) diff --git a/neps/search_spaces/samplers/prior.py b/neps/search_spaces/samplers/prior.py new file mode 100644 index 00000000..65165cae --- /dev/null +++ b/neps/search_spaces/samplers/prior.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Mapping +from typing_extensions import Self, override + +from neps.search_spaces.config import Config +from neps.search_spaces.distributions.uniform_int import UniformIntDistribution +from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution +from neps.search_spaces.samplers.sampler import Sampler + +if TYPE_CHECKING: + import numpy as np + + from neps.search_spaces.distributions.distribution import Distribution + from neps.search_spaces.search_space import SearchSpace + + +@dataclass +class PriorSampler(Sampler): + search_space: SearchSpace + + _numerical_distributions: Mapping[str, Distribution] + _categorical_distributions: Mapping[str, Distribution] + + @override + def sample_configs( + self, + n: int, + *, + fidelity: Mapping[str, float] | None, + seed: np.random.Generator, + with_constants: bool = True, + ) -> list[Config]: + numerical_samples = {} + for k, dist in self._numerical_distributions.items(): + param = self.search_space.numericals[k] + numerical_samples[k] = dist.sample(n, to=param.domain, seed=seed) + + categorical_samples = {} + for k, dist in self._categorical_distributions.items(): + cat = self.search_space.categoricals[k] + domain = cat.domain + samples = dist.sample(n, to=domain, seed=seed) + choices = cat.lookup(samples) + categorical_samples[k] = choices + + graph_samples = {} + for k, v in 
self.search_space.graphs.items(): + graph_samples[k] = [v.sample() for _ in range(n)] + + _constants = self.search_space.constants if with_constants else {} + + return [ + Config( + values={ + **{k: samples[i] for k, samples in numerical_samples.items()}, + **{k: samples[i] for k, samples in categorical_samples.items()}, + **{k: samples[i] for k, samples in graph_samples.items()}, + **_constants, + }, + fidelity=fidelity, + ) + for i in range(n) + ] + + @classmethod + def new( + cls, + space: SearchSpace, + prior: Mapping[str, tuple[Any, float]], + *, + replace_missing_with_uniform: bool = True, + ) -> Self: + missing = set(space.hyperparameters) - set(prior.keys()) + if not replace_missing_with_uniform and any(missing): + raise ValueError( + "If `replace_missing_with_uniform` is False, the prior must be defined" + f" for all parameters. Missing prior for: {missing}" + ) + + numerical_distributions = { + hp_name: ( + hp.domain.truncnorm_distribution(center=p[0], confidence=p[1]) + if (p := prior.get(hp_name)) + else hp.domain.uniform_distribution() + ) + for hp_name, hp in space.numericals.items() + } + # NOTE: It would be nice to somehow check if the prior given for + # a categorical was an index or a value in the categorical. + # Since it's much more efficient to hold on to the index, we will + # assume that for now. + categorical_distribution = { + hp_name: ( + WeightedIntsDistribution.with_favoured( + n=cat.size, + favoured=cat.index(p[0]), + confidence=p[1], + ) + if (p := prior.get(hp_name)) + else UniformIntDistribution.indices(cat.size) + ) + for hp_name, cat in space.categoricals.items() + } + return cls( + space, + _numerical_distributions=numerical_distributions, + _categorical_distributions=categorical_distribution, + ) diff --git a/neps/search_spaces/samplers/sampler.py b/neps/search_spaces/samplers/sampler.py new file mode 100644 index 00000000..f104a3a5 --- /dev/null +++ b/neps/search_spaces/samplers/sampler.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Mapping +from typing_extensions import Protocol + +if TYPE_CHECKING: + import numpy as np + + from neps.search_spaces.config import Config + from neps.utils.types import Number + + +@dataclass +class Sampler(Protocol): + def sample_configs( + self, + n: int, + *, + fidelity: Mapping[str, Number] | None, + seed: np.random.Generator, + ) -> list[Config]: ... 
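For orientation, here is a minimal usage sketch of the sampler API introduced above (not part of the patch itself). It assumes an already-constructed `SearchSpace` instance named `space`; the hyperparameter names `learning_rate` and `optimizer` are purely illustrative, and whether the categorical prior entry should be a value or an index is left open by the NOTE in prior.py, so a raw value is shown here.

    import numpy as np

    from neps.search_spaces.samplers import PriorSampler, UniformSampler

    rng = np.random.default_rng(0)

    # Uniform sampling over the numerical and categorical hyperparameters of `space`.
    uniform = UniformSampler.new(space)
    configs = uniform.sample_configs(10, fidelity=None, seed=rng)

    # Prior-weighted sampling: each entry maps a hyperparameter name to
    # (default, confidence); parameters without an entry fall back to uniform.
    prior = {"learning_rate": (1e-3, 0.8), "optimizer": ("adam", 0.5)}
    prior_sampler = PriorSampler.new(space, prior, replace_missing_with_uniform=True)
    prior_configs = prior_sampler.sample_configs(10, fidelity=None, seed=rng)
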
diff --git a/neps/search_spaces/samplers/uniform.py b/neps/search_spaces/samplers/uniform.py new file mode 100644 index 00000000..88060932 --- /dev/null +++ b/neps/search_spaces/samplers/uniform.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Mapping +from typing_extensions import Self, override + +from neps.search_spaces.config import Config +from neps.search_spaces.distributions.uniform_int import UniformIntDistribution +from neps.search_spaces.samplers.sampler import Sampler + +if TYPE_CHECKING: + import numpy as np + + from neps.search_spaces.distributions.distribution import Distribution + from neps.search_spaces.search_space import SearchSpace + + +@dataclass +class UniformSampler(Sampler): + search_space: SearchSpace + + _numerical_distributions: Mapping[str, Distribution] + _categorical_distributions: Mapping[str, Distribution] + + @override + def sample_configs( + self, + n: int, + *, + fidelity: Mapping[str, float] | None = None, + seed: np.random.Generator, + with_constants: bool = True, + ) -> list[Config]: + numerical_samples = {} + for k, dist in self._numerical_distributions.items(): + param = self.search_space.numericals[k] + numerical_samples[k] = dist.sample(n, to=param.domain, seed=seed) + + categorical_samples = {} + for k, dist in self._categorical_distributions.items(): + cat = self.search_space.categoricals[k] + domain = cat.domain + samples = dist.sample(n, to=domain, seed=seed) + choices = cat.lookup(samples) + categorical_samples[k] = choices + + graph_samples = {} + for k, v in self.search_space.graphs.items(): + graph_samples[k] = [v.sample() for _ in range(n)] + + _constants = self.search_space.constants if with_constants else {} + + return [ + Config( + { + **{k: samples[i] for k, samples in numerical_samples.items()}, + **{k: samples[i] for k, samples in categorical_samples.items()}, + **{k: samples[i] for k, samples in graph_samples.items()}, + **_constants, + }, + fidelity=fidelity, + ) + for i in range(n) + ] + + @classmethod + def new(cls, space: SearchSpace) -> Self: + numerical_distributions = { + k: p.domain.uniform_distribution() for k, p in space.numericals.items() + } + categorical_distribution = { + k: UniformIntDistribution.indices(p.size) + for k, p in space.categoricals.items() + } + return cls( + space, + _numerical_distributions=numerical_distributions, + _categorical_distributions=categorical_distribution, + ) diff --git a/neps/search_spaces/samplers/weighted_sampler.py b/neps/search_spaces/samplers/weighted_sampler.py new file mode 100644 index 00000000..32e51908 --- /dev/null +++ b/neps/search_spaces/samplers/weighted_sampler.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Mapping +from typing_extensions import Self, override + +import numpy as np + +from neps.search_spaces.samplers.sampler import Sampler +from neps.utils.types import Arr, Number, f64 + +if TYPE_CHECKING: + from neps.search_spaces.config import Config + + +@dataclass +class WeightedSampler(Sampler): + weights: dict[str, float] + samplers: dict[str, Sampler] + + _probabilities: Arr[f64] = field(init=False, repr=False, compare=False) + _samplers: Arr[np.str_] = field(init=False, repr=False, compare=False) + + def __post_init__(self): + probs = np.array(list(self.weights.values()), dtype=f64) + probs /= probs.sum() + self._probabilities = probs + self._samplers = np.asarray(sorted(self.samplers.keys()), 
dtype=np.str_) + + @override + def sample_configs( + self, + n: int, + *, + fidelity: Mapping[str, Number] | None, + seed: np.random.Generator, + ) -> list[Config]: + choices = seed.choice(self._samplers, size=n, p=self._probabilities) + keys, counts = np.unique(choices, return_counts=True) + + configs: list[Config] = [] + for key, count in zip(keys, counts): + sampler = self.samplers[key] + config_samples = sampler.sample_configs(count, fidelity=fidelity, seed=seed) + configs.extend(config_samples) + + return configs + + @classmethod + def equally_weighted(cls, samples: dict[str, Sampler]) -> Self: + return cls(weights={k: 1.0 for k in samples}, samplers=samples) diff --git a/neps/state/__init__.py b/neps/state/__init__.py index 6508dba2..7a85c7d4 100644 --- a/neps/state/__init__.py +++ b/neps/state/__init__.py @@ -5,6 +5,7 @@ VersionedResource, Versioner, ) +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo from neps.state.seed_snapshot import SeedSnapshot from neps.state.trial import Trial @@ -12,6 +13,9 @@ "Locker", "SeedSnapshot", "Synced", + "BudgetInfo", + "OptimizationState", + "OptimizerInfo", "Trial", "ReaderWriter", "Versioner", diff --git a/neps/state/optimizer.py b/neps/state/optimizer.py index f4000b07..bd8cbc2e 100644 --- a/neps/state/optimizer.py +++ b/neps/state/optimizer.py @@ -19,7 +19,6 @@ def remaining_cost_budget(self) -> float: return self.max_cost_budget - self.used_cost_budget def clone(self) -> BudgetInfo: - """Clone the budget info.""" return BudgetInfo( max_cost_budget=self.max_cost_budget, used_cost_budget=self.used_cost_budget, diff --git a/neps/state/trial.py b/neps/state/trial.py index 862e2bbb..3cd9b9c1 100644 --- a/neps/state/trial.py +++ b/neps/state/trial.py @@ -37,6 +37,10 @@ class State(Enum): CORRUPTED = "corrupted" UNKNOWN = "unknown" + def pending(self) -> bool: + """Return True if the trial is pending.""" + return self in (State.PENDING, State.SUBMITTED, State.EVALUATING) + @dataclass class MetaData: @@ -129,7 +133,7 @@ class Trial: MetaData: ClassVar = MetaData NotReportedYetError: ClassVar = NotReportedYetError - config: Mapping[str, Any] + config: dict[str, Any] metadata: MetaData state: State report: Report | None diff --git a/neps/utils/types.py b/neps/utils/types.py index a6b6c540..be1f103b 100644 --- a/neps/utils/types.py +++ b/neps/utils/types.py @@ -15,7 +15,7 @@ # TODO(eddiebergman): We can turn this to an enum at some # point to prevent having to isinstance and str match ERROR: TypeAlias = Literal["error"] -Number: TypeAlias = Union[int, float, np.number] +Number: TypeAlias = Union[int, float] ConfigID: TypeAlias = str RawConfig: TypeAlias = Mapping[str, Any] Metadata: TypeAlias = Dict[str, Any] From 27bb0a3bf67e7d4549cbc4b46e7bac3cbbbc5fe1 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 26 Aug 2024 15:15:52 +0200 Subject: [PATCH 11/63] refactor: Switch to botorch --- .gitignore | 2 +- .../acquisition_functions/ei.py | 17 +- .../grakel_replace/weisfeiler_lehman.py | 293 ++++++----- .../bayesian_optimization/kernels/kernel.py | 161 ------ .../kernels/vectorial_kernels.py | 112 ---- .../kernels/weisfilerlehman.py | 31 +- .../bayesian_optimization/models/__init__.py | 6 +- .../bayesian_optimization/models/gp.py | 484 +++++++++--------- .../bayesian_optimization/optimizer.py | 202 ++++---- .../optimizers/bayesian_optimization/sobol.py | 0 neps/runtime.py | 4 +- neps/search_spaces/domain.py | 45 +- neps/search_spaces/encoding.py | 262 +++++++--- neps/search_spaces/hyperparameters/float.py | 2 + 
neps/search_spaces/hyperparameters/integer.py | 2 + .../hyperparameters/numerical.py | 3 + neps_examples/basic_usage/hyperparameters.py | 17 +- 17 files changed, 768 insertions(+), 875 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/kernel.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py create mode 100644 neps/optimizers/bayesian_optimization/sobol.py diff --git a/.gitignore b/.gitignore index e8be93e7..09a1430c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -# Python +#False Python __pycache__ dist diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py index cc13cc8e..1a4e24d0 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py @@ -68,14 +68,11 @@ def eval( else: _x = x - try: - mu, cov = self.surrogate_model.predict(_x) - except ValueError as e: - raise e - # return -1.0 # in case of error. return ei of -1 + mu, cov = self.surrogate_model.predict(_x) std = torch.sqrt(torch.diag(cov)) mu_star = self.incumbent + gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device)) # u = (mu - mu_star - self.xi) / std # ei = std * updf + (mu - mu_star - self.xi) * ucdf @@ -88,7 +85,15 @@ def eval( ) * gauss.cdf(v - std) else: u = (mu_star - mu - self.xi) / std - ucdf = gauss.cdf(u) + try: + ucdf = gauss.cdf(u) + except ValueError as e: + print(f"u: {u}") # noqa: T201 + print(f"mu_star: {mu_star}") # noqa: T201 + print(f"mu: {mu}") # noqa: T201 + print(f"std: {std}") # noqa: T201 + print(f"diag: {cov.diag()}") # noqa: T201 + raise e updf = torch.exp(gauss.log_prob(u)) ei = std * updf + (mu_star - mu - self.xi) * ucdf if self.augmented_ei: diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py index f10e406f..8c4baf64 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py @@ -1,5 +1,7 @@ """The weisfeiler lehman kernel :cite:`shervashidze2011weisfeiler`.""" +from __future__ import annotations + import collections import collections.abc import logging @@ -42,7 +44,7 @@ class WeisfeilerLehman(Kernel): {'node_name1': weight1, 'node_name2': weight2 ... } Must be of the same length as the number of different node attributes - Attributes + Attributes: ---------- X : dict Holds a dictionary of fitted subkernel modules for all levels. 
@@ -101,9 +103,9 @@ def __init__( self.X = None self._X_diag = None - self.X_fit = dict() - self.K_precomputed = dict() - self.base_graph_kernel_precomputed = dict() + self.X_fit = {} + self.K_precomputed = {} + self.base_graph_kernel_precomputed = {} def initialize(self): """Initialize all transformer arguments, needing initialization.""" @@ -111,26 +113,37 @@ def initialize(self): if not self._initialized["base_graph_kernel"]: base_graph_kernel = self.base_graph_kernel if base_graph_kernel is None: - base_graph_kernel, params = VertexHistogram, dict() + base_graph_kernel, params = VertexHistogram, {} # TODO: make sure we're always passing like this elif type(base_graph_kernel) is type and issubclass( # pylint: disable=C0123 base_graph_kernel, Kernel ): - params = dict() + params = {} else: try: base_graph_kernel, params = base_graph_kernel except Exception as _error: - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError( + "Base kernel was not formulated in " + "the correct way. " + "Check documentation." + ) from _error if not ( - type(base_graph_kernel) - is type # pylint: disable=C0123 + type(base_graph_kernel) is type # pylint: disable=C0123 and issubclass(base_graph_kernel, Kernel) ): - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError( + "The first argument must be a valid " + "grakel.kernel.kernel Object" + ) if not isinstance(params, dict): - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError( + "If the second argument of base " + "kernel exists, it must be a diction" + "ary between parameters names and " + "values" + ) params.pop("normalize", None) params["normalize"] = False @@ -141,7 +154,9 @@ def initialize(self): if not self._initialized["h"]: if not isinstance(self.h, int) or self.h < 0: - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError( + "'h' must be a non-negative integer. Got h:" + str(self.h) + ) self._h = self.h + 1 self._initialized["h"] = True @@ -181,7 +196,7 @@ def parse_input( gp_fit: bool If False use precomputed vals for first N values, else compute them and save them - Returns + Returns: ------- base_graph_kernel : object Returns base_graph_kernel. 
@@ -191,34 +206,34 @@ def parse_input( """ if self._method_calling not in [1, 2]: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError( + "method call must be called either from fit " + "or fit-transform" + ) elif hasattr(self, "_X_diag"): # Clean _X_diag value delattr(self, "_X_diag") # skip kernel computation if we have already computed the corresponding kernel - if self._h in self.K_precomputed.keys() and self.X_fit[self._h] == X: + if self._h in self.K_precomputed and self.X_fit[self._h] == X: K = self.K_precomputed[self._h] base_graph_kernel = self.base_graph_kernel_precomputed[self._h] else: # Input validation and parsing if not isinstance(X, collections.abc.Iterable): - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError("input must be an iterable\n") else: nx = 0 - Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict() + Gs_ed, L, distinct_values, extras = {}, {}, set(), {} for idx, x in enumerate(iter(X)): is_iter = isinstance(x, collections.abc.Iterable) if is_iter: x = list(x) if is_iter and (len(x) == 0 or len(x) >= 2): if len(x) == 0: - warnings.warn( - "Ignoring empty element on index: " + str(idx) - ) + warnings.warn("Ignoring empty element on index: " + str(idx)) continue elif len(x) > 2: - extra = tuple() + extra = () if len(x) > 3: extra = tuple(x[3:]) x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) @@ -228,10 +243,11 @@ def parse_input( label_type="edge", return_none=True, ), - ) + extra + *extra, + ) else: x = Graph(x[0], x[1], {}, graph_format=self._graph_format) - extra = tuple() + extra = () elif isinstance(x, Graph): x.desired_format(self._graph_format) @@ -240,20 +256,22 @@ def parse_input( label_type="edge", return_none=True, ) - if el is None: - extra = tuple() - else: - extra = (el,) + extra = () if el is None else (el,) else: - NOT_YET_IMPLEMENTED_StmtRaise + raise TypeError( + "each element of X must be either a " + + "graph object or a list with at least " + + "a graph like object and node labels " + + "dict \n" + ) Gs_ed[nx] = x.get_edge_dictionary() L[nx] = x.get_labels(purpose="dictionary") extras[nx] = extra - NOT_YET_IMPLEMENTED_StmtAugAssign - NOT_YET_IMPLEMENTED_StmtAugAssign + distinct_values |= set(L[nx].values()) + nx += 1 if nx == 0: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("parsed input is empty") # Save the number of "fitted" graphs. 
self._nx = nx @@ -261,70 +279,75 @@ def parse_input( # assign a number to each label label_count = 0 - for dv in sorted(list(distinct_values)): + for dv in sorted(distinct_values): WL_labels_inverse[dv] = label_count - NOT_YET_IMPLEMENTED_StmtAugAssign + label_count += 1 # Initalize an inverse dictionary of labels for all iterations - self._inv_labels = OrderedDict() # Inverse dictionary of labels, in term of the *previous layer* + self._inv_labels = ( + OrderedDict() + ) # Inverse dictionary of labels, in term of the *previous layer* self._inv_labels[0] = deepcopy(WL_labels_inverse) - self.feature_dims.append(len(WL_labels_inverse)) # Update the zeroth iteration feature dim - - self._inv_label_node_attr = OrderedDict() # Inverse dictionary of labels, in term of the *node attribute* - self._label_node_attr = OrderedDict() # Same as above, but with key and value inverted - ( - self._label_node_attr[0], - self._inv_label_node_attr[0], - ) = self.translate_label(WL_labels_inverse, 0) + self.feature_dims.append( + len(WL_labels_inverse) + ) # Update the zeroth iteration feature dim + + self._inv_label_node_attr = ( + OrderedDict() + ) # Inverse dictionary of labels, in term of the *node attribute* + self._label_node_attr = ( + OrderedDict() + ) # Same as above, but with key and value inverted + self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label( + WL_labels_inverse, 0 + ) if self.node_weights is not None: self._feature_weight = OrderedDict() # Ensure the order is the same self._feature_weight[0] = self._compute_feature_weight( self.node_weights, 0, WL_labels_inverse - )[ - 1 - ] + )[1] else: self._feature_weight = None def generate_graphs(label_count: int, WL_labels_inverse): - new_graphs = list() + new_graphs = [] for j in range(self._nx): - new_labels = dict() - for k in L[j].keys(): + new_labels = {} + for k in L[j]: new_labels[k] = WL_labels_inverse[L[j][k]] L[j] = new_labels # add new labels new_graphs.append((Gs_ed[j], new_labels) + extras[j]) - NOT_YET_IMPLEMENTED_ExprYield + yield new_graphs for i in range(1, self._h): - label_set, WL_labels_inverse, L_temp = set(), dict(), dict() + label_set, WL_labels_inverse, L_temp = set(), {}, {} for j in range(nx): # Find unique labels and sort # them for both graphs # Keep for each node the temporary - L_temp[j] = dict() - for v in Gs_ed[j].keys(): - credential = str(L[j][v]) + "," + str( - sorted( - (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []) - ) + L_temp[j] = {} + for v in Gs_ed[j]: + credential = ( + str(L[j][v]) + + "," + + str(sorted(L[j][n] for n in Gs_ed[j][v])) ) L_temp[j][v] = credential label_set.add(credential) - label_list = sorted(list(label_set)) + label_list = sorted(label_set) for dv in label_list: WL_labels_inverse[dv] = label_count - NOT_YET_IMPLEMENTED_StmtAugAssign + label_count += 1 # Recalculate labels - new_graphs = list() + new_graphs = [] for j in range(nx): - new_labels = dict() - for k in L_temp[j].keys(): + new_labels = {} + for k in L_temp[j]: new_labels[k] = WL_labels_inverse[L_temp[j][k]] L[j] = new_labels # relabel @@ -344,11 +367,9 @@ def generate_graphs(label_count: int, WL_labels_inverse): if self.node_weights is not None: self._feature_weight[i] = self._compute_feature_weight( self.node_weights, i, self._inv_label_node_attr[i] - )[ - 1 - ] + )[1] # assert len(self._feature_weight[i] == len(WL_labels_inverse)) - NOT_YET_IMPLEMENTED_ExprYield + yield new_graphs # Initialise the base graph kernel. 
base_graph_kernel = {} @@ -397,6 +418,7 @@ def generate_graphs(label_count: int, WL_labels_inverse): K = torch.stack(K, dim=0).sum(dim=0) return K, base_graph_kernel return np.sum(K, axis=0), base_graph_kernel + return None def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: disable=unused-argument """Fit and transform, on the same dataset. @@ -414,7 +436,7 @@ def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: di y : Object, default=None Ignored argument, added for the pipeline. - Returns + Returns: ------- K : numpy array, shape = [n_targets, n_input_graphs] corresponding to the kernel matrix, a calculation between @@ -428,7 +450,7 @@ def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: di 0, ] # Flush the feature dimensions if X is None: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("transform input cannot be None") else: km, self.X = self.parse_input(X, gp_fit=gp_fit) @@ -450,7 +472,8 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): return_embedding_only: bool Whether to return the embedding of the graphs only, instead of computing the kernel all the way to the end. - Returns + + Returns: ------- K : numpy array, shape = [n_targets, n_input_graphs] corresponding to the kernel matrix, a calculation between @@ -463,13 +486,13 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): # Input validation and parsing if X is None: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("transform input cannot be None") elif not isinstance(X, collections.abc.Iterable): - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("input must be an iterable\n") else: nx = 0 distinct_values = set() - Gs_ed, L = dict(), dict() + Gs_ed, L = {}, {} for i, x in enumerate(iter(X)): is_iter = isinstance(x, collections.abc.Iterable) if is_iter: @@ -484,25 +507,32 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): elif isinstance(x, Graph): x.desired_format("dictionary") else: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError( + "each element of X must have at " + + "least one and at most 3 elements\n" + ) Gs_ed[nx] = x.get_edge_dictionary() L[nx] = x.get_labels(purpose="dictionary") # Hold all the distinct values - NOT_YET_IMPLEMENTED_StmtAugAssign - NOT_YET_IMPLEMENTED_StmtAugAssign + distinct_values |= { + v for v in L[nx].values() if v not in self._inv_labels[0] + } + nx += 1 if nx == 0: - NOT_YET_IMPLEMENTED_StmtRaise + raise ValueError("parsed input is empty") nl = len(self._inv_labels[0]) - WL_labels_inverse = {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict} + WL_labels_inverse = { + dv: idx for (idx, dv) in enumerate(sorted(distinct_values), nl) + } WL_labels_inverse = OrderedDict(WL_labels_inverse) def generate_graphs_transform(WL_labels_inverse, nl): # calculate the kernel matrix for the 0 iteration - new_graphs = list() + new_graphs = [] for j in range(nx): - new_labels = dict() + new_labels = {} for k, v in L[j].items(): if v in self._inv_labels[0]: new_labels[k] = self._inv_labels[0][v] @@ -511,37 +541,35 @@ def generate_graphs_transform(WL_labels_inverse, nl): L[j] = new_labels # produce the new graphs new_graphs.append([Gs_ed[j], new_labels]) - NOT_YET_IMPLEMENTED_ExprYield + yield new_graphs for i in range(1, self._h): - new_graphs = list() - L_temp, label_set = dict(), set() - NOT_YET_IMPLEMENTED_StmtAugAssign + new_graphs = [] + L_temp, label_set = {}, set() + nl += len(self._inv_labels[i]) for j in range(nx): # Find 
unique labels and sort them for both graphs # Keep for each node the temporary - L_temp[j] = dict() - for v in Gs_ed[j].keys(): - credential = str(L[j][v]) + "," + str( - sorted( - (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []) - ) + L_temp[j] = {} + for v in Gs_ed[j]: + credential = ( + str(L[j][v]) + "," + str(sorted(L[j][n] for n in Gs_ed[j][v])) ) L_temp[j][v] = credential if credential not in self._inv_labels[i]: label_set.add(credential) # Calculate the new label_set - WL_labels_inverse = dict() + WL_labels_inverse = {} if len(label_set) > 0: - for dv in sorted(list(label_set)): + for dv in sorted(label_set): idx = len(WL_labels_inverse) + nl WL_labels_inverse[dv] = idx # Recalculate labels - new_graphs = list() + new_graphs = [] for j in range(nx): - new_labels = dict() + new_labels = {} for k, v in L_temp[j].items(): if v in self._inv_labels[i]: new_labels[k] = self._inv_labels[i][v] @@ -550,7 +578,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): L[j] = new_labels # Create the new graphs with the new labels. new_graphs.append([Gs_ed[j], new_labels]) - NOT_YET_IMPLEMENTED_ExprYield + yield new_graphs if return_embedding_only: K = [] @@ -567,11 +595,29 @@ def generate_graphs_transform(WL_labels_inverse, nl): # Calculate the kernel matrix without parallelization if self.as_tensor: - summand = [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] + summand = [ + self.layer_weights[i] + * self.X[i].transform( + g, + label_start_idx=self.feature_dims[i], + label_end_idx=self.feature_dims[i + 1], + ) + for i, g in enumerate(generate_graphs_transform(WL_labels_inverse, nl)) + ] K = torch.stack(summand, dim=0).sum(dim=0) else: K = np.sum( - (NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []), + ( + self.layer_weights[i] + * self.X[i].transform( + g, + label_start_idx=self.feature_dims[i], + label_end_idx=self.feature_dims[i + 1], + ) + for (i, g) in enumerate( + generate_graphs_transform(WL_labels_inverse, nl) + ) + ), axis=0, ) @@ -580,7 +626,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): X_diag, Y_diag = self.diagonal() if self.as_tensor: div_ = torch.sqrt(torch.ger(Y_diag, X_diag)) - NOT_YET_IMPLEMENTED_StmtAugAssign + K /= div_ else: old_settings = np.seterr(divide="ignore") K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag)))) @@ -598,7 +644,7 @@ def diagonal(self): ---------- None. - Returns + Returns: ------- X_diag : np.array The diagonal of the kernel matrix, of the fitted data. 
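As an aside on the normalization step above: `np.divide(K, np.sqrt(np.outer(Y_diag, X_diag)))` rescales every cross-kernel entry by the self-similarities of the two graphs involved, so each normalized entry has magnitude at most 1 by Cauchy-Schwarz. A tiny self-contained numpy illustration (not part of the patch):

    import numpy as np

    # Cross-kernel between 2 test graphs (rows) and 2 training graphs (columns).
    K = np.array([[4.0, 2.0], [1.0, 3.0]])
    Y_diag = np.array([4.0, 5.0])  # k(test_i, test_i)
    X_diag = np.array([4.0, 3.0])  # k(train_j, train_j)

    K_norm = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))
    # With these numbers K_norm[0, 0] == 1.0 and every other entry lies
    # strictly between 0 and 1.
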
@@ -616,7 +662,7 @@ def diagonal(self): if self._is_transformed: Y_diag = self.X[0].diagonal()[1] for i in range(1, self._h): - NOT_YET_IMPLEMENTED_StmtAugAssign + Y_diag += self.X[i].diagonal()[1] except NotFittedError: # Calculate diagonal of X if self._is_transformed: @@ -625,8 +671,8 @@ def diagonal(self): X_diag.flags.writeable = True for i in range(1, self._h): x, y = self.X[i].diagonal() - NOT_YET_IMPLEMENTED_StmtAugAssign - NOT_YET_IMPLEMENTED_StmtAugAssign + X_diag += x + Y_diag += y self._X_diag = X_diag # case sub kernel is only fitted @@ -635,7 +681,7 @@ def diagonal(self): X_diag.flags.writeable = True for i in range(1, self._n_iter): x = self.X[i].diagonal() - NOT_YET_IMPLEMENTED_StmtAugAssign + X_diag += x self._X_diag = X_diag if self.as_tensor: @@ -648,42 +694,39 @@ def diagonal(self): return self._X_diag @staticmethod - def translate_label(curr_layer: dict, h: int, prev_layer: dict = None): + def translate_label(curr_layer: dict, h: int, prev_layer: dict | None = None): """Translate the label to be in terms of the node attributes curr_layer: the WL_label_inverse object. A dictionary with element of the format of - {pattern: encoding} + {pattern: encoding}. - return: + Return: label_in_node_attr: in terms of {encoding: pattern}, but pattern is always in term of the node attribute inv_label_in_node_attr: in terms of {pattern: encoding} """ if h == 0: - return ( - {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict}, - curr_layer, - ) + return {v: str(k) for k, v in curr_layer.items()}, curr_layer else: - NOT_YET_IMPLEMENTED_StmtAssert + assert prev_layer is not None label_in_node_attr, inv_label_in_node_attr = OrderedDict(), OrderedDict() for pattern, encoding in curr_layer.items(): # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer root, leaf = literal_eval(pattern) root_ = prev_layer[root] - leaf_ = [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] - label_in_node_attr.update({encoding: "~".join([root_] + leaf_)}) - inv_label_in_node_attr.update({"~".join([root_] + leaf_): encoding}) + leaf_ = [prev_layer[i] for i in leaf] + label_in_node_attr.update({encoding: "~".join([root_, *leaf_])}) + inv_label_in_node_attr.update({"~".join([root_, *leaf_]): encoding}) return label_in_node_attr, inv_label_in_node_attr @staticmethod def _compute_feature_weight( node_weight: OrderedDict, h: int, inv_label_node_attr: OrderedDict ): - """ - Compute the feature weight, based on the average weight of the constituent node attributes. + """Compute the feature weight, based on the average weight of the constituent node attributes. + Return: feature_weights: a dictionary with h layers, each of which is a dictionary of the format of - {tuple1: weight1; tuplr2, weight2 ...} where tuplex is the tuple representation of the learned graph feature + {tuple1: weight1; tuplr2, weight2 ...} where tuplex is the tuple representation of the learned graph feature. feature_weight_flattened: same as above, but in a flattened np format. 
""" @@ -691,29 +734,25 @@ def _compute_feature_weight( feature_weights_flattened = [] if h == 0: feature_weight = OrderedDict( - {NOT_IMPLEMENTED_dict_key: NOT_IMPLEMENTED_dict_value for key, value in NOT_IMPLEMENTED_dict} + {k: (node_weight[k]) ** 2 for k in inv_label_node_attr} ) - feature_weights_flattened = np.array( - list(feature_weight.values()) - ).flatten() + feature_weights_flattened = np.array(list(feature_weight.values())).flatten() else: for k, _ in inv_label_node_attr.items(): # k is the pattern, v is the encoding k_sep = k.split("~") - average_weight = np.mean( - [NOT_YET_IMPLEMENTED_generator_key for NOT_YET_IMPLEMENTED_generator_key in []] - ) + average_weight = np.mean([(node_weight[i]) ** 2 for i in k_sep]) feature_weights.update({k: average_weight}) feature_weights_flattened.append(average_weight) feature_weights_flattened = np.array(feature_weights_flattened).flatten() - NOT_YET_IMPLEMENTED_StmtAssert + assert len(feature_weights_flattened) == len(inv_label_node_attr) return feature_weights, feature_weights_flattened def dK_dX(self, X_test: None): - """ - Do additional forward and backward pass, compute the kernel derivative wrt the testing location. - If no test locations are provided, the derivatives are evaluated at the training points - Returns + """Do additional forward and backward pass, compute the kernel derivative wrt the testing location. + If no test locations are provided, the derivatives are evaluated at the training points. + + Returns. ------- """ diff --git a/neps/optimizers/bayesian_optimization/kernels/kernel.py b/neps/optimizers/bayesian_optimization/kernels/kernel.py deleted file mode 100644 index 42382a51..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/kernel.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import annotations - -import copy -import inspect -import math -from abc import ABC, abstractmethod -from typing import Any, ClassVar, Generic, Mapping, Sequence, TypeVar -from typing_extensions import Self - -import torch -from torch import nn - -from neps.utils.types import NotSet - -T = TypeVar("T") - - -class Kernel(ABC, nn.Module, Generic[T]): - suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] - - def __init__(self) -> None: - super().__init__() - - @abstractmethod - def as_optimizable(self) -> Self: ... 
- - @abstractmethod - def forward(self, x: T, x2: T | None = None) -> torch.Tensor: - raise NotImplementedError - - def clone(self) -> Self: - return self.clone_with() - - def clone_with(self, **params: Any) -> Self: - # h ttps://github.com/scikit-learn/scikit-learn/blob/70fdc843a4b8182d97a3508c1a426acc5e87e980/sklearn/base.py#L197 - sig = inspect.signature(self.__init__) - - self_values = {} - for p in sig.parameters.values(): - if p.name == "self": - continue - - attr = getattr(self, p.name, NotSet) - if attr is NotSet: - raise ValueError( - f"Could not clone as the variable {p.name} was not set in" - f" the constructor on the object: {self}" - ) - self_values[p.name] = params.get(p.name, attr) - - new_self_values = copy.deepcopy(self_values) - return self.__class__(**new_self_values) - - def grid_search( - self, - x: T, - y: torch.Tensor, - *, - grid: Sequence[Mapping[str, Any]] | None = None, - noise_variances: Sequence[float] = (1e-6,), - ) -> tuple[Self, float]: - # Returns: (Kernel[T], float) | None if failed - grid = grid or self.suggested_grid - - def _fit_and_eval( - _params: Mapping[str, Any], - ) -> tuple[Kernel[T], float] | Exception: - cloned_kernel = self.clone_with(**_params) - K = cloned_kernel.forward(x) - - best_lml = -float("inf") - for noise_variance in noise_variances: - K.diag().add_(noise_variance) - - K_inv, logDetK = compute_pd_inverse(K) - lml = log_marginal_likelihood(K_inv, logDetK, y).item() - if lml > best_lml: - best_lml = lml - - K.diag().sub_(noise_variance) - - return cloned_kernel, best_lml - - evals = [_fit_and_eval(params) for params in grid] - evals_with_score = [e for e in evals if not isinstance(e, Exception)] - if not any(evals_with_score): - raise evals[-1] # type: ignore - - return max(evals_with_score, key=lambda e: e[1]) # type: ignore - - -class NumericKernel(Kernel[torch.Tensor]): ... - - -TWO_LOG_2_PI = 2 * torch.log(torch.tensor(2 * math.pi)) - - -def log_marginal_likelihood( - K_inv: torch.Tensor, - logDetK: torch.Tensor, - y: torch.Tensor, -) -> torch.Tensor: - # y.T @ K_inv @ y --- Benchmarked to be twice as fast - quad_form = torch.matmul(y, torch.matmul(K_inv, y)) - n = y.shape[0] - - # TODO: We can drop the `n / 2 * TWO_LOG_2_PI` term for the grid - # search above as it's constant between the different kernel grids - # as it's purely data dependant with the `n` - return -0.5 * quad_form + 0.5 * logDetK - n / TWO_LOG_2_PI - - -class _CholeskyError(RuntimeError): - """Raised when the Cholesky decomposition fails.""" - - -# https://github.com/cornellius-gp/linear_operator/blob/eec70f9e1cd9106c32b05a3e774ea29d00d71cea/linear_operator/utils/cholesky.py#L12 -def _cholesky_routine( - K: torch.Tensor, - jitter: float | torch.Tensor = 1e-6, - max_tries: int = 4, -) -> torch.Tensor: - L, info = torch.linalg.cholesky_ex(K) - if not torch.any(info): - return L - - # Clone as we will modify in place, still cheaper - # than creating a new full tensor for identity. - K_prime = K.clone() - jitter_prev = 0 - for i in range(max_tries): - jitter_new = jitter * (10**i) - K_prime.diagonal().add_(jitter_new - jitter_prev) - L, info = torch.linalg.cholesky_ex(K_prime) - if not torch.any(info): - return L - - jitter_prev = jitter_new - - raise _CholeskyError("Failed to compute Cholesky decomposition.") - - -def compute_pd_inverse(K: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - # Adding noise to the diagonal of K helps with numerical stability - # when K is singular or near-singular, (i.e. 
it helps K be more "positive") which - # is required for the decomposition. - - try: - # L @ L.T = K_inv --- solves for L - L = _cholesky_routine(K) - logDetK = 2 * torch.sum(torch.log(torch.diag(L))) - - # K_inv = L_inv @ L_inv.T --- Efficiently solve for K_inv using just L - K_inv = torch.cholesky_inverse(L) - except _CholeskyError: - # If we fail to compute the Cholesky decomposition, - # then just compute the inverse directly. - K_inv = torch.linalg.inv(K) - logDetK = torch.linalg.slogdet(K)[1] - - return K_inv, logDetK diff --git a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py b/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py deleted file mode 100644 index 07b56333..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/vectorial_kernels.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -from abc import ABC -from itertools import product -from math import sqrt -from typing import Any, ClassVar, Mapping, Sequence -from typing_extensions import Self, override - -import torch -from torch import nn - -from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel - -# TODO: -# We should try some variations of singular length scales -# (1 scale shared across all dimensions) -# and individual ARD lengthscales (1 for each dimension) -# ARD can overfit if not properly tuned... -LENGTHSCALE_GRID = (1e-2, 1e-1, 1, 1e1, 1e2) -STD_ENCODED_OUTPUT_SCALE = (1e-2, 1e-1, 1, 1e1, 1e2) - - -class NumericKernel(Kernel[torch.Tensor], ABC): - suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ - {"lengthscale": _l, "output_scale": o} - for _l, o in product(LENGTHSCALE_GRID, STD_ENCODED_OUTPUT_SCALE) - ] - - def __init__( - self, - *, - lengthscale: torch.Tensor | None = None, - outputscale: torch.Tensor | None = None, - lengthscale_bounds: tuple[float, float] | None = (1e-2, 1e2), - outputscale_bounds: tuple[float, float] | None = (1e-2, 1e2), - device: torch.device | None = None, - ): - super().__init__() - self.lengthscale = ( - torch.as_tensor(lengthscale, dtype=torch.float64, device=device) - if lengthscale is not None - else torch.tensor(1, dtype=torch.float64, device=device) - ) - self.outputscale = ( - torch.as_tensor(outputscale, dtype=torch.float64, device=device) - if outputscale is not None - else torch.tensor(1, dtype=torch.float64, device=device) - ) - self.lengthscale_bounds = lengthscale_bounds - self.outputscale_bounds = outputscale_bounds - self.device = device - - self.train_: torch.Tensor | None = None - - def as_optimizable(self) -> Self: - return self.clone_with( - lengthscale=nn.Parameter(self.lengthscale), - outputscale=nn.Parameter(self.outputscale), - ) - - def forward(self, x: torch.Tensor, x2: torch.Tensor | None = None) -> torch.Tensor: - # NOTE: I don't think this is the right way to do this... - if self.lengthscale_bounds is not None or self.outputscale_bounds is not None: - with torch.no_grad(): - if self.lengthscale_bounds is not None: - self.lengthscale.data.clamp_(*self.lengthscale_bounds) - if self.outputscale_bounds is not None: - self.outputscale.data.clamp_(*self.outputscale_bounds) - - x2 = x if x2 is None else x2 - return self._forward(x, x2) - - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: ... 
- - -class Stationary(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - return self.outputscale * torch.cdist(x1, x2, p=2) / self.lengthscale - - -class RBFKernel(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - dist_sq = torch.cdist(x1, x2, p=2) ** 2 - return self.outputscale * torch.exp(-dist_sq / (2 * self.lengthscale**2)) - - -class Matern32Kernel(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - dist = torch.cdist(x1, x2, p=2) / self.lengthscale - factor = sqrt(3.0) * dist - matern32 = (1 + factor) * torch.exp(-factor) - return self.outputscale * matern32 - - -class HammingKernel(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - dists = (x1.unsqueeze(1) != x2.unsqueeze(0)).float().sum(-1) / x1.shape[-1] - scaled_dists = dists / self.lengthscale - return self.outputscale * torch.exp(-scaled_dists) - - -class Matern52Kernel(NumericKernel): - @override - def _forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: - dist = torch.cdist(x1, x2, p=2) / self.lengthscale - factor = sqrt(5.0) * dist - matern52 = (1 + factor + (factor**2) / 3) * torch.exp(-factor) - return self.outputscale * matern52 diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py index 8c1feb26..44e8b8e1 100644 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py @@ -1,7 +1,6 @@ from __future__ import annotations -from itertools import product -from typing import Any, ClassVar, Mapping, Sequence +from typing import TYPE_CHECKING from typing_extensions import Self import numpy as np @@ -14,20 +13,29 @@ WeisfeilerLehman as _WL, ) from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import NumericKernel + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( + NumericKernel, + ) GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)]) GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) +def normal_prior(param: torch.Tensor, mean: float, std: float) -> torch.Tensor: + return -0.5 * torch.sum(((param - mean) / std) ** 2) - torch.sum( + torch.log(std * torch.sqrt(2 * torch.tensor(np.pi))) + ) + + +def kernel_hp_prior(params: dict[str, nn.Parameter]) -> torch.Tensor: + return normal_prior(params["layer_weights"], mean=0, std=1) + + class WeisfilerLehman(Kernel[npt.NDArray[np.object_]]): """Weisfiler Lehman kernel using grakel functions.""" - suggested_grid: ClassVar[Sequence[Mapping[str, Any]]] = [ - {"h": h, "se_kernel": NumericKernel(lengthscale=l)} - for h, l in product(GRID_WL_SUBTREE_CANDIDATES, GRID_WL_LENGTHSCALES) - ] - def __init__( self, *, @@ -46,11 +54,12 @@ def __init__( vector embedding inner products are computed). If None, uses the default linear kernel layer_weights: The weights for each layer of the Weisfeiler-Lehman kernel. - If None, uses uniform + If None, uses uniform 1s oa: whether the optimal assignment variant of the Weisfiler-Lehman kernel should be used node_label: the node_label defining the key node attribute. 
""" + super().__init__(hyperparameter_prior=kernel_hp_prior) if se_kernel is not None and oa: raise ValueError( "Only one or none of se (successive embedding) and oa (optimal assignment) may be true!" @@ -72,7 +81,6 @@ def as_optimizable(self) -> Self: return self.clone_with(layer_weights=nn.Parameter(self.layer_weights)) def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: - self.layer_weights.clamp_(0, 1) self.wl_kernel_ = _WL( h=self.h, base_graph_kernel=( # type: ignore @@ -84,7 +92,7 @@ def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: "requires_ordered_features": True, }, ), - layer_weights=self.layer_weights, + layer_weights=self.layer_weights / self.layer_weights.sum(), normalize=True, ) @@ -93,7 +101,6 @@ def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: def transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: assert self.wl_kernel_ is not None - self.layer_weights.clamp_(0, 1) K = self.wl_kernel_.transform(iter(gr)) return torch.as_tensor(K, dtype=torch.float64) diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 6ce65b61..5e40df9d 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,7 +1,5 @@ from neps.utils.common import MissingDependencyError -from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP - try: from neps.optimizers.models.deepGP import DeepGP except ImportError as e: @@ -14,6 +12,8 @@ SurrogateModelMapping = { "deep_gp": DeepGP, - "gp": ComprehensiveGP, + "gp": MissingDependencyError( + "Removed for now", NotImplementedError("GP is not implemented") + ), "pfn": PFN_SURROGATE, } diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index e63c033f..ab2884f3 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,261 +1,269 @@ from __future__ import annotations import logging -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Literal, Mapping, Sequence +import math +from typing import TYPE_CHECKING, Any, Mapping, TypeVar -import numpy as np +import gpytorch +import gpytorch.constraints import torch -from torch import nn -from torch.optim import SGD, Adam # type: ignore +from botorch.acquisition.analytic import SingleTaskGP +from botorch.models import MixedSingleTaskGP +from botorch.models.gp_regression_mixed import CategoricalKernel +from botorch.models.transforms.outcome import Standardize +from botorch.optim import optimize_acqf, optimize_acqf_mixed +from gpytorch.kernels import MaternKernel, ScaleKernel -from neps.optimizers.bayesian_optimization.kernels.kernel import ( - Kernel, - compute_pd_inverse, - log_marginal_likelihood, -) -from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( - HammingKernel, - Matern52Kernel, - NumericKernel, -) -from neps.optimizers.bayesian_optimization.kernels.weisfilerlehman import ( - WeisfilerLehman, -) -from neps.search_spaces import SearchSpace from neps.search_spaces.encoding import ( CategoricalToIntegerTransformer, + DataEncoder, DataPack, - MinMaxNormalizer, - OneHotEncoder, - TensorTransformer, - Transformer, - WLInputTransformer, ) -from neps.search_spaces.hyperparameters import FloatParameter, IntegerParameter if TYPE_CHECKING: - from neps.search_spaces.search_space import SearchSpace + 
from botorch.acquisition import AcquisitionFunction logger = logging.getLogger(__name__) -# The optimization we do for the noise is relatively cheap while the matrices -NOISE_VARIANCE_GRID = (1e-6, 1e-4, 1e-2, 1, 1e1, 1e2) - - -@dataclass -class ComprehensiveGP: - space: SearchSpace - kernels: dict[str, tuple[Sequence[str], Kernel]] - - combined_kernel: Literal["sum", "product"] = "sum" - noise_variance: Sequence[float] = NOISE_VARIANCE_GRID - kernel_parameter_grid: Mapping[str, Sequence[Mapping[str, Any]]] | bool = True - - optimizer: Literal["adam", "sgd"] = "adam" - optimizer_kwargs: Mapping[str, Any] = field(default_factory=lambda: {"lr": 0.1}) - optimizer_iters: int = 20 - device: torch.device | None = None - - # Post fit attributes - K_inv_: torch.Tensor | None = None - likelihood_: float | None = None - y_: torch.Tensor | None = None - y_normalized_: torch.Tensor | None = None - y_mean_: float | None = None - y_std_: float | None = None - opt_kernels_: dict[str, tuple[Sequence[str], Kernel]] | None = None - train_x_: DataPack | None = None - - def __post_init__(self): - # TODO: Remove when search space is just definition and does not hold values. - self.space = self.space.clone() - - def fit( - self, - *, - x: DataPack, - train_y: torch.Tensor, - ) -> None: - # Preprocessing - y_ = torch.as_tensor(train_y, device=self.device, dtype=torch.float64) - - # TODO: Dunno if I like this silent hack, setting std to 1 if no std - self.y_std_ = s if (s := torch.std(y_).item()) != 0 else 1 - self.y_mean_ = torch.mean(y_).item() - self.y_normalized_ = (y_ - self.y_mean_) / self.y_std_ - self.y_ = y_ - - # optimized kernel parameters + noise variance - optim_vars: list[nn.Parameter] = [] - opt_kernels: dict[str, tuple[Sequence[str], Kernel]] = {} - - N: int - for _kernel_name, (hps, kernel) in self.kernels.items(): - data = x.select(hps) - opt_kernel, _ = kernel.grid_search( - x=data, # type: ignore - y=self.y_normalized_, - ) - optim_vars.extend(opt_kernel.parameters()) - opt_kernels[_kernel_name] = (hps, opt_kernel) - - # Now that we've optimized the kernels, we convert go convert their - # parameters into a tensor we can further refine with some optimizer iterations - # - Optimize kernel-lengthscales, kernel-outputscale, noise-variance - # and any additional parameters they wish to advertise. - noise_variance = nn.Parameter( - torch.tensor(1e-3, device=self.device, dtype=torch.float64) +T = TypeVar("T") + + +def default_likelihood_with_prior() -> gpytorch.likelihoods.GaussianLikelihood: + # The effect of the likelihood of noise is pretty crucial w.r.t. + # whether we are going to overfit every point by overfitting with + # the lengthscale, or whether we smooth through and assume variation + # is due to noise. Setting it's prior is hard. For a non-noisy + # function, we'd want it looooowww, like 1e-8 kind of low. For + # even a 0.01% noise, we need that all the way up to 1e-2. Hence + # + # If we had 10% noise and we allow the noise to easily optimize towards + # 1e-8, then the lengthscales are forced to beome very small, essentially + # overfitting. If we have 0% noise and we don't allow it to easily get low + # then we will drastically underfit. + # A guiding principle here is that we should allow the noise to be just + # as if not slightly easier to tune than the lengthscales. I.e. we prefer + # smoother functions as it is easier to acquisition over. However once we + # over smooth and underfit, any new observations that inform us otherwise + # could just be attributed to noise. 
+ # + # TODO: We may want to move the likelihood inside the GP and decay the + # amount the GP can attribute to noise (reduce std and mean) relative + # to samples seen, effectively reducing the smoothness of the GP over time + noise_mean = 1e-2 + noise_std = math.sqrt(3) + _noise_prior = gpytorch.priors.LogNormalPrior( + math.log(noise_mean) + noise_std**2, + noise_std, + ) + return gpytorch.likelihoods.GaussianLikelihood( + noise_prior=_noise_prior, + # Going below 1e-6 could introduce a lot of numerical instability in the + # kernels, even if it's a noiseless function + noise_constraint=gpytorch.constraints.Interval( + lower_bound=1e-6, + upper_bound=1, + initial_value=noise_mean, + ), + ) + + +def default_signal_variance_prior() -> gpytorch.priors.NormalPrior: + # The outputscale prior is a bit more tricky. Essentially + # it describes how much we expect the function to move + # around the mean (0 as we normalize the `ys`) + # Based on `Vanilla GP work great in High Dimensions` by Carl Hvarfner, + # where it's fixed to `1.0`, we follow suit but allow some minor deviation + # with a prior. + return gpytorch.priors.NormalPrior(loc=1.0, scale=0.1) + + +def default_lengthscale_prior( + N: int, +) -> tuple[gpytorch.priors.LogNormalPrior, gpytorch.constraints.Interval]: + # Based on `Vanilla GP work great in High Dimensions` by Carl Hvarfner + # TODO: I'm not convinced entirely that the `std` is independent + # of the dimension and number of samples + lengthscale_prior = gpytorch.priors.LogNormalPrior( + loc=math.sqrt(2.0) + math.log(N) / 2, + scale=math.sqrt(3.0), + ) + # NOTE: It's possible to just specify `GreaterThan`, however + # digging through the code, if this ends up at botorch's optimize, + # it will read this and take the bounds and give it to Scipy's + # L-BFGS-B optimizer. Without an upper bound, it defaults to `inf`, + # which can impact gradient estimates. + # tldr; set a bound if you have one, it always helps + lengthscale_constraint = gpytorch.constraints.Interval( + lower_bound=1e-4, + upper_bound=1e3, + initial_value=math.sqrt(2.0) + math.log(N) / 2, + ) + return lengthscale_prior, lengthscale_constraint + + +def default_mean() -> gpytorch.means.ConstantMean: + return gpytorch.means.ConstantMean( + constant_prior=gpytorch.priors.NormalPrior(0, 0.2), + constant_constraint=gpytorch.constraints.Interval( + lower_bound=-1e6, + upper_bound=1e6, + initial_value=0.0, + ), + ) + + +def default_matern_kernel( + N: int, # noqa: N803 + active_dims: tuple[int, ...] | None = None, +) -> ScaleKernel: + lengthscale_prior, lengthscale_constraint = default_lengthscale_prior(N) + + return ScaleKernel( + MaternKernel( + nu=2.5, + ard_num_dims=N, + active_dims=active_dims, + lengthscale_prior=lengthscale_prior, + lengthscale_constraint=lengthscale_constraint, + ), + ) + + +def default_categorical_kernel( + N: int, # noqa: N803 + active_dims: tuple[int, ...]
| None = None, +) -> ScaleKernel: + # Following BoTorches implementation of the MixedSingleTaskGP + return ScaleKernel( + CategoricalKernel( + ard_num_dims=N, + active_dims=active_dims, + lengthscale_constraint=gpytorch.constraints.GreaterThan(1e-6), ) - optim_vars.append(noise_variance) - - if self.optimizer == "adam": - optim = Adam(optim_vars, **self.optimizer_kwargs) # type: ignore - elif self.optimizer == "sgd": - optim = SGD(optim_vars, **self.optimizer_kwargs) # type: ignore + ) + + +def default_single_obj_gp(x: DataPack, y: torch.Tensor) -> SingleTaskGP: + encoder = x.encoder + assert x.tensor is not None + assert encoder.tensors is not None + # Here, we will collect all graph encoded hyperparameters and assign each + # to its own individual WL kernel. + if encoder.graphs is not None: + raise NotImplementedError("Graphs are not yet supported.") + + numerics: list[str] = [] + categoricals: list[str] = [] + for hp_name, transformer in encoder.tensors.transformers.items(): + if isinstance(transformer, CategoricalToIntegerTransformer): + categoricals.append(hp_name) else: - raise ValueError(f"Invalid optimizer {self.optimizer}") - - K_inv: torch.Tensor | None = None - _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - N = len(x) - K = _init((N, N), device=self.device, dtype=torch.float64) - for _i in range(self.optimizer_iters): - optim.zero_grad() - - for _kernel_name, (hps, opt_kernel) in opt_kernels.items(): - data = x.select(hps) - k = opt_kernel.forward(data) - K.add_(k) if self.combined_kernel == "sum" else K.mul_(k) - - K.diag().add_(noise_variance) - K_inv, logDetK = compute_pd_inverse(K) - nlml = -log_marginal_likelihood(K_inv, logDetK, y=self.y_normalized_) - - # TODO: Could early stop here... - nlml.backward() - optim.step() - - with torch.no_grad(): - noise_variance.clamp_(1e-6, np.inf) - - # Apply the optimal hyperparameters - assert K_inv is not None - self.K_inv_ = K_inv.clone() - self.noise_variance_ = noise_variance.item() - self.opt_kernels_ = opt_kernels - self.train_x_ = x - - def predict( - self, - *, - x: DataPack, - ) -> tuple[torch.Tensor, torch.Tensor]: - """Kriging predictions.""" - if ( - self.K_inv_ is None - or self.train_x_ is None - or self.y_normalized_ is None - or self.y_std_ is None - or self.opt_kernels_ is None - ): - raise ValueError( - "Inverse of Gram matrix is not instantiated. Please call the optimize " - "function to fit on the training data first!" 
- ) - - _init = torch.zeros if self.combined_kernel == "sum" else torch.ones - n_test = len(x) - - K_train_test = _init( - len(self.train_x_), n_test, device=self.device, dtype=torch.float64 + numerics.append(hp_name) + + categorical_indices = encoder.indices(categoricals) + numeric_indices = encoder.indices(numerics) + + # Purely vectorial + if len(categorical_indices) == 0: + return SingleTaskGP( + train_X=x.tensor, + train_Y=y, + mean_module=default_mean(), + likelihood=default_likelihood_with_prior(), + # Only matern kernel + covar_module=default_matern_kernel(len(numerics)), + outcome_transform=Standardize(m=1), + ) + + # Purely categorical + if len(numeric_indices) == 0: + return SingleTaskGP( + train_X=x.tensor, + train_Y=y, + mean_module=default_mean(), + likelihood=default_likelihood_with_prior(), + # Only categorical kernel + covar_module=default_categorical_kernel(len(categoricals)), + outcome_transform=Standardize(m=1), ) - for _kernel_name, (hps, opt_kernel) in self.opt_kernels_.items(): - train = self.train_x_.select(hps) - test = x.select(hps) - k = opt_kernel.forward(train, test) - if self.combined_kernel == "sum": - K_train_test.add_(k) - else: - K_train_test.mul_(k) - - K_test_test = _init(n_test, n_test, device=self.device, dtype=torch.float64) - for _kernel_name, (hps, opt_kernel) in self.opt_kernels_.items(): - test = x.select(hps) - k = opt_kernel.forward(test, test) - if self.combined_kernel == "sum": - K_test_test.add_(k) - else: - K_test_test.mul_(k) - - # Compute the predictive mean - - # Scale by the standard deviation and mean - mu_s = K_train_test.t() @ self.K_inv_ @ self.y_normalized_ - mu_s = mu_s * self.y_std_ + self.y_mean_ - - cov_s = K_test_test - K_train_test.t() @ self.K_inv_ @ K_train_test - cov_s *= self.y_std_**2 - - return mu_s, cov_s - - @classmethod - def get_default( - cls, space: SearchSpace, *, include_fidelities: bool = False - ) -> ComprehensiveGP: - kernels = get_default_kernels(space=space, include_fidelities=include_fidelities) - return cls(space=space, kernels=kernels) - - -def get_default_kernels( - *, - space: SearchSpace, - include_fidelities: bool = False, -) -> dict[str, tuple[Kernel, Transformer]]: - kernels: dict[str, tuple[Kernel, Transformer]] = {} - - # We will always need to use a graph kernel for graphs and there's no - # possibility to embed them into a tensor. 
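As a side note on how a `SingleTaskGP` like the ones constructed above is typically used: the model is fitted by maximizing its exact marginal log likelihood and then queried through its posterior. The sketch below uses only standard BoTorch/GPyTorch calls; the tensors, shapes, and fitting step are placeholders for illustration and are not taken from this patch.

import torch
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_x = torch.rand(20, 3, dtype=torch.float64)   # 20 configs, 3 encoded dimensions
train_y = torch.rand(20, 1, dtype=torch.float64)   # observed losses, shape (n, 1)

model = SingleTaskGP(train_X=train_x, train_Y=train_y)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_mll(mll)  # maximizes the marginal log likelihood of the training data

test_x = torch.rand(5, 3, dtype=torch.float64)
posterior = model.posterior(test_x)
mean, variance = posterior.mean, posterior.variance  # predictive mean and variance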
- if any(space.graphs): - for hp_name in space.graphs: - kernels[f"graph_{hp_name}"] = ( - WeisfilerLehman(h=2, oa=True), - WLInputTransformer((hp_name,)), - ) - - assert all( - isinstance(f, (IntegerParameter, FloatParameter)) for f in space.fidelities - ), "Assumption for numeric represetnation of fidelity broken" - - any_numerical = any(space.numerical) or (include_fidelities and any(space.fidelities)) - if any_numerical: - # At least one numerical, fuse numeric + categoricals into one tensor encoding - transformers: list[TensorTransformer] = [] - if any(space.categoricals): - transformers.append(OneHotEncoder(tuple(space.categoricals))) - - if include_fidelities: - min_max_normalizer = MinMaxNormalizer( - tuple(space.numerical) + tuple(space.fidelities) - ) - else: - min_max_normalizer = MinMaxNormalizer(tuple(space.numerical)) - transformers.append(min_max_normalizer) - kernels["vectorial"] = (Matern52Kernel(), JointTransformer.join(*transformers)) - else: - # At this point, we assume only categoricals and maybe fidelities - assert any(space.categoricals) + # Mixed + def cont_kernel_factory( + batch_shape: torch.Size, + ard_num_dims: int, + active_dims: list[int], + ) -> ScaleKernel: + lengthscale_prior, lengthscale_constraint = default_lengthscale_prior( + ard_num_dims + ) + return ScaleKernel( + MaternKernel( + nu=2.5, + batch_shape=batch_shape, + ard_num_dims=ard_num_dims, + active_dims=active_dims, + lengthscale_prior=lengthscale_prior, + lengthscale_constraint=lengthscale_constraint, + ), + ) - if include_fidelities and any(space.fidelities): - fid_normalizer = MinMaxNormalizer(tuple(space.fidelities)) - one_hot_encoder = OneHotEncoder(tuple(space.categoricals)) + return MixedSingleTaskGP( + train_X=x.tensor, + train_Y=y, + cat_dims=list(categorical_indices), + likelihood=default_likelihood_with_prior(), + cont_kernel_factory=cont_kernel_factory, + outcome_transform=Standardize(m=1), + ) - transformer = JointTransformer.join(one_hot_encoder, fid_normalizer) - kernels["vectorial"] = (Matern52Kernel(), transformer) - else: - transformer = CategoricalToIntegerTransformer(tuple(space.categoricals)) - kernels["categorical"] = (HammingKernel(), transformer) - return kernels +def optimize_acq( + acq_fn: AcquisitionFunction, + encoder: DataEncoder, + *, + q: int, + num_restarts: int, + raw_samples: int, + acq_options: Mapping[str, Any] | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + acq_options = acq_options or {} + if encoder.has_graphs(): + raise NotImplementedError("Graphs are not yet supported.") + + assert encoder.tensors is not None + lower = [t.domain.lower for t in encoder.tensors.transformers.values()] + upper = [t.domain.upper for t in encoder.tensors.transformers.values()] + bounds = torch.tensor([lower, upper], dtype=torch.float) + + fixed_categoricals = encoder.categorical_product_indices() + + if not any(fixed_categoricals): + return optimize_acqf( + acq_function=acq_fn, + bounds=bounds, + q=q, + num_restarts=num_restarts, + raw_samples=raw_samples, + **acq_options, + ) + + if len(fixed_categoricals) > 30: + raise ValueError( + "The number of fixed categorical dimensions is too high. " + "This will lead to an explosion in the number of possible " + "combinations. Please reduce the number of fixed categorical " + "dimensions or consider encoding your categoricals in some other format." + ) + + # TODO: we should deterministicall shuffle the fixed_categoricals as the + # underlying function does not. 
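For context on the mixed branch handled next: BoTorch's `optimize_acqf_mixed` expects `fixed_features_list`, a list of dictionaries mapping a tensor column index to a fixed, integer-encoded category value, with one dictionary per categorical combination. The guard above exists because this list grows as the product of all category counts. A minimal sketch of building such a list follows; the column indices and choice counts are made up for illustration.

from itertools import product

# column index in the feature tensor -> encoded values of that categorical (assumed)
cat_columns = {1: [0, 1, 2], 3: [0, 1]}
fixed_features_list = [
    dict(zip(cat_columns.keys(), combo)) for combo in product(*cat_columns.values())
]
# -> [{1: 0, 3: 0}, {1: 0, 3: 1}, {1: 1, 3: 0}, ...], 6 combinations in total.
# optimize_acqf_mixed runs one continuous optimization per combination and keeps the best.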
+ return optimize_acqf_mixed( + acq_function=acq_fn, + bounds=bounds, + num_restarts=num_restarts, + raw_samples=raw_samples, + q=q, + fixed_features_list=fixed_categoricals, # type: ignore + **acq_options, + ) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 19efa6b6..a89a15aa 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,21 +1,22 @@ from __future__ import annotations import random -from itertools import chain -from typing import TYPE_CHECKING, Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping import torch +from botorch.acquisition import ( + LinearMCObjective, + qLogExpectedImprovement, +) from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.acquisition_functions import ( - AcquisitionMapping, DecayingPriorWeightedAcquisition, ) -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, +from neps.optimizers.bayesian_optimization.models.gp import ( + default_single_obj_gp, + optimize_acq, ) -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping -from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP from neps.search_spaces import ( CategoricalParameter, ConstantParameter, @@ -23,16 +24,13 @@ IntegerParameter, SearchSpace, ) -from neps.search_spaces.encoding import Encoder -from neps.utils.common import instance_from_map +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN +from neps.search_spaces.encoding import DataEncoder if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) + from botorch.models.model import Model + + from neps.search_spaces.encoding import DataPack from neps.state import BudgetInfo, Trial # TODO(eddiebergman): Why not just include in the definition of the parameters. @@ -53,10 +51,8 @@ def __init__( pipeline_space: SearchSpace, *, initial_design_size: int = 10, - surrogate_model: str | Any = "gp", - acquisition: str | BaseAcquisition = "EI", + surrogate_model: Literal["gp"] | Callable[[DataPack, torch.Tensor], Model] = "gp", log_prior_weighted: bool = False, - acquisition_sampler: str | AcquisitionSampler = "mutation", random_interleave_prob: float = 0.0, patience: int = 100, budget: None | int | float = None, @@ -67,6 +63,8 @@ def __init__( disable_priors: bool = False, prior_confidence: Literal["low", "medium", "high"] | None = None, sample_default_first: bool = False, + device: torch.device | None = None, + **kwargs: Any, # TODO: Remove ): """Initialise the BO loop. @@ -128,44 +126,20 @@ def __init__( self._initial_design_size = initial_design_size self._random_interleave_prob = random_interleave_prob self._num_error_evaluations: int = 0 + self.device = device self.sample_default_first = sample_default_first + self.encoder: DataEncoder | None = None - if isinstance(surrogate_model, str): - if surrogate_model == "gp": - self.surrogate_model = ComprehensiveGP.get_default( - space=pipeline_space, - include_fidelities=False, - ) - self._encoder = Encoder.default(self.pipeline_space) - else: - raise NotImplementedError( - "Only 'gp' is supported as a surrogate model for now." 
- ) - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) + if surrogate_model == "gp": + self._get_fitted_model = default_single_obj_gp else: - self.surrogate_model = surrogate_model + self._get_fitted_model = surrogate_model - self.acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - ) if self.pipeline_space.has_prior: self.acquisition = DecayingPriorWeightedAcquisition( self.acquisition, log=log_prior_weighted ) - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, - ) if self.pipeline_space.has_prior: for k, v in self.pipeline_space.items(): if v.is_fidelity or isinstance(v, ConstantParameter): @@ -179,6 +153,8 @@ def __init__( ] self.pipeline_space[k].default_confidence_score = confidence + self._cached_sobol_configs: list[dict[str, Any]] | None = None + def ask( self, trials: Mapping[str, Trial], @@ -192,74 +168,102 @@ def ask( for t in trials.values() if t.report is not None and t.report.loss is not None ] - train_x = [t.config for t in completed] - train_y: torch.Tensor = torch.as_tensor([t.report.loss for t in completed]) # type: ignore + x_configs = [t.config for t in completed] + y: torch.Tensor = torch.as_tensor( + [t.report.loss for t in completed], + dtype=torch.float64, + ) # type: ignore + + # We only do single objective for now but may as well include this for when we have MO + if y.ndim == 1: + y = y.unsqueeze(1) pending = [t.config for t in trials.values() if t.state.pending()] + if self.encoder is None: + self.encoder = DataEncoder.default_encoder( + self.pipeline_space, + include_fidelities=False, + ) space = self.pipeline_space - # TODO: This would be better if we could serialize these - # in their encoded form. later... 
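The initial-design branch a little further down draws scrambled Sobol points in the unit hypercube and casts each column into the corresponding hyperparameter's domain. A standalone sketch of that casting step follows; the two concrete domains are invented for illustration, and only the `NumberDomain`/`UNIT_FLOAT_DOMAIN` API from this repository is assumed.

import torch
from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, NumberDomain

domains = [
    NumberDomain.float(1e-4, 1e-1, log=True),  # e.g. a log-scaled learning rate
    NumberDomain.int(1, 8),                    # e.g. a layer count
]
sobol = torch.quasirandom.SobolEngine(dimension=len(domains), scramble=True, seed=5)
unit_samples = sobol.draw(16, dtype=torch.float64)  # shape (16, 2), all values in [0, 1]

columns = [
    d.cast(unit_samples[:, i], frm=UNIT_FLOAT_DOMAIN) for i, d in enumerate(domains)
]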
- for name, hp in space.categoricals.items(): - for config in chain(train_x, pending): - config[name] = hp.choices.index(config[name]) - for name, hp in space.graphs.items(): - for config in chain(train_x, pending): - config[name] = hp.clone().load_from(config[name]) - if len(trials) == 0 and self.sample_default_first and space.has_prior: config = space.sample_default_configuration( patience=self.patience, ignore_fidelity=False - ) + ).hp_values() + elif len(trials) <= self._initial_design_size: - config = space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) + if self._cached_sobol_configs is None: + assert self.encoder.tensors is not None + ndim = len(self.encoder.tensors.transformers) + sobol = torch.quasirandom.SobolEngine( + dimension=ndim, + scramble=True, + seed=5, + ) + + # TODO: Need a better encapsulation of this + x = sobol.draw(self._initial_design_size * ndim, dtype=torch.float64) + hp_normalized_values = [] + for i, (_k, v) in enumerate(self.encoder.tensors.transformers.items()): + tensor = v.domain.cast(x[:, i], frm=UNIT_FLOAT_DOMAIN) + tensor = tensor.unsqueeze(1) if tensor.ndim == 1 else tensor + hp_normalized_values.append(tensor) + + tensor = torch.cat(hp_normalized_values, dim=1) + uniq = torch.unique(tensor, dim=0) + self._cached_sobol_configs = self.encoder.tensors.decode_dicts(uniq) + + if len(trials) <= len(self._cached_sobol_configs): + config = self._cached_sobol_configs[len(trials) - 1] + else: + # The case where sobol sampling couldn't generate enough unique configs + config = space.sample( + patience=self.patience, ignore_fidelity=False, user_priors=False + ).hp_values() + elif random.random() < self._random_interleave_prob: config = space.sample( patience=self.patience, user_priors=False, ignore_fidelity=False - ) + ).hp_values() else: - try: - if len(pending) > 0: - # We want to use hallucinated results for the evaluations that have - # not finished yet. For this we fit a model on the finished - # evaluations and add these to the other results to fit another model. - self.surrogate_model.fit(train_x, train_y) - ys, _ = self.surrogate_model.predict(pending) - train_x += pending - train_y += list(ys.detach().numpy()) - - # TODO: When using a GP, if we've already fit the - # model due to the if stamet above, we only - # need to update the model with the new points. - # fit on all the data again, only the new points... - self.surrogate_model.fit(train_x, train_y) - self.acquisition.set_state(self.surrogate_model) - self.acquisition_sampler.set_state(x=train_x, y=train_y) - for _ in range(self.patience): - config = self.acquisition_sampler.sample(self.acquisition) - if config not in pending: - break - else: - config = space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - - except RuntimeError as e: - self.logger.exception( - "Model could not be updated due to below error. 
Sampling will not use" - " the model.", - exc_info=e, - ) - config = space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) + assert self.encoder is not None + x = self.encoder.encode(x_configs, device=self.device) + if any(pending): + x_pending = self.encoder.encode(pending, device=self.device) + x_pending = x_pending.tensor + assert x_pending is not None + else: + x_pending = None + + model = self._get_fitted_model(x, y) + + N_CANDIDATES_REQUIRED = 1 + N_INITIAL_RANDOM_SAMPLES = 512 + N_RESTARTS = 20 + + candidates, _eis = optimize_acq( + # TODO: We should evaluate whether LogNoisyEI is better than LogEI + acq_fn=qLogExpectedImprovement( + model, + best_f=y.min(), + X_pending=x_pending, + # Unfortunatly, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 + objective=LinearMCObjective(weights=torch.tensor([-1.0])), + ), + encoder=self.encoder, + q=N_CANDIDATES_REQUIRED, + raw_samples=N_INITIAL_RANDOM_SAMPLES, + num_restarts=N_RESTARTS, + acq_options={}, # options to underlying optim function of botorch + ) + config = self.encoder.decode_dicts(candidates)[0] config_id = str(len(trials) + 1) return SampledConfig( id=config_id, - config=config.hp_values(), + config=config, previous_config_id=None, ), optimizer_state diff --git a/neps/optimizers/bayesian_optimization/sobol.py b/neps/optimizers/bayesian_optimization/sobol.py new file mode 100644 index 00000000..e69de29b diff --git a/neps/runtime.py b/neps/runtime.py index 5cf0f29f..7d1cd60f 100644 --- a/neps/runtime.py +++ b/neps/runtime.py @@ -46,7 +46,7 @@ def _default_worker_name() -> str: return f"{os.getpid()}-{isoformat}" -N_FAILED_GET_NEXT_PENDING_ATTEMPTS_BEFORE_ERROR = 10 +N_FAILED_GET_NEXT_PENDING_ATTEMPTS_BEFORE_ERROR = 0 N_FAILED_TO_SET_TRIAL_STATE = 10 Loc = TypeVar("Loc") @@ -388,7 +388,7 @@ def run(self) -> None: # noqa: C901, PLR0915 _repeated_fail_get_next_trial_count = 0 except Exception as e: _repeated_fail_get_next_trial_count += 1 - logger.error( + logger.debug( "Error while trying to get the next trial to evaluate.", exc_info=True ) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 06814862..e3e297de 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -33,18 +33,22 @@ class NumberDomain(Generic[V]): bins: int | None = None dtype: torch.dtype = field(init=False, repr=False) - is_unit: bool = field(init=False, repr=False) + is_unit_float: bool = field(init=False, repr=False) midpoint: V = field(init=False, repr=False) is_log: bool = field(init=False, repr=False) length: V = field(init=False, repr=False) cardinality: int | None = field(init=False, repr=False) + bounds: tuple[V, V] = field(init=False, repr=False) def __post_init__(self): assert isinstance(self.lower, type(self.upper)) - object.__setattr__(self, "is_unit", self.lower == 0 and self.upper == 1) + is_int = isinstance(self.lower, int) object.__setattr__(self, "is_log", self.log_bounds is not None) + object.__setattr__(self, "dtype", torch.int64 if is_int else torch.float64) object.__setattr__( - self, "dtype", torch.int64 if isinstance(self.lower, int) else torch.float64 + self, + "is_unit_float", + self.lower == 0 and self.upper == 1 and is_int and not self.round, ) object.__setattr__(self, "length", self.upper - self.lower) @@ -60,6 +64,7 @@ def __post_init__(self): if self.dtype == torch.int64: mid = int(round(mid)) object.__setattr__(self, 
"midpoint", mid) + object.__setattr__(self, "bounds", (self.lower, self.upper)) @classmethod def float( @@ -107,7 +112,7 @@ def indices(cls, n: int) -> NumberDomain[int]: return NumberDomain.int(0, n - 1) def to_unit(self, x: Tensor) -> Tensor: - if self.is_unit: + if self.is_unit_float: return x # type: ignore if self.log_bounds is not None: @@ -119,7 +124,7 @@ def to_unit(self, x: Tensor) -> Tensor: return (x - lower) / (upper - lower) def from_unit(self, x: Tensor) -> Tensor: - if self.is_unit: + if self.is_unit_float: return x bins = self.bins @@ -146,10 +151,6 @@ def cast( x: Tensor, frm: Domain, ) -> Tensor: - if isinstance(frm, OneHotDomain): - x = torch.argmax(x, dim=1) - frm = frm.int_domain - # NOTE: In general, we should always be able to go through the unit interval # [0, 1] to be able to transform between domains. However sometimes we can # bypass some steps, dependant on the domains, hence the ugliness... @@ -168,7 +169,7 @@ def cast( # Shortcut 2. (From normalized) # The domain we are coming from is already normalized, we only need to lift - if frm.is_unit: + if frm.is_unit_float: return self.from_unit(x) # type: ignore # Shortcut 3. (Log lift) @@ -289,28 +290,6 @@ def unit_float(cls) -> NumberDomain[float]: return UNIT_FLOAT_DOMAIN -@dataclass(frozen=True) -class OneHotDomain: - cardinality: int - int_domain: NumberDomain[int] = field(init=False, repr=False) - - def __post_init__(self): - object.__setattr__( - self, - "int_domain", - NumberDomain.indices(self.cardinality), - ) - - def cast(self, x: Tensor, frm: NumberDomain[int]) -> Tensor: - # Convert to integers first - x = self.int_domain.cast(x, frm) - - # Then one hot encode - buffer = torch.zeros((len(x), self.cardinality)) - buffer.scatter_(1, x.unsqueeze(1), 1) - return buffer - - UNIT_FLOAT_DOMAIN = NumberDomain.float(0.0, 1.0) -Domain: TypeAlias = NumberDomain | OneHotDomain +Domain: TypeAlias = NumberDomain diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index adcaa121..3d9d2928 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,10 +1,12 @@ from __future__ import annotations from dataclasses import dataclass, field +from itertools import chain from typing import ( TYPE_CHECKING, Any, Generic, + Mapping, Sequence, Sized, TypeAlias, @@ -17,13 +19,16 @@ import numpy.typing as npt import torch from grakel.utils import graph_from_networkx +from torch._dynamo.utils import product +from neps.search_spaces.architecture.graph_grammar import GraphParameter from neps.search_spaces.domain import ( UNIT_FLOAT_DOMAIN, Domain, NumberDomain, - OneHotDomain, ) +from neps.search_spaces.hyperparameters.float import FloatParameter +from neps.search_spaces.hyperparameters.integer import IntegerParameter if TYPE_CHECKING: import networkx as nx @@ -47,7 +52,7 @@ class TensorTransformer(Transformer[torch.Tensor], Protocol): def encode( self, - x: list[Any], + x: Sequence[Any], *, out: torch.Tensor | None = None, dtype: torch.dtype | None = None, @@ -57,7 +62,7 @@ def encode( @dataclass class CategoricalToIntegerTransformer(TensorTransformer): - choices: list[Any] + choices: Sequence[Any] domain: NumberDomain = field(init=False) output_cols: int = field(init=False) @@ -68,6 +73,7 @@ def __post_init__(self): self.domain = NumberDomain.indices(len(self.choices)) self.output_cols = 1 + self._lookup = None if len(self.choices) > 3: try: self._lookup = {c: i for i, c in enumerate(self.choices)} @@ -77,7 +83,7 @@ def __post_init__(self): @override def encode( self, - x: 
list[Any], + x: Sequence[Any], *, out: torch.Tensor | None = None, dtype: torch.dtype | None = None, @@ -92,16 +98,16 @@ def encode( else [self.choices.index(c) for c in x] ) + tensor = torch.tensor(values, dtype=torch.int64, device=device) if out is None: - return torch.tensor(values, dtype=dtype, device=device) + return tensor.to(dtype) - assert out.shape == (len(x),), f"{out.shape} != {(len(x),)}" - out[:] = torch.tensor(values, dtype=out.dtype, device=out.device) + out.copy_(tensor.to(out.dtype)).round_() return out @override def decode(self, x: torch.Tensor) -> list[Any]: - return [self.choices[i] for i in x] + return [self.choices[int(i)] for i in torch.round(x).tolist()] # TODO: Maybe add a shift argument, could be useful to have `0` as midpoint @@ -137,8 +143,7 @@ def encode( if out is None: return values - assert out.shape == (len(x),), f"{out.shape} != {(len(x),)}" - out[:] = values + out.copy_(values) return out @override @@ -147,51 +152,6 @@ def decode(self, x: torch.Tensor) -> list[V]: return values.tolist() -@dataclass -class OneHotEncoder(TensorTransformer): - choices: list[Any] - - domain: OneHotDomain = field(init=False) - output_cols: int = field(init=False) - categorical_to_integer: CategoricalToIntegerTransformer = field(init=False) - - def __post_init__(self): - self.categorical_to_integer = CategoricalToIntegerTransformer(self.choices) - self.output_cols = len(self.choices) - - @override - def encode( - self, - x: list[Any], - *, - out: torch.Tensor | None = None, - dtype: torch.dtype | None = None, - device: torch.device | None = None, - ) -> torch.Tensor: - if out is not None: - dtype = out.dtype - device = out.device - else: - dtype = torch.float64 if dtype is None else dtype - - ints = self.categorical_to_integer.encode(x, dtype=torch.int64, device=device) - shape = (len(x), self.output_cols) - if out is None: - buffer = torch.zeros(size=shape, dtype=dtype, device=device) - else: - assert out.shape == shape, f"{out.shape} != {shape}" - buffer = out - - cat_tensor = torch.tensor(ints, dtype=torch.int64, device=device).unsqueeze(1) - buffer.scatter_(1, cat_tensor, 1) - return buffer - - @override - def decode(self, x: torch.Tensor) -> list[Any]: - ints = torch.argmax(x, dim=1) - return self.categorical_to_integer.decode(ints) - - @dataclass class WLInputTransformer(Transformer[WLInput]): hp: str @@ -199,7 +159,7 @@ class WLInputTransformer(Transformer[WLInput]): def encode(self, x: Sequence[nx.Graph]) -> list[WLInput]: return [graph_from_networkx(g) for g in x] # type: ignore - def decode(self, x: dict[str, list[WLInput]]) -> dict[str, list[Any]]: + def decode(self, x: Mapping[str, Sequence[WLInput]]) -> dict[str, list[Any]]: raise NotImplementedError("Cannot decode WLInput to values.") @@ -224,10 +184,10 @@ def select( return x[:, [self.column_lookup[h] for h in hp]] - def encode(self, x: list[SearchSpace]) -> npt.NDArray[np.object_]: + def encode(self, x: Sequence[Any]) -> npt.NDArray[np.object_]: buffer = np.empty((len(x), len(self.transformers)), dtype=np.object_) for hp, transformer in self.transformers.items(): - values = [conf[hp].value for conf in x] + values = [conf[hp] for conf in x] buffer[:, self.column_lookup[hp]] = transformer.encode(values) # type: ignore return buffer @@ -259,7 +219,7 @@ def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: def encode( self, - x: list[SearchSpace], + x: Sequence[Mapping[str, Any]], *, device: torch.device | None = None, ) -> torch.Tensor: @@ -269,11 +229,12 @@ def encode( for hp_name, 
transformer in self.transformers.items(): values = [conf[hp_name] for conf in x] lookup = self.column_lookup[hp_name] + lookup = lookup[0] if lookup[1] - lookup[0] == 1 else slice(*lookup) # Encode directly into buffer transformer.encode( values, - out=buffer[:, slice(*lookup)], + out=buffer[:, lookup], dtype=torch.float64, device=device, ) @@ -284,7 +245,12 @@ def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): lookup = self.column_lookup[hp_name] - values[hp_name] = transformer.decode(x[:, slice(*lookup)]) + if lookup[1] == lookup[0] + 1: + tensor = x[:, lookup[0]] + else: + tensor = x[:, slice(*lookup)] + + values[hp_name] = transformer.decode(tensor) keys = list(values.keys()) return [dict(zip(keys, vals)) for vals in zip(*values.values())] @@ -297,13 +263,13 @@ class DataEncoder: def encode( self, - x: list[SearchSpace], + x: Sequence[Mapping[str, Any]], *, device: torch.device | None = None, - ) -> tuple[torch.Tensor | None, npt.NDArray[np.object_] | None]: + ) -> DataPack: tensor = self.tensors.encode(x, device=device) if self.tensors else None graphs = self.graphs.encode(x) if self.graphs else None - return tensor, graphs + return DataPack(encoder=self, tensor=tensor, graphs=graphs) @overload def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: ... @@ -358,21 +324,110 @@ def decode_dicts( assert graph_values is not None return graph_values + def indices(self, hp: str | Sequence[str]) -> tuple[int, ...]: + if isinstance(hp, str): + if self.tensors and hp in self.tensors.transformers: + lower, upper = self.tensors.column_lookup[hp] + return tuple(torch.arange(lower, upper).tolist()) + + if self.graphs and hp in self.graphs.transformers: + raise ValueError("Cannot select indices from graphs.") + + tkeys = None if self.tensors is None else self.tensors.transformers.keys() + gkeys = None if self.graphs is None else self.graphs.transformers.keys() + raise KeyError( + f"Unknown hyperparameter {hp}. 
Not in either tensors or graphs" + f"\nTensors: {tkeys}" + f"\nGraphs: {gkeys}" + ) + + return tuple(sorted(chain.from_iterable(self.indices(h) for h in hp))) + + @classmethod + def default_encoder( + cls, + space: SearchSpace, + *, + include_fidelities: bool | list[str] = False, + ) -> DataEncoder: + tensor_transformers: dict[str, TensorTransformer] = {} + graph_transformers: dict[str, WLInputTransformer] = {} + + for hp_name, hp in space.categoricals.items(): + tensor_transformers[hp_name] = CategoricalToIntegerTransformer(hp.choices) + + for hp_name, hp in space.numerical.items(): + assert isinstance(hp, (FloatParameter, IntegerParameter)) + tensor_transformers[hp_name] = MinMaxNormalizer(hp.domain) + + for hp_name, hp in space.graphs.items(): + assert isinstance(hp, GraphParameter) + graph_transformers[hp_name] = WLInputTransformer(hp_name) + + if include_fidelities is True: + include_fidelities = list(space.fidelities.keys()) + + if include_fidelities: + for fid_name in include_fidelities: + hp = space.fidelities[fid_name] + assert isinstance(hp, (FloatParameter, IntegerParameter)) + tensor_transformers[fid_name] = MinMaxNormalizer(hp.domain) + + tensor_encoder = ( + TensorEncoder(tensor_transformers) if any(tensor_transformers) else None + ) + graph_encoder = ( + GraphEncoder(graph_transformers) if any(graph_transformers) else None + ) + return DataEncoder(tensors=tensor_encoder, graphs=graph_encoder) + + def has_categoricals(self) -> bool: + return self.tensors is not None and any( + isinstance(t, CategoricalToIntegerTransformer) + for t in self.tensors.transformers.values() + ) + + def has_graphs(self) -> bool: + return self.graphs is not None + + def has_numericals(self) -> bool: + return self.tensors is not None and any( + not isinstance(t, CategoricalToIntegerTransformer) + for t in self.tensors.transformers.values() + ) + + def categorical_product_indices(self) -> list[dict[int, int]]: + cats: dict[int, list[int]] = {} + if self.tensors is None: + return [] + + for i, (_hp_name, transformer) in enumerate(self.tensors.transformers.items()): + if isinstance(transformer, CategoricalToIntegerTransformer): + cats[i] = list(range(len(transformer.choices))) + + if len(cats) == 0: + return [] + + if len(cats) == 1: + key, values = cats.popitem() + return [{key: v} for v in values] + + return [dict(zip(cats.keys(), vs)) for vs in product(*cats.values())] + @dataclass class DataPack(Sized): - space: SearchSpace encoder: DataEncoder - numerical: torch.Tensor | None = None + tensor: torch.Tensor | None = None graphs: npt.NDArray[np.object_] | None = None _len: int = field(init=False) def __post_init__(self): - if self.numerical is not None and self.graphs is not None: - assert len(self.numerical) == len(self.graphs) - self._len = len(self.numerical) - elif self.numerical is not None: - self._len = len(self.numerical) + if self.tensor is not None and self.graphs is not None: + assert len(self.tensor) == len(self.graphs) + self._len = len(self.tensor) + elif self.tensor is not None: + self._len = len(self.tensor) elif self.graphs is not None: self._len = len(self.graphs) else: @@ -384,8 +439,8 @@ def __len__(self) -> int: def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.object_]: if isinstance(hp, str): if self.encoder.tensors and hp in self.encoder.tensors.transformers: - assert self.numerical is not None - return self.encoder.tensors.select(self.numerical, hp) + assert self.tensor is not None + return self.encoder.tensors.select(self.tensor, hp) if 
self.encoder.graphs and hp in self.encoder.graphs.transformers: assert self.graphs is not None @@ -427,16 +482,67 @@ def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.objec ) if all_in_tensors: - assert self.numerical is not None + assert self.tensor is not None assert self.encoder.tensors is not None - return self.encoder.tensors.select(self.numerical, hp) + return self.encoder.tensors.select(self.tensor, hp) assert self.graphs is not None assert self.encoder.graphs is not None return self.encoder.graphs.select(self.graphs, hp) - def decode(self) -> list[SearchSpace]: + def decode(self, space: SearchSpace) -> list[SearchSpace]: return [ - self.space.from_dict(d) - for d in self.encoder.decode_dicts((self.numerical, self.graphs)) + space.from_dict(d) + for d in self.encoder.decode_dicts((self.tensor, self.graphs)) ] + + def split(self, index: int) -> tuple[DataPack, DataPack]: + if self.tensor is not None: + numerical_left = self.tensor[:index] + numerical_right = self.tensor[index:] + else: + numerical_left = None + numerical_right = None + + if self.graphs is not None: + graphs_left = self.graphs[:index] + graphs_right = self.graphs[:index] + else: + graphs_left = None + graphs_right = None + + return ( + DataPack( + self.encoder, + tensor=numerical_left, + graphs=graphs_left, + ), + DataPack( + self.encoder, + tensor=numerical_right, + graphs=graphs_right, + ), + ) + + def join(self, *other: DataPack) -> DataPack: + assert all(o.encoder == self.encoder for o in other) + + if self.tensor is not None: + other_numericals = [] + for o in other: + assert o.tensor is not None + other_numericals.append(o.tensor) + numerical = torch.cat([self.tensor, *other_numericals], dim=0) + else: + numerical = None + + if self.graphs is not None: + other_graphs = [] + for o in other: + assert o.graphs is not None + other_graphs.append(o.graphs) + graphs = np.concatenate([self.graphs, *other_graphs], axis=0) + else: + graphs = None + + return DataPack(self.encoder, tensor=numerical, graphs=graphs) diff --git a/neps/search_spaces/hyperparameters/float.py b/neps/search_spaces/hyperparameters/float.py index b780f3ff..6086e3b7 100644 --- a/neps/search_spaces/hyperparameters/float.py +++ b/neps/search_spaces/hyperparameters/float.py @@ -8,6 +8,7 @@ import numpy as np +from neps.search_spaces.domain import NumberDomain from neps.search_spaces.hyperparameters.numerical import NumericalParameter if TYPE_CHECKING: @@ -70,6 +71,7 @@ def __init__( default=float(default) if default is not None else None, default_confidence=default_confidence, is_fidelity=is_fidelity, + domain=NumberDomain.float(lower, upper, log=log), ) @override diff --git a/neps/search_spaces/hyperparameters/integer.py b/neps/search_spaces/hyperparameters/integer.py index 6462cc63..da3bbd71 100644 --- a/neps/search_spaces/hyperparameters/integer.py +++ b/neps/search_spaces/hyperparameters/integer.py @@ -7,6 +7,7 @@ import numpy as np +from neps.search_spaces.domain import NumberDomain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.numerical import NumericalParameter @@ -75,6 +76,7 @@ def __init__( is_fidelity=is_fidelity, default=int(np.rint(default)) if default is not None else None, default_confidence=default_confidence, + domain=NumberDomain.int(lower, upper, log=log), ) # We subtract/add 0.499999 from lower/upper bounds respectively, such that diff --git a/neps/search_spaces/hyperparameters/numerical.py b/neps/search_spaces/hyperparameters/numerical.py index 
9aaaf6d1..f00b590c 100644 --- a/neps/search_spaces/hyperparameters/numerical.py +++ b/neps/search_spaces/hyperparameters/numerical.py @@ -32,6 +32,7 @@ from neps.search_spaces.parameter import MutatableParameter, ParameterWithPrior if TYPE_CHECKING: + from neps.search_spaces.domain import NumberDomain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.utils.types import TruncNorm @@ -81,6 +82,7 @@ def __init__( log: bool = False, default: T | None, is_fidelity: bool, + domain: NumberDomain[T], default_confidence: Literal["low", "medium", "high"] = "low", ): """Initialize the numerical hyperparameter. @@ -133,6 +135,7 @@ def __init__( self.lower: T = lower self.upper: T = upper self.log: bool = log + self.domain = domain self.log_value: float | None = None self.log_bounds: tuple[float, float] | None = None self.log_default: float | None = None diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index 2a20399d..a89c9bcc 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -2,19 +2,30 @@ import time import numpy as np +import math +import random import neps -def run_pipeline(float1, float2, categorical, integer1, integer2): - loss = -float(np.sum([float1, float2, int(categorical), integer1, integer2])) +def run_pipeline(float1, float2, float3, categorical, integer1, integer2): + loss = -float( + np.sum( + [ + (float1 * float2 / (float3 + 1)) * int(categorical), + integer1, + math.log(integer2), + ] + ) + ) # Random noise # time.sleep(0.7) # For demonstration purposes return loss pipeline_space = dict( float1=neps.FloatParameter(lower=0, upper=1), - float2=neps.FloatParameter(lower=-10, upper=10), + float2=neps.FloatParameter(lower=0, upper=20), + float3=neps.FloatParameter(lower=0, upper=5), categorical=neps.CategoricalParameter(choices=[0, 1]), integer1=neps.IntegerParameter(lower=0, upper=1), integer2=neps.IntegerParameter(lower=1, upper=1000, log=True), From cf43821cef6cd38cb53b4cfbdb50b3ca7f776f68 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 18:03:25 +0200 Subject: [PATCH 12/63] refactor: Prior distributions --- .../acquisition_functions/__init__.py | 9 +- .../acquisition_functions/prior_weighted.py | 56 +++- .../bayesian_optimization/cost_cooling.py | 49 ++-- .../bayesian_optimization/models/gp.py | 19 +- .../bayesian_optimization/optimizer.py | 273 +++++++----------- neps/runtime.py | 7 +- neps/search_spaces/domain.py | 204 ++++++------- neps/search_spaces/encoding.py | 73 +++-- neps/search_spaces/hyperparameters/float.py | 4 +- neps/search_spaces/hyperparameters/integer.py | 4 +- .../hyperparameters/numerical.py | 4 +- neps/state/neps_state.py | 140 ++++----- neps/state/optimizer.py | 12 +- pyproject.toml | 1 + 14 files changed, 423 insertions(+), 432 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py index 89cfb4fb..0d0893a5 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py @@ -7,14 +7,13 @@ ComprehensiveExpectedImprovement, ) from neps.optimizers.bayesian_optimization.acquisition_functions.mf_ei import MFEI -from neps.optimizers.bayesian_optimization.acquisition_functions.ucb import ( - UpperConfidenceBound, - MF_UCB, -) from 
neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) - +from neps.optimizers.bayesian_optimization.acquisition_functions.ucb import ( + MF_UCB, + UpperConfidenceBound, +) AcquisitionMapping: dict[str, Callable] = { "EI": partial( diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py index ca3a3f5b..7b0d4318 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py @@ -1,9 +1,59 @@ -from typing import Iterable, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterable +from typing_extensions import override import numpy as np import torch +from botorch.acquisition import MCAcquisitionFunction + +from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, +) + +if TYPE_CHECKING: + from neps.priors import Prior + + +class PiboAcquisition(MCAcquisitionFunction): + """Compute a prior weighted acquisition function according to PiBO. + + * https://arxiv.org/pdf/2204.11051 + """ + + def __init__( + self, + acq_fn: MCAcquisitionFunction, + prior: Prior, + beta: float, + n: int, + ): + """Initialize the acquisition function. + + Args: + acq_fn: The acquisition function to be weighted. + prior: The prior distribution to be used for weighting. + beta: The beta parameter for weighting. + n: The denominator for the beta parameter. + """ + self._log = self.acq_fn._log + self.acq_fn = acq_fn + + self.beta = beta + self.n = n + self.prior = prior + + @override + def forward(self, X: torch.Tensor) -> torch.Tensor: + weight = self.beta / self.n + acq = self.acq_fn(X) + + # The weight is shown as being applied to the pdf and not the log_pdf + values = acq * self.prior.prob(X) * weight -from .base_acquisition import BaseAcquisition + # However, if the base acq function advertises as being log, + # i.e. 
self._log, then we should return the log of the values + return torch.log(values) if self._log else values class DecayingPriorWeightedAcquisition(BaseAcquisition): @@ -23,7 +73,7 @@ def eval( self, x: Iterable, **base_acquisition_kwargs, - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: acquisition = self.base_acquisition(x, **base_acquisition_kwargs) if self.log: diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py index 5a8926c7..eb3ee28e 100644 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/cost_cooling.py @@ -1,35 +1,35 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING, Any from typing_extensions import override -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult -from neps.utils.common import instance_from_map +from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( CostCooler, ) -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( - DecayingPriorWeightedAcquisition, -) from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization +from neps.utils.common import instance_from_map + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult class CostCooling(BayesianOptimization): """Implements a basic cost-cooling as described in - "Cost-aware Bayesian Optimization" (https://arxiv.org/abs/2003.10870) by Lee et al.""" + "Cost-aware Bayesian Optimization" (https://arxiv.org/abs/2003.10870) by Lee et al. 
+ """ def __init__( self, @@ -37,12 +37,12 @@ def __init__( initial_design_size: int = 10, surrogate_model: str | Any = "gp", cost_model: str | Any = "gp", - surrogate_model_args: dict = None, - cost_model_args: dict = None, + surrogate_model_args: dict | None = None, + cost_model_args: dict | None = None, optimal_assignment: bool = False, - domain_se_kernel: str = None, - graph_kernels: list = None, - hp_kernels: list = None, + domain_se_kernel: str | None = None, + graph_kernels: list | None = None, + hp_kernels: list | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "mutation", @@ -181,11 +181,6 @@ def __init__( self.acquisition = CostCooler(orig_acquisition) - if self.pipeline_space.has_prior: - self.acquisition = DecayingPriorWeightedAcquisition( - self.acquisition, log=log_prior_weighted - ) - self.acquisition_sampler = instance_from_map( AcquisitionSamplerMapping, acquisition_sampler, @@ -214,7 +209,7 @@ def load_optimization_state( train_y = [self.get_loss(el.result) for el in previous_results.values()] train_cost = [self.get_cost(el.result) for el in previous_results.values()] self._num_train_x = len(train_x) - self._pending_evaluations = [el for el in pending_evaluations.values()] + self._pending_evaluations = list(pending_evaluations.values()) if self._num_train_x >= self._initial_design_size: try: if len(self._pending_evaluations) > 0: diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index ab2884f3..b302edcd 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -144,7 +144,10 @@ def default_categorical_kernel( ) -def default_single_obj_gp(x: DataPack, y: torch.Tensor) -> SingleTaskGP: +def default_single_obj_gp( + x: DataPack, + y: torch.Tensor, +) -> SingleTaskGP: encoder = x.encoder assert x.tensor is not None assert encoder.tensors is not None @@ -222,9 +225,9 @@ def optimize_acq( acq_fn: AcquisitionFunction, encoder: DataEncoder, *, - q: int, - num_restarts: int, - raw_samples: int, + n_candidates_required: int = 1, + num_restarts: int = 20, + n_intial_start_points: int = 512, acq_options: Mapping[str, Any] | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: acq_options = acq_options or {} @@ -242,9 +245,9 @@ def optimize_acq( return optimize_acqf( acq_function=acq_fn, bounds=bounds, - q=q, + q=n_candidates_required, num_restarts=num_restarts, - raw_samples=raw_samples, + raw_samples=n_intial_start_points, **acq_options, ) @@ -262,8 +265,8 @@ def optimize_acq( acq_function=acq_fn, bounds=bounds, num_restarts=num_restarts, - raw_samples=raw_samples, - q=q, + raw_samples=n_intial_start_points, + q=n_candidates_required, fixed_features_list=fixed_categoricals, # type: ignore **acq_options, ) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index a89a15aa..a08578d6 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,6 +1,6 @@ from __future__ import annotations -import random +import math from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping import torch @@ -10,38 +10,25 @@ ) from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.acquisition_functions import ( - DecayingPriorWeightedAcquisition, +from 
neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( + PiboAcquisition, ) from neps.optimizers.bayesian_optimization.models.gp import ( default_single_obj_gp, optimize_acq, ) -from neps.search_spaces import ( - CategoricalParameter, - ConstantParameter, - FloatParameter, - IntegerParameter, - SearchSpace, -) -from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN +from neps.optimizers.initial_design import Sobol from neps.search_spaces.encoding import DataEncoder if TYPE_CHECKING: from botorch.models.model import Model + from neps.search_spaces import ( + SearchSpace, + ) from neps.search_spaces.encoding import DataPack from neps.state import BudgetInfo, Trial -# TODO(eddiebergman): Why not just include in the definition of the parameters. -CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) -CUSTOM_FLOAT_CONFIDENCE_SCORES.update({"ultra": 0.05}) - -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES = dict( - CategoricalParameter.DEFAULT_CONFIDENCE_SCORES -) -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES.update({"ultra": 8}) - class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" @@ -50,18 +37,9 @@ def __init__( self, pipeline_space: SearchSpace, *, - initial_design_size: int = 10, + initial_design_size: int | None = None, surrogate_model: Literal["gp"] | Callable[[DataPack, torch.Tensor], Model] = "gp", - log_prior_weighted: bool = False, - random_interleave_prob: float = 0.0, - patience: int = 100, - budget: None | int | float = None, - ignore_errors: bool = False, - loss_value_on_error: None | float = None, - cost_value_on_error: None | float = None, - logger=None, - disable_priors: bool = False, - prior_confidence: Literal["low", "medium", "high"] | None = None, + use_priors: bool = False, sample_default_first: bool = False, device: torch.device | None = None, **kwargs: Any, # TODO: Remove @@ -70,95 +48,49 @@ def __init__( Args: pipeline_space: Space in which to search - initial_design_size: Number of 'x' samples that need to be evaluated before - selecting a sample using a strategy instead of randomly. + initial_design_size: Number of samples used before using the surrogate model. + If None, it will take `int(log(N) ** 2)` samples where `N` is the number + of parameters in the search space. surrogate_model: Surrogate model - acquisition: Acquisition strategy - log_prior_weighted: if to use log for prior - acquisition_sampler: Acquisition function fetching strategy - random_interleave_prob: Frequency at which random configurations are sampled - instead of configurations from the acquisition strategy. - patience: How many times we try something that fails before giving up. - budget: Maximum budget - ignore_errors: Ignore hyperparameter settings that threw an error and do not - raise an error. Error configs still count towards max_evaluations_total. - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error during bayesian optimization and will use given loss - value instead. default: None - cost_value_on_error: Setting this and loss_value_on_error to any float will - supress any error during bayesian optimization and will use given cost - value instead. 
default: None - logger: logger object, or None to use the neps logger - disable_priors: allows to choose between BO and piBO regardless the search - space definition - sample_default_first: if True and a default prior exists, the first sampel is - the default configuration + use_priors: Whether to use priors set on the hyperparameters during search. Raises: - ValueError: if patience < 1 ValueError: if initial_design_size < 1 - ValueError: if random_interleave_prob is not between 0.0 and 1.0 ValueError: if no kernel is provided """ - if disable_priors: - pipeline_space.has_prior = False - self.prior_confidence = None - else: - self.prior_confidence = prior_confidence - - super().__init__( - pipeline_space=pipeline_space, - patience=patience, - logger=logger, - budget=budget, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - ) - - if initial_design_size < 1: + if initial_design_size is None: + N = len(pipeline_space.hyperparameters) + initial_design_size = int(max(1, math.log(N) ** 2)) + elif initial_design_size < 1: raise ValueError( "BayesianOptimization needs initial_design_size to be at least 1" ) - if not 0 <= random_interleave_prob <= 1: - raise ValueError("random_interleave_prob should be between 0.0 and 1.0") - self._initial_design_size = initial_design_size - self._random_interleave_prob = random_interleave_prob - self._num_error_evaluations: int = 0 + super().__init__(pipeline_space=pipeline_space) + + self.use_priors = use_priors + + # TODO: This needs to be moved to the search space class, however to not break + # the current prior based APIs, we will create this manually here + if use_priors: + self._prior_confidences = {} + self.device = device self.sample_default_first = sample_default_first - self.encoder: DataEncoder | None = None + self.n_initial_design = initial_design_size if surrogate_model == "gp": self._get_fitted_model = default_single_obj_gp else: self._get_fitted_model = surrogate_model - if self.pipeline_space.has_prior: - self.acquisition = DecayingPriorWeightedAcquisition( - self.acquisition, log=log_prior_weighted - ) - - if self.pipeline_space.has_prior: - for k, v in self.pipeline_space.items(): - if v.is_fidelity or isinstance(v, ConstantParameter): - continue - elif isinstance(v, (FloatParameter, IntegerParameter)): - confidence = CUSTOM_FLOAT_CONFIDENCE_SCORES[self.prior_confidence] - self.pipeline_space[k].default_confidence_score = confidence - elif isinstance(v, CategoricalParameter): - confidence = CUSTOM_CATEGORICAL_CONFIDENCE_SCORES[ - self.prior_confidence - ] - self.pipeline_space[k].default_confidence_score = confidence - - self._cached_sobol_configs: list[dict[str, Any]] | None = None + self.encoder_: DataEncoder | None = None + self.initial_design_: list[dict[str, Any]] | None = None def ask( self, trials: Mapping[str, Trial], - budget_info: BudgetInfo | None, + budget_info: BudgetInfo, optimizer_state: dict[str, Any], ) -> tuple[SampledConfig, dict[str, Any]]: # TODO: Lift this into runtime, let the @@ -174,96 +106,91 @@ def ask( dtype=torch.float64, ) # type: ignore - # We only do single objective for now but may as well include this for when we have MO + # We only do single objective for now but may as well include this + # for when we have MO if y.ndim == 1: y = y.unsqueeze(1) pending = [t.config for t in trials.values() if t.state.pending()] - if self.encoder is None: - self.encoder = DataEncoder.default_encoder( + if self.encoder_ is None: + self.encoder_ = 
DataEncoder.default_encoder( self.pipeline_space, include_fidelities=False, ) space = self.pipeline_space - if len(trials) == 0 and self.sample_default_first and space.has_prior: - config = space.sample_default_configuration( - patience=self.patience, ignore_fidelity=False - ).hp_values() - - elif len(trials) <= self._initial_design_size: - if self._cached_sobol_configs is None: - assert self.encoder.tensors is not None - ndim = len(self.encoder.tensors.transformers) - sobol = torch.quasirandom.SobolEngine( - dimension=ndim, - scramble=True, - seed=5, - ) + if self.initial_design_ is None: + size = self.n_initial_design + self.initial_design_ = [] - # TODO: Need a better encapsulation of this - x = sobol.draw(self._initial_design_size * ndim, dtype=torch.float64) - hp_normalized_values = [] - for i, (_k, v) in enumerate(self.encoder.tensors.transformers.items()): - tensor = v.domain.cast(x[:, i], frm=UNIT_FLOAT_DOMAIN) - tensor = tensor.unsqueeze(1) if tensor.ndim == 1 else tensor - hp_normalized_values.append(tensor) + if self.sample_default_first: + config = space.sample_default_configuration() + self.initial_design_.append(config.hp_values()) - tensor = torch.cat(hp_normalized_values, dim=1) - uniq = torch.unique(tensor, dim=0) - self._cached_sobol_configs = self.encoder.tensors.decode_dicts(uniq) + assert self.encoder_.tensors is not None + sobol = Sobol(seed=0, encoder=self.encoder_, allow_undersampling=True) + sobol_configs = sobol.sample(size - len(self.initial_design_)) + self.initial_design_.extend(sobol_configs) + else: + self.initial_design_ = [] - if len(trials) <= len(self._cached_sobol_configs): - config = self._cached_sobol_configs[len(trials) - 1] - else: - # The case where sobol sampling couldn't generate enough unique configs - config = space.sample( - patience=self.patience, ignore_fidelity=False, user_priors=False - ).hp_values() + config_id = str(len(trials) + 1) + if len(trials) < len(self.initial_design_): + config = self.initial_design_[len(trials)] + return ( + SampledConfig(id=config_id, config=config, previous_config_id=None), + optimizer_state, + ) - elif random.random() < self._random_interleave_prob: - config = space.sample( - patience=self.patience, user_priors=False, ignore_fidelity=False - ).hp_values() + assert self.encoder_ is not None + x = self.encoder_.encode(x_configs, device=self.device) + if any(pending): + x_pending = self.encoder_.encode(pending, device=self.device) + x_pending = x_pending.tensor + assert x_pending is not None else: - assert self.encoder is not None - x = self.encoder.encode(x_configs, device=self.device) - if any(pending): - x_pending = self.encoder.encode(pending, device=self.device) - x_pending = x_pending.tensor - assert x_pending is not None - else: - x_pending = None + x_pending = None - model = self._get_fitted_model(x, y) + model = self._get_fitted_model(x, y) - N_CANDIDATES_REQUIRED = 1 - N_INITIAL_RANDOM_SAMPLES = 512 - N_RESTARTS = 20 - - candidates, _eis = optimize_acq( - # TODO: We should evaluate whether LogNoisyEI is better than LogEI - acq_fn=qLogExpectedImprovement( - model, - best_f=y.min(), - X_pending=x_pending, - # Unfortunatly, there's no option to indicate that we minimize - # the AcqFunction so we need to do some kind of transformation. 
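On the minimization workaround referenced in the comments here: BoTorch acquisition functions assume maximization, so minimizing a loss is commonly expressed by negating the outcomes through a `LinearMCObjective`. A minimal sketch of that pattern follows, with placeholder tensors and an unfitted model for brevity; in this sketch `best_f` is given on the transformed (negated) scale.

import torch
from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement
from botorch.models import SingleTaskGP

train_x = torch.rand(20, 3, dtype=torch.float64)
train_y = torch.rand(20, 1, dtype=torch.float64)        # losses, lower is better
model = SingleTaskGP(train_X=train_x, train_Y=train_y)  # fitting omitted for brevity

acq = qLogExpectedImprovement(
    model,
    best_f=(-train_y).max(),  # incumbent under objective(y) = -y
    objective=LinearMCObjective(weights=torch.tensor([-1.0])),
)
value = acq(torch.rand(1, 1, 3, dtype=torch.float64))   # one candidate batch of q=1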
- # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 - objective=LinearMCObjective(weights=torch.tensor([-1.0])), - ), - encoder=self.encoder, - q=N_CANDIDATES_REQUIRED, - raw_samples=N_INITIAL_RANDOM_SAMPLES, - num_restarts=N_RESTARTS, - acq_options={}, # options to underlying optim function of botorch - ) - config = self.encoder.decode_dicts(candidates)[0] + acq = qLogExpectedImprovement( + model, + best_f=y.min(), + X_pending=x_pending, + objective=LinearMCObjective(weights=torch.tensor([-1.0])), + ) - config_id = str(len(trials) + 1) - return SampledConfig( - id=config_id, - config=config, - previous_config_id=None, - ), optimizer_state + if self.use_priors: + # From the PIBO paper (Section 4.1) + # https://arxiv.org/pdf/2204.11051 + if budget_info.max_evaluations is not None: + beta = budget_info.max_evaluations / 10 + n = budget_info.used_evaluations + elif budget_info.max_cost_budget is not None: + # This might not work well if cost number is high + # early on, but it will start to normalize. + beta = budget_info.max_cost_budget / 10 + n = budget_info.used_cost_budget + + acq = PiboAcquisition(acq, n=n, beta=beta) + + candidates, _eis = optimize_acq( + # TODO: We should evaluate whether LogNoisyEI is better than LogEI + acq_fn=qLogExpectedImprovement( + model, + best_f=y.min(), + X_pending=x_pending, + # Unfortunatly, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 + objective=LinearMCObjective(weights=torch.tensor([-1.0])), + ), + encoder=self.encoder_, + acq_options={}, # options to underlying optim function of botorch + ) + config = self.encoder_.decode_dicts(candidates)[0] + return ( + SampledConfig(id=config_id, config=config, previous_config_id=None), + optimizer_state, + ) diff --git a/neps/runtime.py b/neps/runtime.py index 7d1cd60f..b102b153 100644 --- a/neps/runtime.py +++ b/neps/runtime.py @@ -513,7 +513,12 @@ def _launch_runtime( # noqa: PLR0913 optimizer_info=OptimizerInfo(optimizer_info), optimizer_state=OptimizationState( budget=( - BudgetInfo(max_cost_budget=max_cost_total, used_cost_budget=0) + BudgetInfo( + max_cost_budget=max_cost_total, + used_cost_budget=0, + max_evaluations=max_evaluations_total, + used_evaluations=0, + ) if max_cost_total is not None else None ), diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index e3e297de..2081bf33 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -1,31 +1,58 @@ +"""A class representing a domain, a range for a value + properties. + +Some properties include: + +* The lower and upper bounds of the domain. +* Whether the domain is a log domain. +* Whether the domain is float/int. +* The midpoint of the domain. +* Whether the domain is split into bins. + +With that, the primary method of a domain is to be able to cast +values from one to domain to another, +e.g. `values_a = domain_a.cast(values_b, frm=domain_b)`. + +This can be used to convert float samples to integers, integers +to log space, etc. + +The core method to do so is to be able to cast `to_unit` which takes +values to a unit interval [0, 1], and then to be able to cast values in [0, 1] +to the new domain with `from_unit`. + +There are some shortcuts implemented in `cast`, such as skipping going through +the unit interval if the domains are the same, as no transformation is needed. 
+ +The primary methods for creating a domain are + +* `Domain.float(l, u, ...)` - Used for modelling float ranges +* `Domain.int(l, u, ...)` - Used for modelling integer ranges +* `Domain.indices(n)` - Primarly used to model categorical choices + +If you have a tensor of values, where each column corresponds to a different domain, +you can take a look at `Domain.cast_many` to cast all the values in one go. + +If you need a unit-interval domain, please use the `Domain.unit_float()` or +`UNIT_FLOAT_DOMAIN` constant. +""" + # TODO: Could theoretically implement dtype,device,out for all methods here but # would need to be careful not to accidentally send to and from GPU. from __future__ import annotations import math from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Generic, TypeVar -from typing_extensions import TypeAlias +from typing import Generic, Sequence, TypeVar import torch from torch import Tensor -if TYPE_CHECKING: - from neps.search_spaces.distributions.truncnorm import TruncNormDistribution - from neps.search_spaces.distributions.uniform_float import ( - UniformFloatDistribution, - ) - from neps.search_spaces.distributions.uniform_int import UniformIntDistribution - from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution - - Number = int | float V = TypeVar("V", int, float) V2 = TypeVar("V2", int, float) @dataclass(frozen=True) -class NumberDomain(Generic[V]): +class Domain(Generic[V]): lower: V upper: V round: bool @@ -74,8 +101,8 @@ def float( *, log: bool = False, bins: int | None = None, - ) -> NumberDomain[float]: - return NumberDomain( + ) -> Domain[float]: + return Domain( lower=float(lower), upper=float(upper), log_bounds=(math.log(lower), math.log(upper)) if log else None, @@ -91,8 +118,8 @@ def int( *, log: bool = False, bins: int | None = None, - ) -> NumberDomain[int]: - return NumberDomain( + ) -> Domain[int]: + return Domain( lower=int(round(lower)), upper=int(round(upper)), log_bounds=(math.log(lower), math.log(upper)) if log else None, @@ -101,7 +128,7 @@ def int( ) @classmethod - def indices(cls, n: int) -> NumberDomain[int]: + def indices(cls, n: int) -> Domain[int]: """Create a domain for a range of indices. Like range based functions this domain is inclusive of the lower bound @@ -109,7 +136,7 @@ def indices(cls, n: int) -> NumberDomain[int]: Use this method to create a domain for indices """ - return NumberDomain.int(0, n - 1) + return Domain.int(0, n - 1) def to_unit(self, x: Tensor) -> Tensor: if self.is_unit_float: @@ -187,109 +214,64 @@ def cast( lift = self.from_unit(norm) return lift # noqa: RET504 - def uniform_distribution(self) -> UniformFloatDistribution | UniformIntDistribution: - from neps.search_spaces.distributions import ( - UNIT_UNIFORM, - UniformFloatDistribution, - UniformIntDistribution, - ) - - # (Log Lift) - sample on it's log domain - if self.log_bounds is not None: - return UniformFloatDistribution.new(*self.log_bounds) - - # (Same Domain) - Just sample integers - if self.dtype == torch.int64 and self.bins is None: - return UniformIntDistribution.new(self.lower, self.upper) - - # NOTE: There's a possibility where you could use an integer distribution for - # binned domains, however the cost of sampling integers and casting is likely - # higher than just casting from normalized domain. 
Would need to verify this - # In any case, Normalized Uniform Float is a safe choice - - # (From Normalized) - return UNIT_UNIFORM - - def unit_uniform_distribution(self) -> UniformFloatDistribution: - from neps.search_spaces.distributions import UNIT_UNIFORM - - return UNIT_UNIFORM - - def truncnorm_distribution( - self, - center: Number, - *, - confidence: float | None = None, - std: float | None = None, - ) -> TruncNormDistribution: - from neps.search_spaces.distributions import TruncNormDistribution + @classmethod + def unit_float(cls) -> Domain[float]: + return UNIT_FLOAT_DOMAIN - # If you need a unit one, create this and then call `normalize()` on it. - if std is None and confidence is None: + @classmethod + def cast_many( + cls, x: Tensor, frm: Domain | Sequence[Domain], to: Domain | Sequence[Domain] + ) -> Tensor: + """Cast a tensor of mixed domains to a new set of mixed domains. + + Args: + x: Tensor of shape (n_samples, n_dims) with each dim `i` corresponding + to the domain `frm[i]`. + frm: List of domains to cast from. If list, must be length of `n_dims`, + otherwise we assume the single domain provided is the one to be used + across all dimensions. + to: List of domains to cast to. If list, must be length as `n_dims`, + otherwise we assume the single domain provided is the one to be used + across all dimensions. + + Returns: + Tensor of shape (n_samples, n_dims) with each dim `i` transformed + from the domain `frm[i]` to the domain `to[i]`. + """ + if x.ndim == 1: raise ValueError( - "Must specify either `std` in (lower, upper) or `confidence` in (0, 1)" + "Expected a 2D tensor of shape (n_samples, n_dims), got a 1D tensor." ) - if std is None: - assert 0 <= confidence <= 1 # type: ignore - _std = float(1 - confidence) # type: ignore - _is_normalized = True - else: - _std = float(std) - _is_normalized = False - - # (Log Lift) - sample on it's log domain - if self.log_bounds is not None: - return TruncNormDistribution.new( - lower=self.log_bounds[0], - center=math.log(center), - upper=self.log_bounds[1], - std=_std, - std_is_normalized=_is_normalized, - ) - - # NOTE: There's a possibility where you could use an integer distribution for - # binned domains, however the cost of sampling integers and casting is likely - # higher than just casting from normalized domain. Would need to verify this - # In any case, Normalized Uniform Float is a safe choice - - # (From Normalized) - truncnorm = TruncNormDistribution.new( - lower=self.lower, - center=math.log(center), - upper=self.upper, - std=_std, - std_is_normalized=_is_normalized, - ) - return truncnorm.normalize() - - def weighted_indices_distribution( - self, center_index: int, *, confidence: float - ) -> WeightedIntsDistribution: - from neps.search_spaces.distributions import WeightedIntsDistribution - - if self.cardinality is None: + if isinstance(frm, Sequence) and len(frm) != x.shape[1]: raise ValueError( - "Cannot create a weighted distribution for a continuous domain!" + "The number of domains in `frm` must match the number of tensors" + " if provided as a list." + f" Expected {x.shape[1]}, got {len(frm)}." ) - if not isinstance(center_index, int): + + if isinstance(to, Sequence) and len(to) != x.shape[1]: raise ValueError( - f"Center index must be an integer of type {self.dtype} to" - " create a weighted distribution!" + "The number of domains in `to` must match the number of tensors" + " if provided as a list." + f" Expected {x.shape[1]}, got {len(to)}." 
) - assert 0 <= confidence <= 1 - return WeightedIntsDistribution.with_favoured( - n=self.cardinality, - favoured=int(round(center_index)), - confidence=confidence, - ) + # If both are not a list, we can just cast the whole tensor + if not isinstance(frm, Sequence) and not isinstance(to, Sequence): + return to.cast(x, frm=frm) - @classmethod - def unit_float(cls) -> NumberDomain[float]: - return UNIT_FLOAT_DOMAIN + # Otherwise, we need to go column by column + if isinstance(frm, Domain): + frm = [frm] * x.shape[1] + if isinstance(to, Domain): + to = [to] * x.shape[1] + + buffer = torch.empty_like(x) + for i, (f, t) in enumerate(zip(frm, to)): + buffer[:, i] = t.cast(x[:, i], frm=f) + return buffer -UNIT_FLOAT_DOMAIN = NumberDomain.float(0.0, 1.0) -Domain: TypeAlias = NumberDomain +UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 3d9d2928..b4035297 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -25,7 +25,6 @@ from neps.search_spaces.domain import ( UNIT_FLOAT_DOMAIN, Domain, - NumberDomain, ) from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter @@ -48,7 +47,6 @@ def decode(self, x: T) -> list[Any]: ... class TensorTransformer(Transformer[torch.Tensor], Protocol): domain: Domain - output_cols: int def encode( self, @@ -64,15 +62,13 @@ def encode( class CategoricalToIntegerTransformer(TensorTransformer): choices: Sequence[Any] - domain: NumberDomain = field(init=False) - output_cols: int = field(init=False) + domain: Domain = field(init=False) _lookup: dict[Any, int] | None = field(init=False) def __post_init__(self): assert len(self.choices) > 0 - self.domain = NumberDomain.indices(len(self.choices)) - self.output_cols = 1 + self.domain = Domain.indices(len(self.choices)) self._lookup = None if len(self.choices) > 3: try: @@ -114,14 +110,12 @@ def decode(self, x: torch.Tensor) -> list[Any]: # and `-0.5` as lower bound with `0.5` as upper bound. 
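For reference, a small usage sketch of the `Domain.cast_many` helper introduced above; the column domains and sample values here are purely hypothetical:

    import torch
    from neps.search_spaces.domain import Domain, UNIT_FLOAT_DOMAIN

    # A batch of unit-cube samples, e.g. as drawn from a Sobol sequence.
    unit_x = torch.rand(8, 3, dtype=torch.float64)

    # Per-column targets: a log-scaled float, an integer range and categorical indices.
    targets = [Domain.float(1e-5, 1e-1, log=True), Domain.int(1, 64), Domain.indices(3)]

    # `frm` may be a single domain applied to every column, while `to` lists one per column.
    x = Domain.cast_many(unit_x, frm=UNIT_FLOAT_DOMAIN, to=targets)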
@dataclass class MinMaxNormalizer(TensorTransformer, Generic[V]): - original_domain: NumberDomain[V] + original_domain: Domain[V] - domain: NumberDomain[float] = field(init=False) - output_cols: int = field(init=False) + domain: Domain[float] = field(init=False) def __post_init__(self): self.domain = UNIT_FLOAT_DOMAIN - self.output_cols = 1 @override def encode( @@ -198,22 +192,30 @@ def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: @dataclass class TensorEncoder: transformers: dict[str, TensorTransformer] - column_lookup: dict[str, tuple[int, int]] = field(init=False) + column_lookup: dict[str, int] = field(init=False) + n_numerical: int = field(init=False) + n_categorical: int = field(init=False) def __post_init__(self): - transformers = sorted( - self.transformers.items(), key=lambda t: (t[1].output_cols, t[0]) - ) + transformers = sorted(self.transformers.items(), key=lambda t: t[0]) self.transformers = dict(transformers) - self.column_lookup: dict[str, tuple[int, int]] = {} - offset = 0 - for name, transformer in self.transformers.items(): - self.column_lookup[name] = (offset, offset + transformer.output_cols) - offset += transformer.output_cols + self.column_lookup: dict[str, int] = {} + n_numerical = 0 + n_categorical = 0 + for i, (name, transformer) in enumerate(self.transformers.items()): + self.column_lookup[name] = i + if isinstance(transformer, CategoricalToIntegerTransformer): + n_categorical += 1 + else: + n_numerical += 1 + + self.n_numerical = n_numerical + self.n_categorical = n_categorical def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: if isinstance(hp, str): - return x[:, slice(*self.column_lookup[hp])] + return x[:, self.column_lookup[hp]] + cols = torch.concatenate([torch.arange(*self.column_lookup[h]) for h in hp]) return x[:, cols] @@ -223,13 +225,12 @@ def encode( *, device: torch.device | None = None, ) -> torch.Tensor: - width = sum(t.output_cols for t in self.transformers.values()) + width = len(self.transformers) buffer = torch.empty((len(x), width), dtype=torch.float64, device=device) for hp_name, transformer in self.transformers.items(): values = [conf[hp_name] for conf in x] lookup = self.column_lookup[hp_name] - lookup = lookup[0] if lookup[1] - lookup[0] == 1 else slice(*lookup) # Encode directly into buffer transformer.encode( @@ -245,21 +246,39 @@ def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): lookup = self.column_lookup[hp_name] - if lookup[1] == lookup[0] + 1: - tensor = x[:, lookup[0]] - else: - tensor = x[:, slice(*lookup)] - + tensor = x[:, lookup] values[hp_name] = transformer.decode(tensor) keys = list(values.keys()) return [dict(zip(keys, vals)) for vals in zip(*values.values())] + def from_unit_tensor( + self, + x: torch.Tensor, + device: torch.device | None = None, + ) -> torch.Tensor: + buffer = torch.empty_like(x, dtype=torch.float64, device=device) + + for i, transformer in enumerate(self.transformers.values()): + buffer[:, i] = transformer.domain.cast(x[:, i], frm=UNIT_FLOAT_DOMAIN) + + return buffer + @dataclass class DataEncoder: tensors: TensorEncoder | None = None graphs: GraphEncoder | None = None + device: torch.device = field(default_factory=lambda: torch.device("cpu")) + + n_numerical: int = field(init=False) + n_categorical: int = field(init=False) + n_graphs: int = field(init=False) + + def __post_init__(self): + self.n_numerical = 0 if self.tensors is None else 
self.tensors.n_numerical + self.n_categorical = 0 if self.tensors is None else self.tensors.n_categorical + self.n_graphs = 0 if self.graphs is None else len(self.graphs.transformers) def encode( self, diff --git a/neps/search_spaces/hyperparameters/float.py b/neps/search_spaces/hyperparameters/float.py index 6086e3b7..f8808bfe 100644 --- a/neps/search_spaces/hyperparameters/float.py +++ b/neps/search_spaces/hyperparameters/float.py @@ -8,7 +8,7 @@ import numpy as np -from neps.search_spaces.domain import NumberDomain +from neps.search_spaces.domain import Domain from neps.search_spaces.hyperparameters.numerical import NumericalParameter if TYPE_CHECKING: @@ -71,7 +71,7 @@ def __init__( default=float(default) if default is not None else None, default_confidence=default_confidence, is_fidelity=is_fidelity, - domain=NumberDomain.float(lower, upper, log=log), + domain=Domain.float(lower, upper, log=log), ) @override diff --git a/neps/search_spaces/hyperparameters/integer.py b/neps/search_spaces/hyperparameters/integer.py index da3bbd71..b481ffc1 100644 --- a/neps/search_spaces/hyperparameters/integer.py +++ b/neps/search_spaces/hyperparameters/integer.py @@ -7,7 +7,7 @@ import numpy as np -from neps.search_spaces.domain import NumberDomain +from neps.search_spaces.domain import Domain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.numerical import NumericalParameter @@ -76,7 +76,7 @@ def __init__( is_fidelity=is_fidelity, default=int(np.rint(default)) if default is not None else None, default_confidence=default_confidence, - domain=NumberDomain.int(lower, upper, log=log), + domain=Domain.int(lower, upper, log=log), ) # We subtract/add 0.499999 from lower/upper bounds respectively, such that diff --git a/neps/search_spaces/hyperparameters/numerical.py b/neps/search_spaces/hyperparameters/numerical.py index f00b590c..8cca8309 100644 --- a/neps/search_spaces/hyperparameters/numerical.py +++ b/neps/search_spaces/hyperparameters/numerical.py @@ -32,7 +32,7 @@ from neps.search_spaces.parameter import MutatableParameter, ParameterWithPrior if TYPE_CHECKING: - from neps.search_spaces.domain import NumberDomain + from neps.search_spaces.domain import Domain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.utils.types import TruncNorm @@ -82,7 +82,7 @@ def __init__( log: bool = False, default: T | None, is_fidelity: bool, - domain: NumberDomain[T], + domain: Domain[T], default_confidence: Literal["low", "medium", "high"] = "low", ): """Initialize the numerical hyperparameter. diff --git a/neps/state/neps_state.py b/neps/state/neps_state.py index 8afaee62..163679d8 100644 --- a/neps/state/neps_state.py +++ b/neps/state/neps_state.py @@ -32,6 +32,75 @@ Loc = TypeVar("Loc") T = TypeVar("T") +def sample_trial( + neps_state, + optimizer: BaseOptimizer, + *, + worker_id: str, + _sample_hooks: list[Callable] | None = None, +) -> Trial: + """Sample a new trial from the optimizer. + + Args: + optimizer: The optimizer to sample the trial from. + worker_id: The worker that is sampling the trial. + _sample_hooks: A list of hooks to apply to the optimizer before sampling. + + Returns: + The new trial. 
+ """ + with neps_state._optimizer_state.acquire() as ( + opt_state, + put_opt, + ), neps_state._seed_state.acquire() as (seed_state, put_seed_state): + trials: dict[Trial.ID, Trial] = {} + for trial_id, shared_trial in neps_state._trials.all().items(): + trial = shared_trial.synced() + trials[trial_id] = trial + + seed_state.set_as_global_seed_state() + + # TODO: Not sure if any existing pre_load hooks required + # it to be done after `load_results`... I hope not. + if _sample_hooks is not None: + for hook in _sample_hooks: + optimizer = hook(optimizer) + + # NOTE: We don't want optimizers mutating this before serialization + budget = opt_state.budget.clone() if opt_state.budget is not None else None + sampled_config, new_opt_state = optimizer.ask( + trials=trials, + budget_info=budget, + optimizer_state=opt_state.shared_state, + ) + + if sampled_config.previous_config_id is not None: + previous_trial = trials.get(sampled_config.previous_config_id) + if previous_trial is None: + raise ValueError( + f"Previous trial '{sampled_config.previous_config_id}' not found." + ) + previous_trial_location = previous_trial.metadata.location + else: + previous_trial_location = None + + trial = Trial.new( + trial_id=sampled_config.id, + location="", # HACK: This will be set by the `TrialRepo` + config=sampled_config.config, + previous_trial=sampled_config.previous_config_id, + previous_trial_location=previous_trial_location, + time_sampled=time.time(), + worker_id=worker_id, + ) + shared_trial = neps_state._trials.put_new(trial) + seed_state.recapture() + put_seed_state(seed_state) + put_opt( + OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) + ) + + return trial @dataclass class NePSState(Generic[Loc]): @@ -71,75 +140,10 @@ def get_trials_by_ids(self, trial_ids: list[str], /) -> dict[str, Trial | None]: for _id, shared_trial in self._trials.get_by_ids(trial_ids).items() } - def sample_trial( - self, - optimizer: BaseOptimizer, - *, - worker_id: str, - _sample_hooks: list[Callable] | None = None, - ) -> Trial: - """Sample a new trial from the optimizer. - - Args: - optimizer: The optimizer to sample the trial from. - worker_id: The worker that is sampling the trial. - _sample_hooks: A list of hooks to apply to the optimizer before sampling. + def get_optimizer_instance(self) -> BaseOptimizer: + """Get the optimizer instance.""" + raise NotImplementedError - Returns: - The new trial. - """ - with self._optimizer_state.acquire() as ( - opt_state, - put_opt, - ), self._seed_state.acquire() as (seed_state, put_seed_state): - trials: dict[Trial.ID, Trial] = {} - for trial_id, shared_trial in self._trials.all().items(): - trial = shared_trial.synced() - trials[trial_id] = trial - - seed_state.set_as_global_seed_state() - - # TODO: Not sure if any existing pre_load hooks required - # it to be done after `load_results`... I hope not. - if _sample_hooks is not None: - for hook in _sample_hooks: - optimizer = hook(optimizer) - - # NOTE: We don't want optimizers mutating this before serialization - budget = opt_state.budget.clone() if opt_state.budget is not None else None - sampled_config, new_opt_state = optimizer.ask( - trials=trials, - budget_info=budget, - optimizer_state=opt_state.shared_state, - ) - - if sampled_config.previous_config_id is not None: - previous_trial = trials.get(sampled_config.previous_config_id) - if previous_trial is None: - raise ValueError( - f"Previous trial '{sampled_config.previous_config_id}' not found." 
- ) - previous_trial_location = previous_trial.metadata.location - else: - previous_trial_location = None - - trial = Trial.new( - trial_id=sampled_config.id, - location="", # HACK: This will be set by the `TrialRepo` - config=sampled_config.config, - previous_trial=sampled_config.previous_config_id, - previous_trial_location=previous_trial_location, - time_sampled=time.time(), - worker_id=worker_id, - ) - shared_trial = self._trials.put_new(trial) - seed_state.recapture() - put_seed_state(seed_state) - put_opt( - OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) - ) - - return trial def report_trial_evaluation( self, diff --git a/neps/state/optimizer.py b/neps/state/optimizer.py index bd8cbc2e..11bd3eb6 100644 --- a/neps/state/optimizer.py +++ b/neps/state/optimizer.py @@ -10,18 +10,24 @@ class BudgetInfo: """Information about the budget of an optimizer.""" - max_cost_budget: float - used_cost_budget: float + max_cost_budget: float | None = None + used_cost_budget: float = 0.0 + max_evaluations: int | None = None + used_evaluations: int = 0 @property - def remaining_cost_budget(self) -> float: + def remaining_cost_budget(self) -> float | None: """The remaining budget.""" + if self.max_cost_budget is None: + return None return self.max_cost_budget - self.used_cost_budget def clone(self) -> BudgetInfo: return BudgetInfo( max_cost_budget=self.max_cost_budget, used_cost_budget=self.used_cost_budget, + max_evaluations=self.max_evaluations, + used_evaluations=self.used_evaluations, ) diff --git a/pyproject.toml b/pyproject.toml index b5be06c2..27e49fa2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -210,6 +210,7 @@ ignore = [ "PLR2004", # No magic numbers inline "N817", # CamelCase import as (ignore for ConfigSpace) "NPY002", # Replace legacy `np.random.choice` call with `np.random.Generator` + "N803", # Arguments should start with a lower case letter. ] From 6db710ecd2ca1d62749e13815c90814be34c76b3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 18:11:43 +0200 Subject: [PATCH 13/63] refactor: Add in the priors --- neps/distributions.py | 230 +++++++++++++++++++ neps/optimizers/initial_design.py | 74 ++++++ neps/priors.py | 366 ++++++++++++++++++++++++++++++ 3 files changed, 670 insertions(+) create mode 100644 neps/distributions.py create mode 100644 neps/optimizers/initial_design.py create mode 100644 neps/priors.py diff --git a/neps/distributions.py b/neps/distributions.py new file mode 100644 index 00000000..2361e191 --- /dev/null +++ b/neps/distributions.py @@ -0,0 +1,230 @@ +"""Custom distributions for NEPS.""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from numbers import Number +from typing import TYPE_CHECKING, ClassVar, Mapping +from typing_extensions import override + +import torch +from torch.distributions import Distribution, constraints +from torch.distributions.utils import broadcast_all + +if TYPE_CHECKING: + from neps.search_spaces.architecture.cfg_variants.constrained_cfg import Constraint + from neps.search_spaces.domain import Domain + +CONST_SQRT_2 = math.sqrt(2) +CONST_INV_SQRT_2PI = 1 / math.sqrt(2 * math.pi) +CONST_INV_SQRT_2 = 1 / math.sqrt(2) +CONST_LOG_INV_SQRT_2PI = math.log(CONST_INV_SQRT_2PI) +CONST_LOG_SQRT_2PI_E = 0.5 * math.log(2 * math.pi * math.e) + +# from https://github.com/toshas/torch_truncnorm + + +class TruncatedStandardNormal(Distribution): + """Truncated Standard Normal distribution. 
+ + Source: https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + """ + + arg_constraints: ClassVar[Mapping[str, Constraint]] = { + "a": constraints.real, + "b": constraints.real, + } # type: ignore + has_rsample: ClassVar[bool] = True + eps: ClassVar[float] = 1e-6 + + def __init__( + self, + a: torch.Tensor, + b: torch.Tensor, + validate_args: bool | None = None, + device: torch.device | None = None, + ): + """Initialize a truncated standard normal distribution. + + Args: + a: Lower truncation bound. + b: Upper truncation bound. + validate_args: Whether to validate input. + device: Device to use. + """ + self.a, self.b = broadcast_all(a, b) + self.a = self.a.to(device) + self.b = self.b.to(device) + + if isinstance(a, Number) and isinstance(b, Number): + batch_shape = torch.Size() + else: + batch_shape = self.a.size() + + super().__init__(batch_shape, validate_args=validate_args) + + if self.a.dtype != self.b.dtype: + raise ValueError("Truncation bounds types are different") + + if any((self.a >= self.b).view(-1).tolist()): + raise ValueError("Incorrect truncation range") + + eps = self.eps + self._dtype_min_gt_0 = eps + self._dtype_max_lt_1 = 1 - eps + self._little_phi_a = self._little_phi(self.a) + self._little_phi_b = self._little_phi(self.b) + self._big_phi_a = self._big_phi(self.a) + self._big_phi_b = self._big_phi(self.b) + self._Z = (self._big_phi_b - self._big_phi_a).clamp(eps, 1 - eps) + self._log_Z = self._Z.log() + little_phi_coeff_a = torch.nan_to_num(self.a, nan=math.nan) + little_phi_coeff_b = torch.nan_to_num(self.b, nan=math.nan) + self._lpbb_m_lpaa_d_Z = ( + self._little_phi_b * little_phi_coeff_b + - self._little_phi_a * little_phi_coeff_a + ) / self._Z + self._mean = -(self._little_phi_b - self._little_phi_a) / self._Z + self._variance = ( + 1 + - self._lpbb_m_lpaa_d_Z + - ((self._little_phi_b - self._little_phi_a) / self._Z) ** 2 + ) + self._entropy = CONST_LOG_SQRT_2PI_E + self._log_Z - 0.5 * self._lpbb_m_lpaa_d_Z + + @constraints.dependent_property + @override + def support(self) -> constraints._Interval: + return constraints.interval(self.a, self.b) + + @property + @override + def mean(self) -> torch.Tensor: + return self._mean + + @property + @override + def variance(self) -> torch.Tensor: + return self._variance + + @override + def entropy(self) -> torch.Tensor: + return self._entropy + + @staticmethod + def _little_phi(x: torch.Tensor) -> torch.Tensor: + return (-(x**2) * 0.5).exp() * CONST_INV_SQRT_2PI + + def _big_phi(self, x: torch.Tensor) -> torch.Tensor: + phi = 0.5 * (1 + (x * CONST_INV_SQRT_2).erf()) + return phi.clamp(self.eps, 1 - self.eps) + + @staticmethod + def _inv_big_phi(x: torch.Tensor) -> torch.Tensor: + return CONST_SQRT_2 * (2 * x - 1).erfinv() + + @override + def cdf(self, value: torch.Tensor) -> torch.Tensor: + if self._validate_args: + self._validate_sample(value) + return ((self._big_phi(value) - self._big_phi_a) / self._Z).clamp(0, 1) + + @override + def icdf(self, value: torch.Tensor) -> torch.Tensor: + y = self._big_phi_a + value * self._Z + y = y.clamp(self.eps, 1 - self.eps) + return self._inv_big_phi(y) + + @override + def log_prob(self, value: torch.Tensor) -> torch.Tensor: + if self._validate_args: + self._validate_sample(value) + return CONST_LOG_INV_SQRT_2PI - self._log_Z - (value**2) * 0.5 + + @override + def rsample(self, sample_shape: torch.Size | None = None) -> torch.Tensor: + if sample_shape is None: + sample_shape = torch.Size([]) + shape = self._extended_shape(sample_shape) + p = torch.empty(shape, 
device=self.a.device).uniform_( + self._dtype_min_gt_0, self._dtype_max_lt_1 + ) + return self.icdf(p) + + +class TruncatedNormal(TruncatedStandardNormal): + """Truncated Normal distribution. + + https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + """ + + def __init__( + self, + loc: float | torch.Tensor, + scale: float | torch.Tensor, + a: float | torch.Tensor, + b: float | torch.Tensor, + validate_args: bool | None = None, + device: torch.device | None = None, + ): + """Initialize a truncated standard normal distribution. + + Args: + loc: The mean of the distribution. + scale: The std of the distribution. + a: The lower bound of the distribution. + b: The upper bound of the distribution. + validate_args: Whether to validate input. + device: Device to use. + """ + scale = torch.as_tensor(scale, device=device) + scale = scale.clamp_min(self.eps) + + self.loc, self.scale, a, b = broadcast_all(loc, scale, a, b) + a = a.to(device) # type: ignore + b = b.to(device) # type: ignore + self._non_std_a = a + self._non_std_b = b + a = (a - self.loc) / self.scale + b = (b - self.loc) / self.scale + super().__init__(a, b, validate_args=validate_args) # type: ignore + self._log_scale = self.scale.log() + self._mean = self._mean * self.scale + self.loc + self._variance = self._variance * self.scale**2 + self._entropy += self._log_scale + + def _to_std_rv(self, value): + return (value - self.loc) / self.scale + + def _from_std_rv(self, value): + return value * self.scale + self.loc + + @override + def cdf(self, value): + return super().cdf(self._to_std_rv(value)) + + @override + def icdf(self, value): + sample = self._from_std_rv(super().icdf(value)) + + # clamp data but keep gradients + sample_clip = torch.stack( + [sample.detach(), self._non_std_a.detach().expand_as(sample)], 0 + ).max(0)[0] + sample_clip = torch.stack( + [sample_clip, self._non_std_b.detach().expand_as(sample)], 0 + ).min(0)[0] + sample.data.copy_(sample_clip) + return sample + + @override + def log_prob(self, value): + value = self._to_std_rv(value) + return super().log_prob(value) - self._log_scale + + +@dataclass +class DistributionOverDomain: + distribution: Distribution + domain: Domain diff --git a/neps/optimizers/initial_design.py b/neps/optimizers/initial_design.py new file mode 100644 index 00000000..05553ab1 --- /dev/null +++ b/neps/optimizers/initial_design.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Protocol + +import torch + +if TYPE_CHECKING: + from neps.search_spaces.encoding import DataEncoder + + +class InitialDesign(Protocol): + def sample(self, n: int) -> list[dict[str, Any]]: ... + + +@dataclass +class Sobol(InitialDesign): + seed: int + """The seed for the Sobol sequence.""" + + encoder: DataEncoder + """The encoding used to encode the samples.""" + + scramble: bool = True + """Whether to scramble the Sobol sequence.""" + + buffer_sample_multiplier: int = 2 + """How many samples to generate in the buffer before checking for uniqueness.""" + + allow_undersampling: bool = False + """If True, will allow undersampling if we can't generate `n` unique samples.""" + + def sample(self, n: int) -> list[dict[str, Any]]: + """Sample `n` points from the Sobol sequence. + + !!! warning + + If `self.allow_undersampling` is False, this method will raise a ValueError if + it cannot generate `n` unique samples. + + Args: + n: The number of points to sample. 
+ + Returns: + A list of `n` points sampled from the Sobol sequence. + """ + assert self.encoder.tensors is not None + + if self.encoder.has_graphs(): + # TODO: Won't work on graphs + raise NotImplementedError("Graphs are not yet supported.") + + if self.encoder.n_numerical == 0 and self.encoder.n_categorical > 0: + # TODO: We need to do something else if we have only categoricals + # as we are going to get a lot of duplicates + raise NotImplementedError("Only categorical variables are not yet supported.") + + ndim = self.encoder.n_numerical + self.encoder.n_categorical + sobol = torch.quasirandom.SobolEngine(dimension=ndim, scramble=True, seed=5) + + SAMPLE_SIZE = self.buffer_sample_multiplier * n + unit_x = sobol.draw(SAMPLE_SIZE, dtype=torch.float64) + + x = self.encoder.tensors.from_unit_tensor(unit_x) + + # NOTE: We have to check uniqueness after conversion from unit cube space + # as we could have multiple unit floats mapping to the same categories or integers + unique_x = torch.unique(x, dim=0) + if len(unique_x) < n and not self.allow_undersampling: + raise ValueError( + f"Could not generate {n} unique samples, got {len(unique_x)}\n{self=}" + ) + + return self.encoder.decode_dicts(unique_x[:n]) diff --git a/neps/priors.py b/neps/priors.py new file mode 100644 index 00000000..471718f0 --- /dev/null +++ b/neps/priors.py @@ -0,0 +1,366 @@ +"""Priors for search spaces. + +Loosely speaking, they are joint distributions over multiple independent +variables, i.e. each column of a tensor is assumed to be independent and +can be acted on independently. + +They are not a `torch.distributions.Distribution` subclass as methods like +`entropy` and `kl_divergence` are just more difficult to implement +(not impossible, just more difficult and not needed right now). + +See the class doc description of [`Prior`][neps.priors.Prior] for more details. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, Container, Mapping, Protocol +from typing_extensions import override + +import torch + +from neps.distributions import DistributionOverDomain, TruncatedNormal +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain + +if TYPE_CHECKING: + from torch.distributions import Distribution + + +class Prior(Protocol): + """A protocol for priors over search spaces. + + At it's core, the two methods that need to be implemented are + `log_prob` and `sample`. The `log_prob` method should return the + log probability of a given tensor of samples under its distribution. + The `sample` method should return a tensor of samples from distribution. + + All values given to the `log_prob` and the ones returned from the + `sample` method are assumed to be in the value domain of the prior, + i.e. the [`.domains`][neps.priors.Prior] attribute. + + !!! warning + + The domain in which samples are actually drawn from not necessarily + need to match that of the value domain. For example, the + [`UniformPrior`][neps.priors.UniformPrior] class uses a unit uniform + distribution to sample from the unit interval before converting + samples to the value domain. + + **As a result, the `log_prob` and `prob` method may not give the same + values as you might expect for a distribution over the value domain.** + + For example, consider a value domain `[0, 1e9]`. You might expect + the `pdf` to be `1e-9` (1 / 1e9) for any given value inside the domain. 
+ However, since the `UniformPrior` samples from the unit interval, the `pdf` will + actually be `1` (1 / 1) for any value inside the domain. + """ + + domains: list[Domain] + """Domain of values which this prior acts upon. + + Each domain corresponds to the corresponding `ndim` in a tensor + (n_samples, ndim). + """ + + device: torch.device | None + """Device to place the tensors on.""" + + def log_prob(self, x: torch.Tensor) -> torch.Tensor: + """Compute the log probability of values in `x` under a prior. + + All columns of `x` are assumed to be independent, such that the + log probability of the entire tensor is the sum of the log + probabilities of each column. + + Args: + x: Tensor of shape (n_samples, n_dims) + In the case of a 1D tensor, the shape is assumed to be (n_dims,) + + Returns: + Tensor of shape (n_samples,) with the log probabilities of each. In the + case that only single dimensional tensor is passed, the returns value + is a scalar. + """ + ... + + def sample(self, n: int) -> torch.Tensor: + """Sample from the prior. + + Args: + n: Number of samples to draw. + + Returns: + Tensor of shape (n, n_dims) with the samples. + """ + ... + + def prob(self, x: torch.Tensor) -> torch.Tensor: + """Compute the probability of values in `x` under a prior. + + See [`log_prob()`][neps.priors.Prior.log_prob] for details on shapes. + """ + return torch.exp(self.log_prob(x)) + + @classmethod + def uniform( + cls, + domains: Mapping[str, Domain] | list[Domain], + *, + device: torch.device | None = None, + ) -> UniformPrior: + """Create a uniform prior for a given list of domains. + + Args: + domains: domains over which to have a uniform prior. + device: Device to place the tensors on. + """ + domains = domains if isinstance(domains, list) else list(domains.values()) + return UniformPrior(domains=domains, device=device) + + @classmethod + def make_centered( # noqa: C901 + cls, + domains: Mapping[str, Domain], + centers: Mapping[str, tuple[Any, float]], + *, + categoricals: Container[str] = (), + device: torch.device | None = None, + ) -> CenteredPrior: + """Create a prior for a given list of domains. + + Will use a `TruncatedNormal` distribution for all parameters, + except those contained within `categoricals`, which will + use a `Categorical` instead. If no center is given for a domain, + a uniform prior will be used. + + For non-categoricals, this will be interpreted as the mean and + std `(1 - confidence)` for a truncnorm. For categorical values, + the _center_ will contain a probability mass of `confidence` with + the remaining `(1 - confidence)` probability mass distributed uniformly + amongest the other choices. + + The order of the items in `domains` matters and should align + with any tensors that you will use to evaluate from the prior. + I.e. the first domain in `domains` will be the first column + of a tensor that this prior can be used on. + + Args: + domains: domains over which to have a centered prior. + centers: centers for the priors. Should be a mapping + from the domain name to the center value and confidence level. + If no center is given, a uniform prior will be used. + + !!! warning + + The values contained in centers should be contained within the + domain. All confidence levels should be within the `[0, 1]` range. + + categoricals: The names of the domains that are categorical and which + a `Categorical` distribution will be used, rather than a + `TruncatedNormal`. + + !!! warning + + Categoricals require that the corresponding domain has a + `.cardinality`, i.e. 
it is not a float/continuous domain. + + device: Device to place the tensors on. + + + Returns: + A prior for the search space. + """ + for name, (_, confidence) in centers.items(): + if not 0 <= confidence <= 1: + raise ValueError( + f"Confidence level for {name} must be in the range [0, 1]." + f" Got {confidence}." + ) + + for name in domains: + if name not in centers: + raise ValueError( + f"Center for {name} is missing. " + f"Please provide a center for all domains." + ) + + distributions: list[DistributionOverDomain] = [] + for name, domain in domains.items(): + center_confidence = centers.get(name) + if center_confidence is None: + dist = DistributionOverDomain( + distribution=torch.distributions.Uniform(domain.lower, domain.upper), + domain=domain, + ) + continue + + center, confidence = center_confidence + if name in categoricals: + if domain.cardinality is None: + raise ValueError( + f"{name} is not a finite domain and cannot be used as a" + " categorical. Please remove it from the categoricals list." + ) + + if not isinstance(center, int): + raise ValueError( + f"{name} is a categorical domain and should have an integer" + f" center. Got {center} of type {type(center)}." + ) + + remaining_weight = 1 - confidence + distributed_weight = remaining_weight / (domain.cardinality - 1) + weights = torch.full( + (domain.cardinality,), + distributed_weight, + device=device, + dtype=torch.float64, + ) + + weights[center] = confidence + + dist = DistributionOverDomain( + distribution=torch.distributions.Categorical(probs=weights), + domain=domain, + ) + distributions.append(dist) + continue + + # We place a truncnorm over a unitnorm + if domain.log_bounds is not None: + domain.to_unit(torch.tensor(center, device=device, dtype=torch.float64)) + torch.tensor(1 - confidence, device=device, dtype=torch.float64) + + dist = DistributionOverDomain( + distribution=TruncatedNormal( + loc=center, + scale=(1 - confidence), + a=domain.lower, + b=domain.upper, + device=device, + ), + domain=UNIT_FLOAT_DOMAIN, + ) + distributions.append(dist) + + return CenteredPrior( + domains=list(domains.values()), distributions=distributions, device=device + ) + + +@dataclass +class CenteredPrior(Prior): + """A prior that is centered around a given value with a given confidence. + + This prior is useful for creating priors for search spaces where the + values are centered around a given value with a given confidence level. + + You can use a `torch.distribution.Uniform` for any values which do + not have a center and confidence level, i.e. no prior information. + + You can create this class more easily using + [`Prior.make_centered()`][neps.priors.Prior.make_centered]. + """ + + domains: list[Domain] + """Domain of values.""" + + device: torch.device | None + """Device to place the tensors on.""" + + distributions: list[DistributionOverDomain] + """Distributions along with the corresponding domains they sample from.""" + + _distribution_domains: list[Domain] = field(init=False, repr=False) + + def __post_init__(self): + self._distribution_domains = [dist.domain for dist in self.distributions] + + @override + def log_prob(self, x: torch.Tensor) -> torch.Tensor: + # Cast all values from the value domains to the domain of the sampler. + sample_domain_tensor = Domain.cast_many( + x, frm=self.domains, to=self._distribution_domains + ) + + # Calculate the log probabilities of the sample domain tensors under their + # respective distributions. 
+ log_probs = torch.cat( + [ + dist.distribution.log_prob(sample_domain_tensor[:, i]) + for i, dist in enumerate(self.distributions) + ], + dim=1, + ) + return torch.sum(log_probs, dim=1) + + @override + def sample(self, n: int) -> torch.Tensor: + buffer = torch.empty( + n, + len(self.distributions), + device=self.device, + dtype=torch.float64, + ) + + size = torch.Size((n,)) + for i, (value_domain, frm) in enumerate(zip(self.domains, self.distributions)): + samples = frm.distribution.sample(size) + buffer[:, i] = value_domain.cast(samples, frm=frm.domain) + + return buffer + + +@dataclass +class UniformPrior(Prior): + """A prior that is uniform over a given domain. + + Uses a UnitUniform under the hood before converting to the value domain. + """ + + domains: list[Domain] + """Domain of values.""" + + device: torch.device | None + """Device to place the tensors on.""" + + _unit_uniform: Distribution = field(init=False, repr=False) + + def __post_init__(self): + self._unit_uniform = torch.distributions.Uniform(0.0, 1.0) + + def log_prob(self, x: torch.Tensor) -> torch.Tensor: + """Compute the log probability of values in `x` under a prior. + + All columns of `x` are assumed to be independent, such that the + log probability of the entire tensor is the sum of the log + probabilities of each column. + + Args: + x: Tensor of shape (n_samples, n_dims) + In the case of a 1D tensor, the shape is assumed to be (n_dims,) + + Returns: + Tensor of shape (n_samples,) with the log probabilities of each. In the + case that only single dimensional tensor is passed, the returns value + is a scalar. + """ + sample_domain_tensor = Domain.cast_many(x, frm=self.domains, to=UNIT_FLOAT_DOMAIN) + return torch.sum(self._unit_uniform.log_prob(sample_domain_tensor), dim=1) + + def sample(self, n: int) -> torch.Tensor: + """Sample from the prior. + + Args: + n: Number of samples to draw. + + Returns: + Tensor of shape (n, n_dims) with the samples. + """ + samples = torch.rand( + n, + len(self.domains), + device=self.device, + dtype=torch.float64, + ) + return Domain.cast_many(samples, frm=UNIT_FLOAT_DOMAIN, to=self.domains) From a013a13ab99efd0ab65945438e7c4d4f088a7794 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 18:54:59 +0200 Subject: [PATCH 14/63] refactor: Clean up the BO --- .../acquisition_functions/prior_weighted.py | 2 +- .../bayesian_optimization/optimizer.py | 138 ++++++++++++------ neps/optimizers/initial_design.py | 8 +- neps/search_spaces/encoding.py | 20 +-- 4 files changed, 113 insertions(+), 55 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py index 7b0d4318..8a735d58 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py @@ -26,7 +26,7 @@ def __init__( acq_fn: MCAcquisitionFunction, prior: Prior, beta: float, - n: int, + n: float, ): """Initialize the acquisition function. 
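To make the intended use of these priors concrete, here is a minimal sketch of building and querying a centered prior; the hyperparameter names, centers and confidence values are hypothetical:

    from neps.priors import Prior
    from neps.search_spaces.domain import Domain

    # A two-dimensional space: a log-scaled learning rate and a categorical with 3 choices.
    domains = {"lr": Domain.float(1e-5, 1e-1, log=True), "opt": Domain.indices(3)}

    # Centers are (value, confidence) pairs; the categorical center is a choice index.
    prior = Prior.make_centered(
        domains=domains,
        centers={"lr": (1e-3, 0.75), "opt": (0, 0.5)},
        categoricals={"opt"},
    )

    x = prior.sample(16)      # (16, 2) tensor in the value domains
    lp = prior.log_prob(x)    # (16,) log densities, treating columns as independent

In the Bayesian optimization changes below, such a prior is combined with the acquisition function through `PiboAcquisition`, which, following the PiBO scheme referenced there, decays the prior's influence as more evaluations are used.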
diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index a08578d6..5a1314db 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -18,7 +18,9 @@ optimize_acq, ) from neps.optimizers.initial_design import Sobol +from neps.priors import Prior from neps.search_spaces.encoding import DataEncoder +from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: from botorch.models.model import Model @@ -26,6 +28,7 @@ from neps.search_spaces import ( SearchSpace, ) + from neps.search_spaces.domain import Domain from neps.search_spaces.encoding import DataPack from neps.state import BudgetInfo, Trial @@ -58,6 +61,11 @@ def __init__( ValueError: if initial_design_size < 1 ValueError: if no kernel is provided """ + if any(pipeline_space.graphs): + raise ValueError( + "BayesianOptimization currently only supports flat search spaces" + ) + if initial_design_size is None: N = len(pipeline_space.hyperparameters) initial_design_size = int(max(1, math.log(N) ** 2)) @@ -68,23 +76,61 @@ def __init__( super().__init__(pipeline_space=pipeline_space) - self.use_priors = use_priors - - # TODO: This needs to be moved to the search space class, however to not break - # the current prior based APIs, we will create this manually here + self.encoder = DataEncoder.default_encoder( + pipeline_space, + include_fidelities=False, + ) + # We should only be acting on tensor'able hyperparameters for now + assert self.encoder.tensors is not None + + # TODO: This needs to be moved to the search space class, however + # to not break the current prior based APIs used elsewhere, we can + # just manually create this here. + # We use confidence here where `0` means no confidence and `1` means + # absolute confidence. 
This gets translated in to std's and weights + # accordingly in a `CenteredPrior` + self.prior: Prior | None = None if use_priors: - self._prior_confidences = {} + _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} + + domains: dict[str, Domain] = {} + centers: dict[str, tuple[Any, float]] = {} + categoricals: set[str] = set() + for name in self.encoder.tensors.names(): + hp = self.pipeline_space.hyperparameters[name] + domains[name] = hp.domain # type: ignore + + if isinstance(hp, CategoricalParameter): + categoricals.add(name) + + if hp.default is None: + continue + + confidence_score: float = hp.default_confidence_choice # type: ignore + if isinstance(hp, CategoricalParameter): + center = hp._default_index + else: + center = hp.default + + centers[name] = (center, confidence_score) + + # Uses truncnorms for numerical and weighted choices categoricals + self.prior = Prior.make_centered( + domains=domains, + centers=centers, + categoricals=categoricals, + ) + else: + self.prior = None self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - if surrogate_model == "gp": self._get_fitted_model = default_single_obj_gp else: self._get_fitted_model = surrogate_model - self.encoder_: DataEncoder | None = None self.initial_design_: list[dict[str, Any]] | None = None def ask( @@ -101,41 +147,38 @@ def ask( if t.report is not None and t.report.loss is not None ] x_configs = [t.config for t in completed] - y: torch.Tensor = torch.as_tensor( + y = torch.as_tensor( [t.report.loss for t in completed], dtype=torch.float64, + device=self.device, ) # type: ignore - # We only do single objective for now but may as well include this - # for when we have MO if y.ndim == 1: y = y.unsqueeze(1) pending = [t.config for t in trials.values() if t.state.pending()] - if self.encoder_ is None: - self.encoder_ = DataEncoder.default_encoder( - self.pipeline_space, - include_fidelities=False, - ) space = self.pipeline_space + config_id = str(len(trials) + 1) + assert self.encoder.tensors is not None + # Fill intitial design data if we don't have any... if self.initial_design_ is None: size = self.n_initial_design self.initial_design_ = [] + # Add the default configuration first (maybe) if self.sample_default_first: config = space.sample_default_configuration() self.initial_design_.append(config.hp_values()) - assert self.encoder_.tensors is not None - sobol = Sobol(seed=0, encoder=self.encoder_, allow_undersampling=True) + # Fill remaining with Sobol sequence samples + sobol = Sobol(seed=0, encoder=self.encoder, allow_undersampling=True) sobol_configs = sobol.sample(size - len(self.initial_design_)) self.initial_design_.extend(sobol_configs) - else: - self.initial_design_ = [] - config_id = str(len(trials) + 1) + # If we havn't passed the intial design phase, just return + # the next one. if len(trials) < len(self.initial_design_): config = self.initial_design_[len(trials)] return ( @@ -143,53 +186,66 @@ def ask( optimizer_state, ) - assert self.encoder_ is not None - x = self.encoder_.encode(x_configs, device=self.device) + # Now we actually do the BO loop, start by encoding the data + x = self.encoder.encode(x_configs, device=self.device) if any(pending): - x_pending = self.encoder_.encode(pending, device=self.device) + x_pending = self.encoder.encode(pending, device=self.device) x_pending = x_pending.tensor assert x_pending is not None else: x_pending = None + # Get our fitted model model = self._get_fitted_model(x, y) + # Build our acquisition function. 
This takes care of pending + # configs through x_pending. + # TODO: We should evaluate whether LogNoisyEI is better than LogEI acq = qLogExpectedImprovement( model, best_f=y.min(), X_pending=x_pending, + # Unfortunatly, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 objective=LinearMCObjective(weights=torch.tensor([-1.0])), ) - if self.use_priors: - # From the PIBO paper (Section 4.1) - # https://arxiv.org/pdf/2204.11051 + # If we have a prior, then we use it with PiBO + if self.prior: if budget_info.max_evaluations is not None: - beta = budget_info.max_evaluations / 10 + # From the PIBO paper (Section 4.1) + # https://arxiv.org/pdf/2204.11051 n = budget_info.used_evaluations + beta = budget_info.max_evaluations / 10 + elif budget_info.max_cost_budget is not None: # This might not work well if cost number is high # early on, but it will start to normalize. - beta = budget_info.max_cost_budget / 10 n = budget_info.used_cost_budget + beta = budget_info.max_cost_budget / 10 + + else: + # Otherwise, just some random heuristic based on the number + # of trials and dimensionality of the search space + # TODO: Think about and evaluate this more. + ndim = x.tensor.shape[1] # type: ignore + n = len(x_configs) + beta = ndim**2 / 10 - acq = PiboAcquisition(acq, n=n, beta=beta) + acq = PiboAcquisition(acq, prior=self.prior, n=n, beta=beta) + # Optimize it candidates, _eis = optimize_acq( - # TODO: We should evaluate whether LogNoisyEI is better than LogEI - acq_fn=qLogExpectedImprovement( - model, - best_f=y.min(), - X_pending=x_pending, - # Unfortunatly, there's no option to indicate that we minimize - # the AcqFunction so we need to do some kind of transformation. - # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 - objective=LinearMCObjective(weights=torch.tensor([-1.0])), - ), - encoder=self.encoder_, + acq_fn=acq, + encoder=self.encoder, acq_options={}, # options to underlying optim function of botorch ) - config = self.encoder_.decode_dicts(candidates)[0] + + # Take the first (and only?) 
candidate + assert len(candidates) == 1 + config = self.encoder.decode_dicts(candidates)[0] + return ( SampledConfig(id=config_id, config=config, previous_config_id=None), optimizer_state, diff --git a/neps/optimizers/initial_design.py b/neps/optimizers/initial_design.py index 05553ab1..e80dfe33 100644 --- a/neps/optimizers/initial_design.py +++ b/neps/optimizers/initial_design.py @@ -5,6 +5,8 @@ import torch +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain + if TYPE_CHECKING: from neps.search_spaces.encoding import DataEncoder @@ -61,7 +63,11 @@ def sample(self, n: int) -> list[dict[str, Any]]: SAMPLE_SIZE = self.buffer_sample_multiplier * n unit_x = sobol.draw(SAMPLE_SIZE, dtype=torch.float64) - x = self.encoder.tensors.from_unit_tensor(unit_x) + x = Domain.cast_many( + unit_x, + to=list(self.encoder.tensors.domains().values()), + frm=UNIT_FLOAT_DOMAIN, + ) # NOTE: We have to check uniqueness after conversion from unit cube space # as we could have multiple unit floats mapping to the same categories or integers diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index b4035297..21d7acd4 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -212,6 +212,14 @@ def __post_init__(self): self.n_numerical = n_numerical self.n_categorical = n_categorical + def domains(self) -> dict[str, Domain]: + return { + name: transformer.domain for name, transformer in self.transformers.items() + } + + def names(self) -> list[str]: + return list(self.transformers.keys()) + def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: if isinstance(hp, str): return x[:, self.column_lookup[hp]] @@ -252,18 +260,6 @@ def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: keys = list(values.keys()) return [dict(zip(keys, vals)) for vals in zip(*values.values())] - def from_unit_tensor( - self, - x: torch.Tensor, - device: torch.device | None = None, - ) -> torch.Tensor: - buffer = torch.empty_like(x, dtype=torch.float64, device=device) - - for i, transformer in enumerate(self.transformers.values()): - buffer[:, i] = transformer.domain.cast(x[:, i], frm=UNIT_FLOAT_DOMAIN) - - return buffer - @dataclass class DataEncoder: From d4a11a2192efad6ba3e66e5779a1e38c926aeeda Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 19:00:17 +0200 Subject: [PATCH 15/63] refactor: Delete unused files --- .../acquisition_functions/aq_functions.py | 88 ------ .../acquisition_sampler_2/__init__.py | 0 .../acquisition_sampler_2/aq_samplers.py | 22 -- .../acquisition_sampler_2/mutation_sampler.py | 163 ----------- .../acquisition_sampler_2/random_sampler.py | 15 -- .../bayesian_optimization/cost_cooling.py | 252 ------------------ 6 files changed, 540 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py delete mode 100644 neps/optimizers/bayesian_optimization/cost_cooling.py diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py b/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py deleted file mode 100644 index 70b6b4e6..00000000 --- 
a/neps/optimizers/bayesian_optimization/acquisition_functions/aq_functions.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import annotations - -import math - -import torch - - -def ei( - mu: torch.Tensor, - cov: torch.Tensor, - optimum: float | torch.Tensor, - *, - augmented_ei_regularizer: float | None = None, # 0.01 - xi: float = 0.0, - log_ei: bool = False, - log_ei_epsilon: float = 1e-6, -) -> torch.Tensor: - improvement = optimum - mu - xi - - sigma_sq = torch.diag(cov) - sigma = torch.sqrt(sigma_sq) - - Z = improvement / sigma - - # If we calculate it ourselves, we save some computation as mu = 0 - # and sigma = 1 cancel a few terms out - # https://en.wikipedia.org/wiki/Normal_distribution - Z_cdf = 0.5 * (1 + torch.erf(Z / math.sqrt(2))) - Z_pdf = 1 / (math.sqrt(2 * math.pi)) * torch.exp(-0.5 * Z**2) - ei = improvement * Z_cdf + sigma * Z_pdf - - if augmented_ei_regularizer is not None: - regularization_term = 1 + sigma_sq / augmented_ei_regularizer - ei = ei / regularization_term - - if log_ei: - ei = torch.log(ei + log_ei_epsilon) - - return ei - - -def acq_by_confidence( - mu: torch.Tensor, - cov: torch.Tensor, - *, - confidence_scale: float = 1.0, -) -> torch.Tensor: - # Assumes we are trying to minimize our objective but - # this acquisition function will be maximized, i.e. optimize - # this function to find the point which is most likely to be - # the minimum of the objective. - - # **** - # * / \** - # ***** / \- **** - # * / \ *** - # * / \ | * *** - # ---/ \ | +** - # -/ \ | / \ - # \|/ --- - # - <- lcb = mu - c * sigma - # ______________________________ - lcb = mu - confidence_scale * torch.sqrt(torch.diag(cov)) - - return -lcb # Negate to make maximization - - -def weight_by_cost( - acquisition_scores: torch.Tensor, -) -> torch.Tensor: - # Assumes we are trying to minimize our objective but - # this acquisition function will be maximized, i.e. optimize - # this function to find the point which is most likely to be - # the minimum of the objective. - - # **** - # * / \** - # ***** / \- **** - # * / \ *** - # * / \ | * *** - # ---/ \ | +** - # -/ \ | / \ - # \|/ --- - # - <- lcb = mu - c * sigma - # ______________________________ - lcb = mu - cost_scale * torch.sqrt(torch.diag(cov)) - - return -lcb # Negate to make maximization diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py deleted file mode 100644 index f799252b..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/aq_samplers.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import torch - - from neps.search_spaces import SearchSpace - - -def random_sample(search_space: SearchSpace, *, seed: torch.Generator) -> SearchSpace: - """Sample a random value from a search space. - - Args: - search_space: The search space to sample from. - user_priors: Whether to sample from user priors. - seed: The seed to use for sampling. - - Returns: - A search space with a sampled value. 
- """ - return search_space.sample_value(user_priors=user_priors) diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py deleted file mode 100644 index 972ad6c3..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/mutation_sampler.py +++ /dev/null @@ -1,163 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Callable, Sequence - -import numpy as np -import torch -from more_itertools import first -from typing_extensions import override - -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.random_sampler import ( - RandomSampler, -) - -if TYPE_CHECKING: - from neps.search_spaces.search_space import SearchSpace - - -def _propose_location( - acquisition_function: Callable, - candidates: list[SearchSpace], - top_n: int = 5, - return_distinct: bool = True, -) -> tuple[list[SearchSpace], np.ndarray | torch.Tensor, np.ndarray]: - """top_n: return the top n candidates wrt the acquisition function.""" - if return_distinct: - eis = acquisition_function(candidates, asscalar=True) # faster - eis_, unique_idx = np.unique(eis, return_index=True) - try: - i = np.argpartition(eis_, -top_n)[-top_n:] - indices = np.array([unique_idx[j] for j in i]) - except ValueError: - eis = torch.tensor([acquisition_function(c) for c in candidates]) - _, indices = eis.topk(top_n) - else: - eis = torch.tensor([acquisition_function(c) for c in candidates]) - _, indices = eis.topk(top_n) - - xs = [candidates[int(i)] for i in indices] - return xs, eis, indices - - -class MutationSampler(AcquisitionSampler): - def __init__( - self, - pipeline_space, - pool_size: int = 250, - n_best: int = 10, - mutate_size: float | int = 0.5, - allow_isomorphism: bool = False, - check_isomorphism_history: bool = True, - patience: int = 50, - ): - super().__init__(pipeline_space=pipeline_space, patience=patience) - self.pool_size = pool_size - self.n_best = n_best - self.mutate_size = mutate_size - if isinstance(mutate_size, int): - assert ( - pool_size >= mutate_size - ), " pool_size must be larger or equal to mutate_size" - - self.allow_isomorphism = allow_isomorphism - self.check_isomorphism_history = ( - check_isomorphism_history # check for isomorphisms also in previous graphs - ) - self.random_sampling = RandomSampler( - pipeline_space=pipeline_space, patience=patience - ) - - @override - def set_state( - self, x: list[SearchSpace], y: Sequence[float] | np.ndarray | torch.Tensor - ) -> None: - super().set_state(x, y) - self.random_sampling.set_state(x, y) - - @override - def sample(self, acquisition_function: Callable) -> SearchSpace: - return first(self.sample_batch(acquisition_function, batch=1)) - - @override - def sample_batch( - self, - acquisition_function: Callable, - batch: int, - ) -> list[SearchSpace]: - pool = self.create_pool( - x=self.x, - y=self.y, - acquisition_function=acquisition_function, - pool_size=self.pool_size, - ) - - samples, _, _ = _propose_location( - acquisition_function=acquisition_function, - top_n=batch, - candidates=pool, - ) - return samples - - def create_pool( - self, - x: list[SearchSpace], - y: Sequence[float] | np.ndarray | torch.Tensor, - acquisition_function: Callable, - pool_size: int, - ) -> list[SearchSpace]: - if len(x) == 0: - return self.random_sampling.sample_batch(acquisition_function, pool_size) 
- - if isinstance(self.mutate_size, int): - mutate_size = self.mutate_size - else: - mutate_size = int(self.mutate_size * pool_size) - - n_best = len(x) if len(x) < self.n_best else self.n_best - best_configs = [x for (_, x) in sorted(zip(y, x), key=lambda pair: pair[0])][ - :n_best - ] - - seen: set[int] = set() - - def _hash(_config: SearchSpace) -> int: - return hash(_config.hp_values().values()) - - evaluation_pool = [] - per_arch = mutate_size // n_best - - for config in best_configs: - remaining_patience = self.patience - for _ in range(per_arch): - while remaining_patience: - try: - # needs to throw an Exception if config is not valid, e.g., empty graph etc.! - child = config.mutate() - except Exception: - remaining_patience -= 1 - continue - hash_child = _hash(child) - - if not self.allow_isomorphism: - # if disallow isomorphism, we enforce that each time, we mutate n distinct graphs. - # For now we do not check the isomorphism in all of the previous graphs though - if child == config or hash_child in seen: - remaining_patience -= 1 - continue - - evaluation_pool.append(child) - seen.add(hash_child) - break - - # Fill missing pool with random samples - nrandom_archs = max(pool_size - len(evaluation_pool), 0) - if nrandom_archs: - random_evaluation_pool = self.random_sampling.sample_batch( - acquisition_function, nrandom_archs - ) - evaluation_pool += random_evaluation_pool - - return evaluation_pool diff --git a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py deleted file mode 100644 index f7a4da76..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_sampler_2/random_sampler.py +++ /dev/null @@ -1,15 +0,0 @@ -from __future__ import annotations - -import torch -from neps.search_spaces import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) - - -class RandomSampler(AcquisitionSampler): - - def sample(self, n: int, space: SearchSpace) -> torch.Tensor: - return self.pipeline_space.sample( - patience=self.patience, user_priors=False, ignore_fidelity=False - ) diff --git a/neps/optimizers/bayesian_optimization/cost_cooling.py b/neps/optimizers/bayesian_optimization/cost_cooling.py deleted file mode 100644 index eb3ee28e..00000000 --- a/neps/optimizers/bayesian_optimization/cost_cooling.py +++ /dev/null @@ -1,252 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any -from typing_extensions import override - -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( - CostCooler, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, -) -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping -from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization -from neps.utils.common import instance_from_map - -if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) - from neps.search_spaces.search_space import SearchSpace - from neps.state.optimizer import BudgetInfo - from neps.utils.types import ConfigResult - - -class CostCooling(BayesianOptimization): - 
"""Implements a basic cost-cooling as described in - "Cost-aware Bayesian Optimization" (https://arxiv.org/abs/2003.10870) by Lee et al. - """ - - def __init__( - self, - pipeline_space: SearchSpace, - initial_design_size: int = 10, - surrogate_model: str | Any = "gp", - cost_model: str | Any = "gp", - surrogate_model_args: dict | None = None, - cost_model_args: dict | None = None, - optimal_assignment: bool = False, - domain_se_kernel: str | None = None, - graph_kernels: list | None = None, - hp_kernels: list | None = None, - acquisition: str | BaseAcquisition = "EI", - log_prior_weighted: bool = False, - acquisition_sampler: str | AcquisitionSampler = "mutation", - random_interleave_prob: float = 0.0, - patience: int = 100, - budget: None | int | float = None, - ignore_errors: bool = False, - loss_value_on_error: None | float = None, - cost_value_on_error: None | float = None, - logger=None, - ): - """Initialise the BO loop. - - Args: - pipeline_space: Space in which to search - initial_design_size: Number of 'x' samples that need to be evaluated before - selecting a sample using a strategy instead of randomly. - surrogate_model: Surrogate model - cost_model: Cost model - surrogate_model_args: Arguments that will be given to the surrogate model - (the Gaussian processes model). - cost_model_args: Arguments that will be given to the cost model - (the Gaussian processes model). - optimal_assignment: whether the optimal assignment kernel should be used. - domain_se_kernel: Stationary kernel name - graph_kernels: Kernels for NAS - hp_kernels: Kernels for HPO - acquisition: Acquisition strategy - log_prior_weighted: if to use log for prior - acquisition_sampler: Acquisition function fetching strategy - random_interleave_prob: Frequency at which random configurations are sampled - instead of configurations from the acquisition strategy. - patience: How many times we try something that fails before giving up. - budget: Maximum budget - ignore_errors: Ignore hyperparameter settings that threw an error and do not - raise an error. Error configs still count towards max_evaluations_total. - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error during bayesian optimization and will use given loss - value instead. default: None - cost_value_on_error: Setting this and loss_value_on_error to any float will - supress any error during bayesian optimization and will use given cost - value instead. 
default: None - logger: logger object, or None to use the neps logger - - Raises: - ValueError: if patience < 1 - ValueError: if initial_design_size < 1 - ValueError: if random_interleave_prob is not between 0.0 and 1.0 - ValueError: if no kernel is provided - """ - super().__init__( - pipeline_space=pipeline_space, - patience=patience, - logger=logger, - budget=budget, - ignore_errors=ignore_errors, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ) - - if initial_design_size < 1: - raise ValueError( - "BayesianOptimization needs initial_design_size to be at least 1" - ) - if not 0 <= random_interleave_prob <= 1: - raise ValueError("random_interleave_prob should be between 0.0 and 1.0") - - self._initial_design_size = initial_design_size - self._random_interleave_prob = random_interleave_prob - self._num_train_x: int = 0 - self._pending_evaluations: list = [] - self._model_update_failed: bool = False - - if ignore_errors: - self.logger.warning( - "ignore_errors was set, but this optimizer does not support it" - ) - - surrogate_model_args = surrogate_model_args or {} - cost_model_args = cost_model_args or {} - graph_kernels, hp_kernels = get_default_kernels( - self.pipeline_space, - domain_se_kernel, - graph_kernels, - hp_kernels, - optimal_assignment, - ) - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = graph_kernels - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - - if ( - not surrogate_model_args["graph_kernels"] - and not surrogate_model_args["hp_kernels"] - ): - raise ValueError("No kernels are provided!") - - if "vectorial_features" not in surrogate_model_args: - surrogate_model_args["vectorial_features"] = ( - self.pipeline_space.get_vectorial_dim() - ) - - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) - - if "graph_kernels" not in cost_model_args: - cost_model_args["graph_kernels"] = graph_kernels - if "hp_kernels" not in cost_model_args: - cost_model_args["hp_kernels"] = hp_kernels - - if not cost_model_args["graph_kernels"] and not cost_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - - if "vectorial_features" not in cost_model_args: - cost_model_args["vectorial_features"] = ( - self.pipeline_space.get_vectorial_dim() - ) - - self.cost_model = instance_from_map( - SurrogateModelMapping, - cost_model, - name="cost model", # does changing this string work? - kwargs=cost_model_args, - ) - - orig_acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - ) - - self.acquisition = CostCooler(orig_acquisition) - - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, - ) - - @override - def load_optimization_state( - self, - previous_results: dict[str, ConfigResult], - pending_evaluations: dict[str, SearchSpace], - budget_info: BudgetInfo | None, - optimizer_state: dict[str, Any], - ) -> None: - # TODO(Jan): read out cost and fit cost model - if budget_info is None: - raise ValueError( - "Used budget is not set in the optimizer state but is required" - " for cost cooling, please return a `'cost'` when you return results" - " and/or a `max_cost_budget` when running NePS!" 
- ) - self.used_budget = budget_info.used_cost_budget - - train_x = [el.config for el in previous_results.values()] - train_y = [self.get_loss(el.result) for el in previous_results.values()] - train_cost = [self.get_cost(el.result) for el in previous_results.values()] - self._num_train_x = len(train_x) - self._pending_evaluations = list(pending_evaluations.values()) - if self._num_train_x >= self._initial_design_size: - try: - if len(self._pending_evaluations) > 0: - # We want to use hallucinated results for the evaluations that have - # not finished yet. For this we fit a model on the finished - # evaluations and add these to the other results to fit another model. - self.surrogate_model.fit(train_x, train_y) - self.cost_model.fit(train_x, train_cost) - ys, _ = self.surrogate_model.predict(self._pending_evaluations) - zs, _ = self.cost_model.predict(self._pending_evaluations) - train_x += self._pending_evaluations - train_y += list(ys.detach().numpy()) - train_cost += list(zs.detach().numpy()) - - self.surrogate_model.fit(train_x, train_y) - self.cost_model.fit(train_x, train_cost) - # TODO: set acquisition state - self.acquisition.set_state( - self.surrogate_model, - alpha=1 - - (budget_info.used_cost_budget / budget_info.max_cost_budget), - cost_model=self.cost_model, - ) - self.acquisition_sampler.set_state(x=train_x, y=train_y) - - self._model_update_failed = False - except RuntimeError as runtime_error: - self.logger.exception( - "Model could not be updated due to below error. Sampling will not use" - " the model." - ) - if self.loss_value_on_error is None or self.cost_value_on_error is None: - raise ValueError( - "A RuntimeError happened and " - "loss_value_on_error or cost_value_on_error " - "value is not provided, please fix the error or " - "provide the values to continue without " - "updating the model" - ) from runtime_error - self._model_update_failed = True From 0338a99909491c96002223d88989c1d04beebbeb Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 27 Aug 2024 19:12:47 +0200 Subject: [PATCH 16/63] fix: CenteredPrior prefers [0, 1] sample domain --- neps/priors.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/neps/priors.py b/neps/priors.py index 471718f0..116b31f9 100644 --- a/neps/priors.py +++ b/neps/priors.py @@ -118,7 +118,7 @@ def uniform( return UniformPrior(domains=domains, device=device) @classmethod - def make_centered( # noqa: C901 + def make_centered( cls, domains: Mapping[str, Domain], centers: Mapping[str, tuple[Any, float]], @@ -189,8 +189,8 @@ def make_centered( # noqa: C901 center_confidence = centers.get(name) if center_confidence is None: dist = DistributionOverDomain( - distribution=torch.distributions.Uniform(domain.lower, domain.upper), - domain=domain, + distribution=torch.distributions.Uniform(0.0, 1.0), + domain=UNIT_FLOAT_DOMAIN, ) continue @@ -227,16 +227,15 @@ def make_centered( # noqa: C901 continue # We place a truncnorm over a unitnorm - if domain.log_bounds is not None: - domain.to_unit(torch.tensor(center, device=device, dtype=torch.float64)) - torch.tensor(1 - confidence, device=device, dtype=torch.float64) - + unit_center = domain.to_unit( + torch.tensor(center, device=device, dtype=torch.float64) + ) dist = DistributionOverDomain( distribution=TruncatedNormal( - loc=center, + loc=unit_center, scale=(1 - confidence), - a=domain.lower, - b=domain.upper, + a=0.0, + b=1.0, device=device, ), domain=UNIT_FLOAT_DOMAIN, From 058ab2b5419f72aed93c6693a9b0b333808e4cc5 Mon Sep 17 00:00:00 2001 From: 
eddiebergman Date: Wed, 28 Aug 2024 13:41:42 +0200 Subject: [PATCH 17/63] refactor: Revamp Sampler and Prior --- .../bayesian_optimization/models/gp.py | 85 ++-- .../bayesian_optimization/optimizer.py | 149 ++++--- neps/optimizers/initial_design.py | 88 +---- neps/sampling/__init__.py | 0 neps/{ => sampling}/priors.py | 270 ++++++++----- neps/sampling/samplers.py | 120 ++++++ neps/search_spaces/domain.py | 171 +++++--- neps/search_spaces/encoding.py | 364 ++++-------------- neps/search_spaces/search_space.py | 9 +- 9 files changed, 652 insertions(+), 604 deletions(-) create mode 100644 neps/sampling/__init__.py rename neps/{ => sampling}/priors.py (58%) create mode 100644 neps/sampling/samplers.py diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index b302edcd..206078cb 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -2,6 +2,7 @@ import logging import math +from functools import reduce from typing import TYPE_CHECKING, Any, Mapping, TypeVar import gpytorch @@ -13,11 +14,12 @@ from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import MaternKernel, ScaleKernel +from torch._dynamo.utils import product from neps.search_spaces.encoding import ( CategoricalToIntegerTransformer, - DataEncoder, - DataPack, + TensorEncoder, + TensorPack, ) if TYPE_CHECKING: @@ -114,7 +116,7 @@ def default_mean() -> gpytorch.means.ConstantMean: def default_matern_kernel( - N: int, # noqa: N803 + N: int, active_dims: tuple[int, ...] | None = None, ) -> ScaleKernel: lengthscale_prior, lengthscale_constraint = default_lengthscale_prior(N) @@ -131,7 +133,7 @@ def default_matern_kernel( def default_categorical_kernel( - N: int, # noqa: N803 + N: int, active_dims: tuple[int, ...] | None = None, ) -> ScaleKernel: # Following BoTorches implementation of the MixedSingleTaskGP @@ -145,30 +147,20 @@ def default_categorical_kernel( def default_single_obj_gp( - x: DataPack, + x: TensorPack, y: torch.Tensor, ) -> SingleTaskGP: encoder = x.encoder - assert x.tensor is not None - assert encoder.tensors is not None - # Here, we will collect all graph encoded hyperparameters and assign each - # to its own individual WL kernel. 
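The "CenteredPrior prefers [0, 1] sample domain" fix in the previous patch maps each configured center through `Domain.to_unit` before placing the truncated normal over [0, 1]. A small sketch of that mapping for a hypothetical log-scaled float range, using the `Domain` helpers as they stand after this refactor:

    import torch
    from neps.search_spaces.domain import Domain

    # Hypothetical learning-rate style range with a prior default of 1e-2.
    domain = Domain.float(1e-4, 1e-1, log=True)
    center = torch.tensor(1e-2, dtype=torch.float64)

    # Normalised in log space:
    # (log(1e-2) - log(1e-4)) / (log(1e-1) - log(1e-4)) = 2/3
    unit_center = domain.to_unit(center)

    # The truncated normal is then built with loc=unit_center, scale=(1 - confidence)
    # and bounds a=0.0, b=1.0; samples are cast back to the value domain afterwards.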
- if encoder.graphs is not None: - raise NotImplementedError("Graphs are not yet supported.") - - numerics: list[str] = [] - categoricals: list[str] = [] - for hp_name, transformer in encoder.tensors.transformers.items(): + numerics: list[int] = [] + categoricals: list[int] = [] + for hp_name, transformer in encoder.transformers.items(): if isinstance(transformer, CategoricalToIntegerTransformer): - categoricals.append(hp_name) + categoricals.append(encoder.index_of[hp_name]) else: - numerics.append(hp_name) - - categorical_indices = encoder.indices(categoricals) - numeric_indices = encoder.indices(numerics) + numerics.append(encoder.index_of[hp_name]) # Purely vectorial - if len(categorical_indices) == 0: + if len(categoricals) == 0: return SingleTaskGP( train_X=x.tensor, train_Y=y, @@ -180,7 +172,7 @@ def default_single_obj_gp( ) # Purely categorical - if len(numeric_indices) == 0: + if len(numerics) == 0: return SingleTaskGP( train_X=x.tensor, train_Y=y, @@ -214,7 +206,7 @@ def cont_kernel_factory( return MixedSingleTaskGP( train_X=x.tensor, train_Y=y, - cat_dims=list(categorical_indices), + cat_dims=categoricals, likelihood=default_likelihood_with_prior(), cont_kernel_factory=cont_kernel_factory, outcome_transform=Standardize(m=1), @@ -223,25 +215,26 @@ def cont_kernel_factory( def optimize_acq( acq_fn: AcquisitionFunction, - encoder: DataEncoder, + encoder: TensorEncoder, *, n_candidates_required: int = 1, num_restarts: int = 20, n_intial_start_points: int = 512, acq_options: Mapping[str, Any] | None = None, + maximum_allowed_categorical_combinations: int = 30, ) -> tuple[torch.Tensor, torch.Tensor]: acq_options = acq_options or {} - if encoder.has_graphs(): - raise NotImplementedError("Graphs are not yet supported.") - assert encoder.tensors is not None - lower = [t.domain.lower for t in encoder.tensors.transformers.values()] - upper = [t.domain.upper for t in encoder.tensors.transformers.values()] + lower = [domain.lower for domain in encoder.domains.values()] + upper = [domain.upper for domain in encoder.domains.values()] bounds = torch.tensor([lower, upper], dtype=torch.float) - fixed_categoricals = encoder.categorical_product_indices() - - if not any(fixed_categoricals): + cat_transformers = { + name: t + for name, t in encoder.transformers.items() + if isinstance(t, CategoricalToIntegerTransformer) + } + if not any(cat_transformers): return optimize_acqf( acq_function=acq_fn, bounds=bounds, @@ -251,14 +244,36 @@ def optimize_acq( **acq_options, ) - if len(fixed_categoricals) > 30: + # We need to generate the product of all possible combinations of categoricals, + # first we do a sanity check + n_combos = reduce( + lambda x, y: x * y, [len(t.choices) for t in cat_transformers.values()] + ) + if n_combos > maximum_allowed_categorical_combinations: raise ValueError( "The number of fixed categorical dimensions is too high. " "This will lead to an explosion in the number of possible " - "combinations. Please reduce the number of fixed categorical " + f"combinations. Got: {n_combos} while the setting for the function" + f" is: {maximum_allowed_categorical_combinations=}. Consider reducing the " "dimensions or consider encoding your categoricals in some other format." 
) + # Right, now we generate all possible combinations + # First, just collect the possible values per cat column + # NOTE: Botorchs optim requires them to be as floats + cats: dict[int, list[float]] = { + encoder.index_of[name]: [float(i) for i in range(len(transformer.choices))] + for name, transformer in cat_transformers.items() + } + + # Second, generate all possible combinations + fixed_cats: list[dict[int, float]] + if len(cats) == 1: + col, choice_indices = next(iter(cats.items())) + fixed_cats = [{col: i} for i in choice_indices] + else: + fixed_cats = [dict(zip(cats.keys(), combo)) for combo in product(*cats.values())] + # TODO: we should deterministicall shuffle the fixed_categoricals as the # underlying function does not. return optimize_acqf_mixed( @@ -267,6 +282,6 @@ def optimize_acq( num_restarts=num_restarts, raw_samples=n_intial_start_points, q=n_candidates_required, - fixed_features_list=fixed_categoricals, # type: ignore + fixed_features_list=fixed_cats, **acq_options, ) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 5a1314db..83a65ce2 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -17,9 +17,10 @@ default_single_obj_gp, optimize_acq, ) -from neps.optimizers.initial_design import Sobol +from neps.optimizers.initial_design import PriorInitialDesign, Sobol from neps.priors import Prior -from neps.search_spaces.encoding import DataEncoder +from neps.search_spaces.domain import Domain +from neps.search_spaces.encoding import TensorEncoder, TensorPack from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: @@ -28,11 +29,34 @@ from neps.search_spaces import ( SearchSpace, ) - from neps.search_spaces.domain import Domain - from neps.search_spaces.encoding import DataPack + from neps.search_spaces.hyperparameters.float import FloatParameter + from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.state import BudgetInfo, Trial +def pibo_acq_beta_and_n( + n_sampled_already: int, ndims: int, budget_info: BudgetInfo +) -> tuple[float, float]: + if budget_info.max_evaluations is not None: + # From the PIBO paper (Section 4.1) + # https://arxiv.org/pdf/2204.11051 + beta = budget_info.max_evaluations / 10 + return n_sampled_already, beta + + if budget_info.max_cost_budget is not None: + # This might not work well if cost number is high + # early on, but it will start to normalize. + n = budget_info.used_cost_budget + beta = budget_info.max_cost_budget / 10 + return n, beta + + # Otherwise, just some random heuristic based on the number + # of trials and dimensionality of the search space + # TODO: Think about and evaluate this more. + beta = ndims**2 / 10 + return n_sampled_already, beta + + class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" @@ -41,10 +65,14 @@ def __init__( pipeline_space: SearchSpace, *, initial_design_size: int | None = None, - surrogate_model: Literal["gp"] | Callable[[DataPack, torch.Tensor], Model] = "gp", + surrogate_model: ( + Literal["gp"] | Callable[[TensorPack, torch.Tensor], Model] + ) = "gp", use_priors: bool = False, sample_default_first: bool = False, device: torch.device | None = None, + encoder: TensorEncoder | None = None, + treat_fidelity_as_hyperparameters: bool = False, **kwargs: Any, # TODO: Remove ): """Initialise the BO loop. 
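A minimal sketch of the fixed-feature enumeration added to `optimize_acq` above, with two hypothetical categorical columns (the column indices and choice counts are made up); plain `itertools.product` stands in here for the `product` helper imported in the patch:

    from itertools import product

    # Hypothetical encoded categoricals: tensor column index -> choice indices as
    # floats, since botorch's optimize_acqf_mixed expects float-valued fixed features.
    cats: dict[int, list[float]] = {
        2: [0.0, 1.0],        # a 2-choice categorical sitting in column 2
        3: [0.0, 1.0, 2.0],   # a 3-choice categorical sitting in column 3
    }

    # One fixed-feature dict per combination, exactly what fixed_features_list expects.
    fixed_cats = [dict(zip(cats.keys(), combo)) for combo in product(*cats.values())]

    assert len(fixed_cats) == 2 * 3
    # fixed_cats[0] == {2: 0.0, 3: 0.0}, ..., fixed_cats[-1] == {2: 1.0, 3: 2.0}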
@@ -54,8 +82,16 @@ def __init__( initial_design_size: Number of samples used before using the surrogate model. If None, it will take `int(log(N) ** 2)` samples where `N` is the number of parameters in the search space. - surrogate_model: Surrogate model + surrogate_model: Surrogate model, either a known model str or a callable + that takes in the training data and returns a model fitted to (X, y). use_priors: Whether to use priors set on the hyperparameters during search. + sample_default_first: Whether to sample the default configuration first. + device: Device to use for the optimization. + encoder: Encoder to use for encoding the configurations. If None, it will + will use the default encoder. + treat_fidelity_as_hyperparameters: Whether to treat fidelities as + hyperparameters. If left as False, fidelities will be ignored + and configurations will always be sampled at the maximum fidelity. Raises: ValueError: if initial_design_size < 1 @@ -76,12 +112,20 @@ def __init__( super().__init__(pipeline_space=pipeline_space) - self.encoder = DataEncoder.default_encoder( - pipeline_space, - include_fidelities=False, - ) - # We should only be acting on tensor'able hyperparameters for now - assert self.encoder.tensors is not None + if encoder is None: + parameters: dict[ + str, + CategoricalParameter | FloatParameter | IntegerParameter, + ] = { + **pipeline_space.numerical, + **pipeline_space.categoricals, + } + if treat_fidelity_as_hyperparameters: + parameters.update(pipeline_space.fidelities) + + self.encoder = TensorEncoder.default(parameters) + else: + self.encoder = encoder # TODO: This needs to be moved to the search space class, however # to not break the current prior based APIs used elsewhere, we can @@ -96,8 +140,7 @@ def __init__( domains: dict[str, Domain] = {} centers: dict[str, tuple[Any, float]] = {} categoricals: set[str] = set() - for name in self.encoder.tensors.names(): - hp = self.pipeline_space.hyperparameters[name] + for name, hp in parameters.items(): domains[name] = hp.domain # type: ignore if isinstance(hp, CategoricalParameter): @@ -106,11 +149,13 @@ def __init__( if hp.default is None: continue - confidence_score: float = hp.default_confidence_choice # type: ignore - if isinstance(hp, CategoricalParameter): - center = hp._default_index - else: - center = hp.default + confidence_str = hp.default_confidence_choice + confidence_score = _mapping[confidence_str] + center = ( + hp._default_index + if isinstance(hp, CategoricalParameter) + else hp.default + ) centers[name] = (center, confidence_score) @@ -160,11 +205,9 @@ def ask( space = self.pipeline_space config_id = str(len(trials) + 1) - assert self.encoder.tensors is not None # Fill intitial design data if we don't have any... if self.initial_design_ is None: - size = self.n_initial_design self.initial_design_ = [] # Add the default configuration first (maybe) @@ -172,10 +215,24 @@ def ask( config = space.sample_default_configuration() self.initial_design_.append(config.hp_values()) - # Fill remaining with Sobol sequence samples - sobol = Sobol(seed=0, encoder=self.encoder, allow_undersampling=True) - sobol_configs = sobol.sample(size - len(self.initial_design_)) - self.initial_design_.extend(sobol_configs) + if self.prior: + sampler = PriorInitialDesign(prior=self.prior, seed=0) + else: + sampler = Sobol(ndim=self.encoder.ncols, seed=0, scramble=True) + + n_samples = self.n_initial_design - len(self.initial_design_) + + # We add a buffer of 2x the samples to help ensure + # we get get enough after removing duplicates. 
+ x = sampler.sample(n_samples * 2) + x = Domain.translate( + x, + to=self.encoder.domains.values(), + frm=sampler.sample_domain, + ) + uniq_x = torch.unique(x, dim=0) + configs = self.encoder.decode_dicts(uniq_x[:n_samples]) + self.initial_design_.extend(configs) # If we havn't passed the intial design phase, just return # the next one. @@ -187,13 +244,10 @@ def ask( ) # Now we actually do the BO loop, start by encoding the data - x = self.encoder.encode(x_configs, device=self.device) + x = self.encoder.pack(x_configs, device=self.device) + x_pending = None if any(pending): - x_pending = self.encoder.encode(pending, device=self.device) - x_pending = x_pending.tensor - assert x_pending is not None - else: - x_pending = None + x_pending = self.encoder.pack(pending, device=self.device) # Get our fitted model model = self._get_fitted_model(x, y) @@ -204,7 +258,7 @@ def ask( acq = qLogExpectedImprovement( model, best_f=y.min(), - X_pending=x_pending, + X_pending=None if x_pending is None else x_pending.tensor, # Unfortunatly, there's no option to indicate that we minimize # the AcqFunction so we need to do some kind of transformation. # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 @@ -213,34 +267,15 @@ def ask( # If we have a prior, then we use it with PiBO if self.prior: - if budget_info.max_evaluations is not None: - # From the PIBO paper (Section 4.1) - # https://arxiv.org/pdf/2204.11051 - n = budget_info.used_evaluations - beta = budget_info.max_evaluations / 10 - - elif budget_info.max_cost_budget is not None: - # This might not work well if cost number is high - # early on, but it will start to normalize. - n = budget_info.used_cost_budget - beta = budget_info.max_cost_budget / 10 - - else: - # Otherwise, just some random heuristic based on the number - # of trials and dimensionality of the search space - # TODO: Think about and evaluate this more. - ndim = x.tensor.shape[1] # type: ignore - n = len(x_configs) - beta = ndim**2 / 10 - + n, beta = pibo_acq_beta_and_n( + n_sampled_already=len(trials), + ndims=self.encoder.ncols, + budget_info=budget_info, + ) acq = PiboAcquisition(acq, prior=self.prior, n=n, beta=beta) # Optimize it - candidates, _eis = optimize_acq( - acq_fn=acq, - encoder=self.encoder, - acq_options={}, # options to underlying optim function of botorch - ) + candidates, _eis = optimize_acq(acq_fn=acq, encoder=self.encoder, acq_options={}) # Take the first (and only?) candidate assert len(candidates) == 1 diff --git a/neps/optimizers/initial_design.py b/neps/optimizers/initial_design.py index e80dfe33..c8039c6a 100644 --- a/neps/optimizers/initial_design.py +++ b/neps/optimizers/initial_design.py @@ -1,80 +1,34 @@ +"""Initial design of points for optimization.""" + from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Protocol - -import torch - -from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain +from typing import TYPE_CHECKING +from typing_extensions import override if TYPE_CHECKING: - from neps.search_spaces.encoding import DataEncoder - + import torch -class InitialDesign(Protocol): - def sample(self, n: int) -> list[dict[str, Any]]: ... 
+ from neps.priors import Prior + from neps.search_spaces.domain import Domain @dataclass -class Sobol(InitialDesign): - seed: int - """The seed for the Sobol sequence.""" - - encoder: DataEncoder - """The encoding used to encode the samples.""" - - scramble: bool = True - """Whether to scramble the Sobol sequence.""" - - buffer_sample_multiplier: int = 2 - """How many samples to generate in the buffer before checking for uniqueness.""" - - allow_undersampling: bool = False - """If True, will allow undersampling if we can't generate `n` unique samples.""" - - def sample(self, n: int) -> list[dict[str, Any]]: - """Sample `n` points from the Sobol sequence. - - !!! warning - - If `self.allow_undersampling` is False, this method will raise a ValueError if - it cannot generate `n` unique samples. - - Args: - n: The number of points to sample. - - Returns: - A list of `n` points sampled from the Sobol sequence. - """ - assert self.encoder.tensors is not None - - if self.encoder.has_graphs(): - # TODO: Won't work on graphs - raise NotImplementedError("Graphs are not yet supported.") - - if self.encoder.n_numerical == 0 and self.encoder.n_categorical > 0: - # TODO: We need to do something else if we have only categoricals - # as we are going to get a lot of duplicates - raise NotImplementedError("Only categorical variables are not yet supported.") - - ndim = self.encoder.n_numerical + self.encoder.n_categorical - sobol = torch.quasirandom.SobolEngine(dimension=ndim, scramble=True, seed=5) +class PriorInitialDesign(InitialDesign): + """Sample from a prior distribution.""" - SAMPLE_SIZE = self.buffer_sample_multiplier * n - unit_x = sobol.draw(SAMPLE_SIZE, dtype=torch.float64) + prior: Prior + """The prior to sample from.""" - x = Domain.cast_many( - unit_x, - to=list(self.encoder.tensors.domains().values()), - frm=UNIT_FLOAT_DOMAIN, - ) + # TODO: Right now we don't have a way to set the seed temporarily + seed: int | None = None + """The seed for sampling.""" - # NOTE: We have to check uniqueness after conversion from unit cube space - # as we could have multiple unit floats mapping to the same categories or integers - unique_x = torch.unique(x, dim=0) - if len(unique_x) < n and not self.allow_undersampling: - raise ValueError( - f"Could not generate {n} unique samples, got {len(unique_x)}\n{self=}" - ) + @override + def sample(self, n: int) -> torch.Tensor: + return self.prior.sample(n) - return self.encoder.decode_dicts(unique_x[:n]) + @property + @override + def sample_domain(self) -> list[Domain]: + return self.prior.domains diff --git a/neps/sampling/__init__.py b/neps/sampling/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neps/priors.py b/neps/sampling/priors.py similarity index 58% rename from neps/priors.py rename to neps/sampling/priors.py index 116b31f9..8ccc76e9 100644 --- a/neps/priors.py +++ b/neps/sampling/priors.py @@ -4,31 +4,31 @@ variables, i.e. each column of a tensor is assumed to be independent and can be acted on independently. -They are not a `torch.distributions.Distribution` subclass as methods like -`entropy` and `kl_divergence` are just more difficult to implement -(not impossible, just more difficult and not needed right now). - See the class doc description of [`Prior`][neps.priors.Prior] for more details. 
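A short usage sketch of the shape contract described above, assuming the module layout introduced in this series (`neps.sampling.priors`, `neps.search_spaces.domain`) and the `UniformPrior` defined further down in this patch:

    import torch
    from neps.sampling.priors import UniformPrior
    from neps.search_spaces.domain import Domain

    # Two independent columns: a log-scaled float and a plain float range.
    domains = [Domain.float(1e-4, 1e-1, log=True), Domain.float(0.0, 10.0)]

    prior = UniformPrior(ncols=2)
    x = prior.sample(5, to=domains)      # shape (5, 2), values already in the target domains
    lp = prior.log_prob(x, frm=domains)  # shape (5,), one log-probability per sample row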
""" from __future__ import annotations from dataclasses import dataclass, field +from functools import reduce from typing import TYPE_CHECKING, Any, Container, Mapping, Protocol from typing_extensions import override import torch from neps.distributions import DistributionOverDomain, TruncatedNormal +from neps.sampling.samplers import Sampler from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain if TYPE_CHECKING: from torch.distributions import Distribution -class Prior(Protocol): +class Prior(Sampler, Protocol): """A protocol for priors over search spaces. + Extends from the [`Sampler`][neps.samplers.Sampler] protocol. + At it's core, the two methods that need to be implemented are `log_prob` and `sample`. The `log_prob` method should return the log probability of a given tensor of samples under its distribution. @@ -55,67 +55,51 @@ class Prior(Protocol): actually be `1` (1 / 1) for any value inside the domain. """ - domains: list[Domain] - """Domain of values which this prior acts upon. - - Each domain corresponds to the corresponding `ndim` in a tensor - (n_samples, ndim). - """ - - device: torch.device | None - """Device to place the tensors on.""" - - def log_prob(self, x: torch.Tensor) -> torch.Tensor: + def log_prob( + self, + x: torch.Tensor, + *, + frm: list[Domain] | Domain, + ) -> torch.Tensor: """Compute the log probability of values in `x` under a prior. - All columns of `x` are assumed to be independent, such that the + The last dimenion of `x` is assumed to be independent, such that the log probability of the entire tensor is the sum of the log - probabilities of each column. + probabilities of each element in that dimension. + + For example, if `x` is of shape `(n_samples, n_dims)`, then the + you will be given back a tensor of shape `(n_samples,)` with the + each entry being the log probability of the corresponding sample. Args: - x: Tensor of shape (n_samples, n_dims) + x: Tensor of shape (..., n_dims) In the case of a 1D tensor, the shape is assumed to be (n_dims,) + frm: The domain of the values in `x`. If a single domain, then all the + values are assumed to be from that domain, otherwise each column + `n_dims` in (n_samples, n_dims) is from the corresponding domain. Returns: - Tensor of shape (n_samples,) with the log probabilities of each. In the + Tensor of shape (...,), with the last dimension reduced out. In the case that only single dimensional tensor is passed, the returns value is a scalar. """ ... - def sample(self, n: int) -> torch.Tensor: - """Sample from the prior. - - Args: - n: Number of samples to draw. - - Returns: - Tensor of shape (n, n_dims) with the samples. - """ - ... - - def prob(self, x: torch.Tensor) -> torch.Tensor: + def prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: """Compute the probability of values in `x` under a prior. See [`log_prob()`][neps.priors.Prior.log_prob] for details on shapes. """ - return torch.exp(self.log_prob(x)) + return torch.exp(self.log_prob(x, frm=frm)) @classmethod - def uniform( - cls, - domains: Mapping[str, Domain] | list[Domain], - *, - device: torch.device | None = None, - ) -> UniformPrior: + def uniform(cls, ncols: int) -> UniformPrior: """Create a uniform prior for a given list of domains. Args: - domains: domains over which to have a uniform prior. - device: Device to place the tensors on. + ncols: The number of columns in the tensor to sample. 
""" - domains = domains if isinstance(domains, list) else list(domains.values()) - return UniformPrior(domains=domains, device=device) + return UniformPrior(ncols=ncols) @classmethod def make_centered( @@ -242,9 +226,7 @@ def make_centered( ) distributions.append(dist) - return CenteredPrior( - domains=list(domains.values()), distributions=distributions, device=device - ) + return CenteredPrior(distributions=distributions) @dataclass @@ -261,12 +243,6 @@ class CenteredPrior(Prior): [`Prior.make_centered()`][neps.priors.Prior.make_centered]. """ - domains: list[Domain] - """Domain of values.""" - - device: torch.device | None - """Device to place the tensors on.""" - distributions: list[DistributionOverDomain] """Distributions along with the corresponding domains they sample from.""" @@ -275,11 +251,24 @@ class CenteredPrior(Prior): def __post_init__(self): self._distribution_domains = [dist.domain for dist in self.distributions] + @property + @override + def ncols(self) -> int: + return len(self.distributions) + @override - def log_prob(self, x: torch.Tensor) -> torch.Tensor: + def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> torch.Tensor: + if x.ndim == 0: + raise ValueError("Expected a tensor of shape (..., ncols).") + + if x.ndim == 1: + x = x.unsqueeze(0) + # Cast all values from the value domains to the domain of the sampler. - sample_domain_tensor = Domain.cast_many( - x, frm=self.domains, to=self._distribution_domains + sample_domain_tensor = Domain.translate( + x, + frm=frm, + to=self._distribution_domains, ) # Calculate the log probabilities of the sample domain tensors under their @@ -289,25 +278,34 @@ def log_prob(self, x: torch.Tensor) -> torch.Tensor: dist.distribution.log_prob(sample_domain_tensor[:, i]) for i, dist in enumerate(self.distributions) ], - dim=1, + dim=-1, ) - return torch.sum(log_probs, dim=1) + return torch.sum(log_probs, dim=-1) @override - def sample(self, n: int) -> torch.Tensor: - buffer = torch.empty( - n, - len(self.distributions), - device=self.device, - dtype=torch.float64, + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Seeding is not yet implemented.") + + _out_shape = ( + torch.Size((n, self.ncols)) + if isinstance(n, int) + else torch.Size((*n, self.ncols)) ) + _n = torch.Size((n,)) if isinstance(n, int) else n - size = torch.Size((n,)) - for i, (value_domain, frm) in enumerate(zip(self.domains, self.distributions)): - samples = frm.distribution.sample(size) - buffer[:, i] = value_domain.cast(samples, frm=frm.domain) + out = torch.empty(_out_shape, device=device, dtype=torch.float64) + for i, dist in enumerate(self.distributions): + out[..., i] = dist.distribution.sample(_n) - return buffer + return Domain.translate(out, frm=self._distribution_domains, to=to) @dataclass @@ -317,49 +315,119 @@ class UniformPrior(Prior): Uses a UnitUniform under the hood before converting to the value domain. """ - domains: list[Domain] - """Domain of values.""" - - device: torch.device | None - """Device to place the tensors on.""" + ncols: int + """The number of columns in the tensor to sample from.""" _unit_uniform: Distribution = field(init=False, repr=False) def __post_init__(self): self._unit_uniform = torch.distributions.Uniform(0.0, 1.0) - def log_prob(self, x: torch.Tensor) -> torch.Tensor: - """Compute the log probability of values in `x` under a prior. 
+ @override + def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: + sample_domain_tensor = Domain.translate(x, frm=frm, to=UNIT_FLOAT_DOMAIN) + return torch.sum(self._unit_uniform.log_prob(sample_domain_tensor), dim=-1) - All columns of `x` are assumed to be independent, such that the - log probability of the entire tensor is the sum of the log - probabilities of each column. + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Seeding is not yet implemented.") + + _n = ( + torch.Size((n, self.ncols)) + if isinstance(n, int) + else torch.Size((*n, self.ncols)) + ) + samples = torch.rand(_n, device=device, dtype=torch.float64) + return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to) - Args: - x: Tensor of shape (n_samples, n_dims) - In the case of a 1D tensor, the shape is assumed to be (n_dims,) - Returns: - Tensor of shape (n_samples,) with the log probabilities of each. In the - case that only single dimensional tensor is passed, the returns value - is a scalar. - """ - sample_domain_tensor = Domain.cast_many(x, frm=self.domains, to=UNIT_FLOAT_DOMAIN) - return torch.sum(self._unit_uniform.log_prob(sample_domain_tensor), dim=1) +@dataclass +class WeightedPrior(Prior): + """A prior consisting of multiple priors with weights.""" - def sample(self, n: int) -> torch.Tensor: - """Sample from the prior. + priors: list[Prior] + weights: torch.Tensor + probabilities: torch.Tensor = field(init=False, repr=False) - Args: - n: Number of samples to draw. + def __post_init__(self): + if len(self.priors) < 2: + raise ValueError(f"At least two priors must be given. Got {len(self.priors)}") - Returns: - Tensor of shape (n, n_dims) with the samples. 
- """ - samples = torch.rand( - n, - len(self.domains), - device=self.device, - dtype=torch.float64, + if self.weights.ndim != 1: + raise ValueError("Weights must be a 1D tensor.") + + if len(self.priors) != len(self.weights): + raise ValueError("The number of priors and weights must be the same.") + + self.probabilities = self.weights / self.weights.sum() + + @override + def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: + # OPTIM: Avoid an initial allocation by using the output of the first + # distribution to store the weighted probabilities + itr = zip(self.probabilities, self.priors) + first_prob, first_prior = next(itr) + + weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) + for prob, prior in itr: + weighted_probs += prob * prior.log_prob(x, frm=frm) + + return weighted_probs + + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Seeding is not yet implemented.") + + # Calculate the total number of samples required + if isinstance(n, int): + total_samples = n + output_shape = (n, self.ncols) + else: + total_samples = reduce(lambda x, y: x * y, n) + output_shape = (*n, self.ncols) + + # Randomly select which prior to sample from for each of the total_samples + chosen_priors = torch.empty((total_samples,), device=device, dtype=torch.int64) + chosen_priors = torch.multinomial( + self.probabilities, + total_samples, + replacement=True, + out=chosen_priors, + ) + + # Create an empty tensor to hold all samples + output_samples = torch.empty( + (total_samples, self.ncols), device=device, dtype=torch.float64 ) - return Domain.cast_many(samples, frm=UNIT_FLOAT_DOMAIN, to=self.domains) + + # Loop through each prior and its associated indices + for i, prior in enumerate(self.priors): + # Find indices where the chosen prior is i + _i = torch.tensor(i, dtype=torch.int64, device=device) + indices = torch.where(chosen_priors == _i)[0] + + if len(indices) > 0: + # Sample from the prior for the required number of indices + samples_from_prior = prior.sample(len(indices), to=to, device=device) + output_samples[indices] = samples_from_prior + + # Reshape to the output shape including ncols dimension + output_samples = output_samples.view(output_shape) + + return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py new file mode 100644 index 00000000..f0298d84 --- /dev/null +++ b/neps/sampling/samplers.py @@ -0,0 +1,120 @@ +"""Samplers for generating points in a search space. + +These are similar to [`Prior`][neps.priors.Prior] objects, but they +do not necessarily have an easily definable pdf. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from functools import reduce +from typing import Protocol +from typing_extensions import override + +import torch + +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain + + +class Sampler(Protocol): + """A protocol for sampling tensors and vonerting them to a given domain.""" + + @property + def ncols(self) -> int: + """The number of columns in the samples produced by this sampler.""" + ... + + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + """Sample `n` points and convert them to the given domain. 
+ + Args: + n: The number of points to sample. If a torch.Size, an additional dimension + will be added with [`.ncols`][neps.samplers.Sampler.ncols]. + For example, if `n = 5`, the output will be `(5, ncols)`. If + `n = (5, 3)`, the output will be `(5, 3, ncols)`. + to: The domain or list of domains to cast the points to. + If a single domain, all points are cast to that domain, otherwise + each column `ndim_i` in (n, ndim) is cast to the corresponding domain + in `to`. As a result, the length of `to` must match the number of columns + from [`.ncols`][neps.samplers.Sampler.ncols]. + seed: The seed for the random number generator. + device: The device to cast the samples to. + + Returns: + A tensor of (n, ndim) points sampled cast to the given domain. + """ + ... + + @classmethod + def sobol(cls, ndim: int, *, scramble: bool = True, seed: int | None = None) -> Sobol: + """Create a Sobol sampler. + + Args: + ndim: The number of dimensions to sample for. + scramble: Whether to scramble the Sobol sequence. + seed: The seed for the Sobol sequence. + + Returns: + A Sobol sampler. + """ + return Sobol(ndim=ndim, scramble=scramble, seed=seed) + + +# Technically this could be a prior with a uniform distribution +@dataclass +class Sobol(Sampler): + """Sample from a Sobol sequence.""" + + ndim: int + """The number of dimensions to sample for.""" + + seed: int | None = None + """The seed for the Sobol sequence.""" + + scramble: bool = True + """Whether to scramble the Sobol sequence.""" + + @property + @override + def ncols(self) -> int: + return self.ndim + + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Setting the seed is not supported yet") + + # Sobol can only produce 2d tensors. To handle batches or arbitrary + # dimensions, we get a count of the total number of samples needed + # and reshape the output tensor to the desired shape, if needed. + _n = n if isinstance(n, int) else reduce(lambda x, y: x * y, n) + + sobol = torch.quasirandom.SobolEngine( + dimension=self.ndim, + scramble=self.scramble, + seed=self.seed, + ) + + out = torch.empty(_n, self.ncols, dtype=torch.float64, device=device) + x = sobol.draw(_n, dtype=torch.float64, out=out) + + # If we got extra dimensions, such as batch dimensions, we need to + # reshape the tensor to the desired shape. + if isinstance(n, torch.Size): + x = x.view(*n, self.ncols) + + return Domain.translate(x, frm=UNIT_FLOAT_DOMAIN, to=to) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 2081bf33..5d92fac3 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -8,40 +8,44 @@ * The midpoint of the domain. * Whether the domain is split into bins. -With that, the primary method of a domain is to be able to cast +With that, the primary method of a domain is to be able to +[`cast()`][neps.search_spaces.domain.Domain.cast] a tensor of values from one to domain to another, e.g. `values_a = domain_a.cast(values_b, frm=domain_b)`. This can be used to convert float samples to integers, integers to log space, etc. -The core method to do so is to be able to cast `to_unit` which takes +The core method to do so is to be able to cast +[`to_unit()`][neps.search_spaces.domain.Domain.to_unit] which takes values to a unit interval [0, 1], and then to be able to cast values in [0, 1] -to the new domain with `from_unit`. 
+to the new domain with [`from_unit()`][neps.search_spaces.domain.Domain.from_unit]. There are some shortcuts implemented in `cast`, such as skipping going through the unit interval if the domains are the same, as no transformation is needed. The primary methods for creating a domain are -* `Domain.float(l, u, ...)` - Used for modelling float ranges -* `Domain.int(l, u, ...)` - Used for modelling integer ranges -* `Domain.indices(n)` - Primarly used to model categorical choices +* [`Domain.float(l, u, ...)`][neps.search_spaces.domain.Domain.float] - + Used for modelling float ranges +* [`Domain.int(l, u, ...)`][neps.search_spaces.domain.Domain.int] - + Used for modelling integer ranges +* [`Domain.indices(n)`][neps.search_spaces.domain.Domain.indices] - + Primarly used to model categorical choices If you have a tensor of values, where each column corresponds to a different domain, -you can take a look at `Domain.cast_many` to cast all the values in one go. +you can take a look at [`Domain.translate()`][neps.search_spaces.domain.Domain.translate] -If you need a unit-interval domain, please use the `Domain.unit_float()` or -`UNIT_FLOAT_DOMAIN` constant. +If you need a unit-interval domain, please use the +[`Domain.unit_float()`][neps.search_spaces.domain.Domain.unit_float] +or `UNIT_FLOAT_DOMAIN` constant. """ -# TODO: Could theoretically implement dtype,device,out for all methods here but -# would need to be careful not to accidentally send to and from GPU. from __future__ import annotations import math from dataclasses import dataclass, field -from typing import Generic, Sequence, TypeVar +from typing import Generic, Iterable, TypeVar import torch from torch import Tensor @@ -53,11 +57,36 @@ @dataclass(frozen=True) class Domain(Generic[V]): + """A domain for a value. + + The primary methods for creating a domain are + + * [`Domain.float(l, u, ...)`][neps.search_spaces.domain.Domain.float] - + Used for modelling float ranges + * [`Domain.int(l, u, ...)`][neps.search_spaces.domain.Domain.int] - + Used for modelling integer ranges + * [`Domain.indices(n)`][neps.search_spaces.domain.Domain.indices] - + Primarly used to model categorical choices + """ + lower: V + """The lower bound of the domain.""" + upper: V + """The upper bound of the domain.""" + round: bool + """Whether to round the values to the nearest integer.""" + log_bounds: tuple[float, float] | None = None + """The log bounds of the domain, if the domain is in log space.""" + bins: int | None = None + """The number of discrete bins to split the domain into. + + Includes both endpoints of the domain and values are rounded to the nearest bin + value. + """ dtype: torch.dtype = field(init=False, repr=False) is_unit_float: bool = field(init=False, repr=False) @@ -102,6 +131,17 @@ def float( log: bool = False, bins: int | None = None, ) -> Domain[float]: + """Create a domain for a range of float values. + + Args: + lower: The lower bound of the domain. + upper: The upper bound of the domain. + log: Whether the domain is in log space. + bins: The number of discrete bins to split the domain into. + + Returns: + A domain for a range of float values. + """ return Domain( lower=float(lower), upper=float(upper), @@ -119,6 +159,17 @@ def int( log: bool = False, bins: int | None = None, ) -> Domain[int]: + """Create a domain for a range of integer values. + + Args: + lower: The lower bound of the domain. + upper: The upper bound of the domain. + log: Whether the domain is in log space. 
+ bins: The number of discrete bins to split the domain into. + + Returns: + A domain for a range of integer values. + """ return Domain( lower=int(round(lower)), upper=int(round(upper)), @@ -134,13 +185,25 @@ def indices(cls, n: int) -> Domain[int]: Like range based functions this domain is inclusive of the lower bound and exclusive of the upper bound. - Use this method to create a domain for indices + Args: + n: The number of indices. + + Returns: + A domain for a range of indices. """ return Domain.int(0, n - 1) def to_unit(self, x: Tensor) -> Tensor: + """Transform a tensor of values from this domain to the unit interval [0, 1]. + + Args: + x: Tensor of values in this domain to convert. + + Returns: + Same shape tensor with the values normalized to the unit interval [0, 1]. + """ if self.is_unit_float: - return x # type: ignore + return x if self.log_bounds is not None: x = torch.log(x) @@ -151,6 +214,14 @@ def to_unit(self, x: Tensor) -> Tensor: return (x - lower) / (upper - lower) def from_unit(self, x: Tensor) -> Tensor: + """Transform a tensor of values from the unit interval [0, 1] to this domain. + + Args: + x: A tensor of values in the unit interval [0, 1] to convert. + + Returns: + Same shape tensor with the lifted into this domain. + """ if self.is_unit_float: return x @@ -173,11 +244,19 @@ def from_unit(self, x: Tensor) -> Tensor: return x.type(self.dtype) - def cast( - self, - x: Tensor, - frm: Domain, - ) -> Tensor: + def cast(self, x: Tensor, frm: Domain) -> Tensor: + """Cast a tensor of values frm the domain `frm` to this domain. + + If you need to cast a tensor of mixed domains, use + [`Domain.translate()`][neps.search_spaces.domain.Domain.translate]. + + Args: + x: Tensor of values in the `frm` domain to cast to this domain. + frm: The domain to cast from. + + Returns: + Same shape tensor with the values cast to this domain. + """ # NOTE: In general, we should always be able to go through the unit interval # [0, 1] to be able to transform between domains. However sometimes we can # bypass some steps, dependant on the domains, hence the ugliness... @@ -216,16 +295,20 @@ def cast( @classmethod def unit_float(cls) -> Domain[float]: + """Get a domain for the unit interval [0, 1].""" return UNIT_FLOAT_DOMAIN @classmethod - def cast_many( - cls, x: Tensor, frm: Domain | Sequence[Domain], to: Domain | Sequence[Domain] + def translate( + cls, + x: Tensor, + frm: Domain | Iterable[Domain], + to: Domain | Iterable[Domain], ) -> Tensor: """Cast a tensor of mixed domains to a new set of mixed domains. Args: - x: Tensor of shape (n_samples, n_dims) with each dim `i` corresponding + x: Tensor of shape (..., n_dims) with each dim `i` corresponding to the domain `frm[i]`. frm: List of domains to cast from. If list, must be length of `n_dims`, otherwise we assume the single domain provided is the one to be used @@ -235,43 +318,43 @@ def cast_many( across all dimensions. Returns: - Tensor of shape (n_samples, n_dims) with each dim `i` transformed - from the domain `frm[i]` to the domain `to[i]`. + Tensor of the same shape as `x` with the last dimension casted + from the domain `frm[i]` to the domain `to[i]`. """ + if x.ndim == 0: + raise ValueError("Expected a tensor with at least one dimension.") + if x.ndim == 1: - raise ValueError( - "Expected a 2D tensor of shape (n_samples, n_dims), got a 1D tensor." 
- ) + x = x.unsqueeze(0) - if isinstance(frm, Sequence) and len(frm) != x.shape[1]: + ndims = x.shape[-1] + + # If both are not a list, we can just cast the whole tensor + if isinstance(frm, Domain) and isinstance(to, Domain): + return to.cast(x, frm=frm) + + frm = [frm] * ndims if isinstance(frm, Domain) else list(frm) + to = [to] * ndims if isinstance(to, Domain) else list(to) + + if len(frm) != ndims: raise ValueError( "The number of domains in `frm` must match the number of tensors" " if provided as a list." - f" Expected {x.shape[1]}, got {len(frm)}." + f" Expected {ndims} from last dimension of {x.shape}, got {len(frm)}." ) - if isinstance(to, Sequence) and len(to) != x.shape[1]: + if len(to) != ndims: raise ValueError( "The number of domains in `to` must match the number of tensors" " if provided as a list." - f" Expected {x.shape[1]}, got {len(to)}." + f" Expected {ndims} from last dimension of {x.shape}, got {len(to)}." ) - # If both are not a list, we can just cast the whole tensor - if not isinstance(frm, Sequence) and not isinstance(to, Sequence): - return to.cast(x, frm=frm) - - # Otherwise, we need to go column by column - if isinstance(frm, Domain): - frm = [frm] * x.shape[1] - if isinstance(to, Domain): - to = [to] * x.shape[1] - - buffer = torch.empty_like(x) + out = torch.empty_like(x) for i, (f, t) in enumerate(zip(frm, to)): - buffer[:, i] = t.cast(x[:, i], frm=f) + out[..., i] = t.cast(x[..., i], frm=f) - return buffer + return out UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 21d7acd4..ad3b00ae 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,17 +1,14 @@ from __future__ import annotations from dataclasses import dataclass, field -from itertools import chain from typing import ( TYPE_CHECKING, Any, Generic, Mapping, Sequence, - Sized, TypeAlias, TypeVar, - overload, ) from typing_extensions import Protocol, override @@ -19,19 +16,19 @@ import numpy.typing as npt import torch from grakel.utils import graph_from_networkx -from torch._dynamo.utils import product -from neps.search_spaces.architecture.graph_grammar import GraphParameter from neps.search_spaces.domain import ( UNIT_FLOAT_DOMAIN, Domain, ) +from neps.search_spaces.hyperparameters.categorical import CategoricalParameter from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter if TYPE_CHECKING: import networkx as nx + from neps.search_spaces.parameter import Parameter from neps.search_spaces.search_space import SearchSpace WLInput: TypeAlias = tuple[dict, dict | None, dict | None] @@ -192,18 +189,18 @@ def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: @dataclass class TensorEncoder: transformers: dict[str, TensorTransformer] - column_lookup: dict[str, int] = field(init=False) + index_of: dict[str, int] = field(init=False) n_numerical: int = field(init=False) n_categorical: int = field(init=False) def __post_init__(self): transformers = sorted(self.transformers.items(), key=lambda t: t[0]) self.transformers = dict(transformers) - self.column_lookup: dict[str, int] = {} + self.index_of: dict[str, int] = {} n_numerical = 0 n_categorical = 0 for i, (name, transformer) in enumerate(self.transformers.items()): - self.column_lookup[name] = i + self.index_of[name] = i if isinstance(transformer, CategoricalToIntegerTransformer): n_categorical += 1 else: @@ -212,6 +209,11 @@ def 
__post_init__(self): self.n_numerical = n_numerical self.n_categorical = n_categorical + @property + def ncols(self) -> int: + return len(self.transformers) + + @property def domains(self) -> dict[str, Domain]: return { name: transformer.domain for name, transformer in self.transformers.items() @@ -222,10 +224,9 @@ def names(self) -> list[str]: def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: if isinstance(hp, str): - return x[:, self.column_lookup[hp]] + return x[:, self.index_of[hp]] - cols = torch.concatenate([torch.arange(*self.column_lookup[h]) for h in hp]) - return x[:, cols] + return x[:, [self.index_of[h] for h in hp]] def encode( self, @@ -238,7 +239,7 @@ def encode( for hp_name, transformer in self.transformers.items(): values = [conf[hp_name] for conf in x] - lookup = self.column_lookup[hp_name] + lookup = self.index_of[hp_name] # Encode directly into buffer transformer.encode( @@ -250,314 +251,85 @@ def encode( return buffer + def pack( + self, x: Sequence[Mapping[str, Any]], *, device: torch.device | None = None + ) -> TensorPack: + return TensorPack(self.encode(x, device=device), self) + def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): - lookup = self.column_lookup[hp_name] + lookup = self.index_of[hp_name] tensor = x[:, lookup] values[hp_name] = transformer.decode(tensor) keys = list(values.keys()) return [dict(zip(keys, vals)) for vals in zip(*values.values())] - -@dataclass -class DataEncoder: - tensors: TensorEncoder | None = None - graphs: GraphEncoder | None = None - device: torch.device = field(default_factory=lambda: torch.device("cpu")) - - n_numerical: int = field(init=False) - n_categorical: int = field(init=False) - n_graphs: int = field(init=False) - - def __post_init__(self): - self.n_numerical = 0 if self.tensors is None else self.tensors.n_numerical - self.n_categorical = 0 if self.tensors is None else self.tensors.n_categorical - self.n_graphs = 0 if self.graphs is None else len(self.graphs.transformers) - - def encode( - self, - x: Sequence[Mapping[str, Any]], - *, - device: torch.device | None = None, - ) -> DataPack: - tensor = self.tensors.encode(x, device=device) if self.tensors else None - graphs = self.graphs.encode(x) if self.graphs else None - return DataPack(encoder=self, tensor=tensor, graphs=graphs) - - @overload - def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: ... - - @overload - def select( - self, x: npt.NDArray[np.object_], hp: str | Sequence[str] - ) -> npt.NDArray[np.object_]: ... 
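# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): a minimal sketch of the Domain
# API documented in neps/search_spaces/domain.py above, which also backs the
# domains exposed by TensorEncoder. The hyperparameter ranges below are made
# up for the example; dtype and rounding details are elided.
import torch
from neps.search_spaces.domain import Domain

lr_dom = Domain.float(1e-4, 1e-1, log=True)   # log-spaced float range
layers_dom = Domain.int(1, 8)                 # integer range
choice_dom = Domain.indices(3)                # categorical encoded as 0, 1, 2

x = torch.tensor([[1e-2, 4.0, 2.0], [1e-3, 1.0, 0.0]], dtype=torch.float64)

# Columns of mixed domains can be normalized in one go via the unit interval:
unit = Domain.translate(
    x, frm=[lr_dom, layers_dom, choice_dom], to=Domain.unit_float()
)
# ...and a single column can be lifted back into its own domain afterwards:
lr_restored = lr_dom.cast(unit[:, 0], frm=Domain.unit_float())
# ---------------------------------------------------------------------------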
- - def select( - self, - x: torch.Tensor | npt.NDArray[np.object_], - hp: str | Sequence[str], - ) -> torch.Tensor | npt.NDArray[np.object_]: - if isinstance(x, torch.Tensor): - assert self.tensors is not None - return self.tensors.select(x, hp) - - assert self.graphs is not None - return self.graphs.select(x, hp) - - def decode_dicts( - self, - x: torch.Tensor - | npt.NDArray[np.object_] - | tuple[torch.Tensor | None, npt.NDArray[np.object_] | None], - ) -> list[dict[str, Any]]: - if isinstance(x, tuple): - tensors, graphs = x - elif isinstance(x, torch.Tensor): - tensors, graphs = x, None - else: - tensors, graphs = None, x - - tensor_values: list[dict[str, Any]] | None = None - if tensors is not None: - assert self.tensors is not None - tensor_values = self.tensors.decode_dicts(tensors) - - graph_values: list[dict[str, Any]] | None = None - if graphs is not None: - assert self.graphs is not None - graph_values = self.graphs.decode_dicts(graphs) - - if tensor_values is not None and graph_values is not None: - assert len(tensor_values) == len(graph_values) - return [{**t, **g} for t, g in zip(tensor_values, graph_values)] - - if tensor_values is not None: - return tensor_values - - assert graph_values is not None - return graph_values - - def indices(self, hp: str | Sequence[str]) -> tuple[int, ...]: - if isinstance(hp, str): - if self.tensors and hp in self.tensors.transformers: - lower, upper = self.tensors.column_lookup[hp] - return tuple(torch.arange(lower, upper).tolist()) - - if self.graphs and hp in self.graphs.transformers: - raise ValueError("Cannot select indices from graphs.") - - tkeys = None if self.tensors is None else self.tensors.transformers.keys() - gkeys = None if self.graphs is None else self.graphs.transformers.keys() - raise KeyError( - f"Unknown hyperparameter {hp}. 
Not in either tensors or graphs" - f"\nTensors: {tkeys}" - f"\nGraphs: {gkeys}" - ) - - return tuple(sorted(chain.from_iterable(self.indices(h) for h in hp))) - @classmethod - def default_encoder( - cls, - space: SearchSpace, - *, - include_fidelities: bool | list[str] = False, - ) -> DataEncoder: - tensor_transformers: dict[str, TensorTransformer] = {} - graph_transformers: dict[str, WLInputTransformer] = {} - - for hp_name, hp in space.categoricals.items(): - tensor_transformers[hp_name] = CategoricalToIntegerTransformer(hp.choices) - - for hp_name, hp in space.numerical.items(): - assert isinstance(hp, (FloatParameter, IntegerParameter)) - tensor_transformers[hp_name] = MinMaxNormalizer(hp.domain) - - for hp_name, hp in space.graphs.items(): - assert isinstance(hp, GraphParameter) - graph_transformers[hp_name] = WLInputTransformer(hp_name) - - if include_fidelities is True: - include_fidelities = list(space.fidelities.keys()) - - if include_fidelities: - for fid_name in include_fidelities: - hp = space.fidelities[fid_name] - assert isinstance(hp, (FloatParameter, IntegerParameter)) - tensor_transformers[fid_name] = MinMaxNormalizer(hp.domain) - - tensor_encoder = ( - TensorEncoder(tensor_transformers) if any(tensor_transformers) else None - ) - graph_encoder = ( - GraphEncoder(graph_transformers) if any(graph_transformers) else None - ) - return DataEncoder(tensors=tensor_encoder, graphs=graph_encoder) - - def has_categoricals(self) -> bool: - return self.tensors is not None and any( - isinstance(t, CategoricalToIntegerTransformer) - for t in self.tensors.transformers.values() - ) - - def has_graphs(self) -> bool: - return self.graphs is not None - - def has_numericals(self) -> bool: - return self.tensors is not None and any( - not isinstance(t, CategoricalToIntegerTransformer) - for t in self.tensors.transformers.values() - ) + def default(cls, parameters: Mapping[str, Parameter]) -> TensorEncoder: + sorted_params = sorted(parameters.items()) + transformers: dict[str, TensorTransformer] = {} + for name, hp in sorted_params: + if isinstance(hp, (FloatParameter, IntegerParameter)): + transformers[name] = MinMaxNormalizer(hp.domain) + else: + assert isinstance(hp, CategoricalParameter) + transformers[name] = CategoricalToIntegerTransformer(hp.choices) - def categorical_product_indices(self) -> list[dict[int, int]]: - cats: dict[int, list[int]] = {} - if self.tensors is None: - return [] + return TensorEncoder(transformers) - for i, (_hp_name, transformer) in enumerate(self.tensors.transformers.items()): - if isinstance(transformer, CategoricalToIntegerTransformer): - cats[i] = list(range(len(transformer.choices))) - if len(cats) == 0: - return [] +@dataclass +class TensorPack: + tensor: torch.Tensor + encoder: TensorEncoder - if len(cats) == 1: - key, values = cats.popitem() - return [{key: v} for v in values] + def __len__(self) -> int: + return len(self.tensor) - return [dict(zip(cats.keys(), vs)) for vs in product(*cats.values())] + @property + def n_numerical(self) -> int: + return self.encoder.n_numerical + @property + def n_categorical(self) -> int: + return self.encoder.n_categorical -@dataclass -class DataPack(Sized): - encoder: DataEncoder - tensor: torch.Tensor | None = None - graphs: npt.NDArray[np.object_] | None = None - _len: int = field(init=False) + @property + def ncols(self) -> int: + return self.encoder.ncols - def __post_init__(self): - if self.tensor is not None and self.graphs is not None: - assert len(self.tensor) == len(self.graphs) - self._len = len(self.tensor) - 
elif self.tensor is not None: - self._len = len(self.tensor) - elif self.graphs is not None: - self._len = len(self.graphs) - else: - raise ValueError("At least one of numerical or graphs must be provided") - - def __len__(self) -> int: - return self._len + @property + def domains(self) -> dict[str, Domain]: + return self.encoder.domains def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.object_]: - if isinstance(hp, str): - if self.encoder.tensors and hp in self.encoder.tensors.transformers: - assert self.tensor is not None - return self.encoder.tensors.select(self.tensor, hp) - - if self.encoder.graphs and hp in self.encoder.graphs.transformers: - assert self.graphs is not None - return self.encoder.graphs.select(self.graphs, hp) - - tkeys = ( - None - if self.encoder.tensors is None - else self.encoder.tensors.transformers.keys() - ) - gkeys = ( - None - if self.encoder.graphs is None - else self.encoder.graphs.transformers.keys() - ) - raise KeyError( - f"Unknown hyperparameter {hp}. Not in either tensors or graphs" - f"\nTensors: {tkeys}" - f"\nGraphs: {gkeys}" - ) + return self.encoder.select(self.tensor, hp) - all_in_tensors = False - all_in_graphs = False - tkeys = None - gkeys = None - if self.encoder.tensors: - all_in_tensors = all(h in self.encoder.tensors.transformers for h in hp) - - if self.encoder.graphs: - all_in_graphs = all(h in self.encoder.graphs.transformers for h in hp) - gkeys = self.encoder.graphs.transformers.keys() - - if not all_in_tensors and not all_in_graphs: - raise ValueError( - "Cannot select from both tensors and graphs!" - f"Got keys: {hp}" - f"\nTensors: {tkeys}" - f"\nGraphs: {gkeys}" - ) + def names(self) -> list[str]: + return self.encoder.names() - if all_in_tensors: - assert self.tensor is not None - assert self.encoder.tensors is not None - return self.encoder.tensors.select(self.tensor, hp) - - assert self.graphs is not None - assert self.encoder.graphs is not None - return self.encoder.graphs.select(self.graphs, hp) - - def decode(self, space: SearchSpace) -> list[SearchSpace]: - return [ - space.from_dict(d) - for d in self.encoder.decode_dicts((self.tensor, self.graphs)) - ] - - def split(self, index: int) -> tuple[DataPack, DataPack]: - if self.tensor is not None: - numerical_left = self.tensor[:index] - numerical_right = self.tensor[index:] - else: - numerical_left = None - numerical_right = None + def to_dicts(self) -> list[dict[str, Any]]: + return self.encoder.decode_dicts(self.tensor) - if self.graphs is not None: - graphs_left = self.graphs[:index] - graphs_right = self.graphs[:index] - else: - graphs_left = None - graphs_right = None - - return ( - DataPack( - self.encoder, - tensor=numerical_left, - graphs=graphs_left, - ), - DataPack( - self.encoder, - tensor=numerical_right, - graphs=graphs_right, - ), - ) + def split(self, index: int) -> tuple[TensorPack, TensorPack]: + left = TensorPack(self.encoder, tensor=self.tensor[:index]) + right = TensorPack(self.encoder, tensor=self.tensor[index:]) + return left, right - def join(self, *other: DataPack) -> DataPack: + def join(self, *other: TensorPack) -> TensorPack: assert all(o.encoder == self.encoder for o in other) - if self.tensor is not None: - other_numericals = [] - for o in other: - assert o.tensor is not None - other_numericals.append(o.tensor) - numerical = torch.cat([self.tensor, *other_numericals], dim=0) - else: - numerical = None - - if self.graphs is not None: - other_graphs = [] - for o in other: - assert o.graphs is not None - 
other_graphs.append(o.graphs) - graphs = np.concatenate([self.graphs, *other_graphs], axis=0) - else: - graphs = None + numerical = torch.cat([self.tensor, *[o.tensor for o in other]], dim=0) + return TensorPack(self.encoder, tensor=numerical) - return DataPack(self.encoder, tensor=numerical, graphs=graphs) + @classmethod + def default_encoding( + cls, + x: Sequence[Mapping[str, Any]], + space: SearchSpace, + ) -> TensorPack: + default_encoder = TensorEncoder.default(space) + tensor = default_encoder.encode(x) + return TensorPack(default_encoder, tensor) diff --git a/neps/search_spaces/search_space.py b/neps/search_spaces/search_space.py index 40ecd0cf..671728f0 100644 --- a/neps/search_spaces/search_space.py +++ b/neps/search_spaces/search_space.py @@ -235,10 +235,10 @@ def __init__(self, **hyperparameters: Parameter): self.categoricals: Mapping[str, CategoricalParameter] = { k: hp for k, hp in _hyperparameters if isinstance(hp, CategoricalParameter) } - self.numerical: Mapping[str, NumericalParameter] = { + self.numerical: Mapping[str, IntegerParameter | FloatParameter] = { k: hp for k, hp in _hyperparameters - if isinstance(hp, NumericalParameter) and not hp.is_fidelity + if isinstance(hp, IntegerParameter | FloatParameter) and not hp.is_fidelity } self.graphs: Mapping[str, GraphParameter] = { k: hp for k, hp in _hyperparameters if isinstance(hp, GraphParameter) @@ -247,8 +247,9 @@ def __init__(self, **hyperparameters: Parameter): k: hp.value for k, hp in _hyperparameters if isinstance(hp, ConstantParameter) } # NOTE: For future of multiple fidelities - self.fidelities: Mapping[str, NumericalParameter] = {} - if _fidelity_param is not None and _fidelity_name is None: + self.fidelities: Mapping[str, IntegerParameter | FloatParameter] = {} + if _fidelity_param is not None and _fidelity_name is not None: + assert isinstance(_fidelity_param, IntegerParameter | FloatParameter) self.fidelities = {_fidelity_name: _fidelity_param} def set_custom_grid_space( From 1cdc3074636024c5ad693b66a99486cfb6b6a013 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 13:55:17 +0200 Subject: [PATCH 18/63] refactor: Weighted Sampler --- .../bayesian_optimization/optimizer.py | 7 +- neps/sampling/__init__.py | 4 + neps/sampling/priors.py | 85 +++++++---------- neps/sampling/samplers.py | 95 ++++++++++++++++++- 4 files changed, 133 insertions(+), 58 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 83a65ce2..3b7d7f4b 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -17,8 +17,7 @@ default_single_obj_gp, optimize_acq, ) -from neps.optimizers.initial_design import PriorInitialDesign, Sobol -from neps.priors import Prior +from neps.sampling import Prior, Sampler from neps.search_spaces.domain import Domain from neps.search_spaces.encoding import TensorEncoder, TensorPack from neps.search_spaces.hyperparameters.categorical import CategoricalParameter @@ -216,9 +215,9 @@ def ask( self.initial_design_.append(config.hp_values()) if self.prior: - sampler = PriorInitialDesign(prior=self.prior, seed=0) + sampler = self.prior else: - sampler = Sobol(ndim=self.encoder.ncols, seed=0, scramble=True) + sampler = Sampler.sobol(ndim=self.encoder.ncols, seed=0, scramble=True) n_samples = self.n_initial_design - len(self.initial_design_) diff --git a/neps/sampling/__init__.py b/neps/sampling/__init__.py index e69de29b..a7f4f36f 100644 --- 
a/neps/sampling/__init__.py +++ b/neps/sampling/__init__.py @@ -0,0 +1,4 @@ +from neps.sampling.priors import CenteredPrior, Prior, UniformPrior, WeightedPrior +from neps.sampling.samplers import Sampler, Sobol + +__all__ = ["Sobol", "Sampler", "Prior", "UniformPrior", "CenteredPrior", "WeightedPrior"] diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 8ccc76e9..f7976bf4 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -10,14 +10,13 @@ from __future__ import annotations from dataclasses import dataclass, field -from functools import reduce -from typing import TYPE_CHECKING, Any, Container, Mapping, Protocol +from typing import TYPE_CHECKING, Any, Container, Iterable, Mapping, Protocol, Sequence from typing_extensions import override import torch from neps.distributions import DistributionOverDomain, TruncatedNormal -from neps.sampling.samplers import Sampler +from neps.sampling.samplers import Sampler, WeightedSampler from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain if TYPE_CHECKING: @@ -228,6 +227,18 @@ def make_centered( return CenteredPrior(distributions=distributions) + @classmethod + def weighted(cls, priors: Iterable[Prior], weights: torch.Tensor) -> WeightedPrior: + """Create a weighted prior for a given list of priors. + + Args: + priors: The list of priors to sample from. + weights: The weights for each prior. Will be normalized to sum to 1. + Please specify the device of your weights if required. + """ + priors = list(priors) + return WeightedPrior(priors=list(priors), weights=weights) + @dataclass class CenteredPrior(Prior): @@ -353,21 +364,30 @@ def sample( class WeightedPrior(Prior): """A prior consisting of multiple priors with weights.""" - priors: list[Prior] + priors: Sequence[Prior] + """The list of priors to sample from.""" + weights: torch.Tensor - probabilities: torch.Tensor = field(init=False, repr=False) + """The weights for each prior.""" + + _weighted_sampler: WeightedSampler = field(init=False, repr=False) def __post_init__(self): - if len(self.priors) < 2: - raise ValueError(f"At least two priors must be given. Got {len(self.priors)}") + from neps.sampling.samplers import WeightedSampler - if self.weights.ndim != 1: - raise ValueError("Weights must be a 1D tensor.") + self._weighted_sampler = WeightedSampler( + samplers=self.priors, weights=self.weights + ) - if len(self.priors) != len(self.weights): - raise ValueError("The number of priors and weights must be the same.") + @property + def probabilities(self) -> torch.Tensor: + """The probabilities for each sampler. 
Normalized weights.""" + return self._weighted_sampler.probabilities - self.probabilities = self.weights / self.weights.sum() + @property + @override + def ncols(self) -> int: + return self._weighted_sampler.ncols @override def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: @@ -391,43 +411,4 @@ def sample( seed: int | None = None, device: torch.device | None = None, ) -> torch.Tensor: - if seed is not None: - raise NotImplementedError("Seeding is not yet implemented.") - - # Calculate the total number of samples required - if isinstance(n, int): - total_samples = n - output_shape = (n, self.ncols) - else: - total_samples = reduce(lambda x, y: x * y, n) - output_shape = (*n, self.ncols) - - # Randomly select which prior to sample from for each of the total_samples - chosen_priors = torch.empty((total_samples,), device=device, dtype=torch.int64) - chosen_priors = torch.multinomial( - self.probabilities, - total_samples, - replacement=True, - out=chosen_priors, - ) - - # Create an empty tensor to hold all samples - output_samples = torch.empty( - (total_samples, self.ncols), device=device, dtype=torch.float64 - ) - - # Loop through each prior and its associated indices - for i, prior in enumerate(self.priors): - # Find indices where the chosen prior is i - _i = torch.tensor(i, dtype=torch.int64, device=device) - indices = torch.where(chosen_priors == _i)[0] - - if len(indices) > 0: - # Sample from the prior for the required number of indices - samples_from_prior = prior.sample(len(indices), to=to, device=device) - output_samples[indices] = samples_from_prior - - # Reshape to the output shape including ncols dimension - output_samples = output_samples.view(output_shape) - - return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) + return self._weighted_sampler.sample(n, to=to, seed=seed, device=device) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index f0298d84..6802f6d7 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -6,12 +6,13 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import reduce -from typing import Protocol +from typing import Protocol, Sequence from typing_extensions import override import torch +from more_itertools import all_equal from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain @@ -118,3 +119,93 @@ def sample( x = x.view(*n, self.ncols) return Domain.translate(x, frm=UNIT_FLOAT_DOMAIN, to=to) + + +@dataclass +class WeightedSampler(Sampler): + """A sampler that samples from a weighted combination of samplers.""" + + samplers: Sequence[Sampler] + """The samplers to sample from.""" + + weights: torch.Tensor + """The weights for each sampler.""" + + probabilities: torch.Tensor = field(init=False, repr=False) + """The probabilities for each sampler. Normalized weights.""" + + def __post_init__(self): + if len(self.samplers) < 2: + raise ValueError( + f"At least two samplers must be given. Got {len(self.samplers)}" + ) + + if self.weights.ndim != 1: + raise ValueError("Weights must be a 1D tensor.") + + if len(self.samplers) != len(self.weights): + raise ValueError("The number of samplers and weights must be the same.") + + ncols = [sampler.ncols for sampler in self.samplers] + if not all_equal(ncols): + raise ValueError( + "All samplers must have the same number of columns." f" Got {ncols}." 
+ ) + + self._ncols = ncols[0] + self.probabilities = self.weights / self.weights.sum() + + @property + @override + def ncols(self) -> int: + return self._ncols + + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: int | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + if seed is not None: + raise NotImplementedError("Seeding is not yet implemented.") + + # Calculate the total number of samples required + if isinstance(n, int): + total_samples = n + output_shape = (n, self.ncols) + else: + total_samples = reduce(lambda x, y: x * y, n) + output_shape = (*n, self.ncols) + + # Randomly select which prior to sample from for each of the total_samples + chosen_priors = torch.empty((total_samples,), device=device, dtype=torch.int64) + chosen_priors = torch.multinomial( + self.probabilities, + total_samples, + replacement=True, + out=chosen_priors, + ) + + # Create an empty tensor to hold all samples + output_samples = torch.empty( + (total_samples, self.ncols), device=device, dtype=torch.float64 + ) + + # Loop through each prior and its associated indices + for i, prior in enumerate(self.samplers): + # Find indices where the chosen prior is i + _i = torch.tensor(i, dtype=torch.int64, device=device) + indices = torch.where(chosen_priors == _i)[0] + + if len(indices) > 0: + # Sample from the prior for the required number of indices + samples_from_prior = prior.sample(len(indices), to=to, device=device) + output_samples[indices] = samples_from_prior + + # Reshape to the output shape including ncols dimension + output_samples = output_samples.view(output_shape) + + return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) From dd7200887038be419febc2b8cdda9951404f1fad Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 14:17:20 +0200 Subject: [PATCH 19/63] refactor: Simplify BO some more --- .../bayesian_optimization/optimizer.py | 231 ++++++++---------- neps/search_spaces/encoding.py | 18 +- 2 files changed, 108 insertions(+), 141 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 3b7d7f4b..700ce673 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -18,7 +18,6 @@ optimize_acq, ) from neps.sampling import Prior, Sampler -from neps.search_spaces.domain import Domain from neps.search_spaces.encoding import TensorEncoder, TensorPack from neps.search_spaces.hyperparameters.categorical import CategoricalParameter @@ -28,13 +27,16 @@ from neps.search_spaces import ( SearchSpace, ) + from neps.search_spaces.domain import Domain from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.state import BudgetInfo, Trial -def pibo_acq_beta_and_n( - n_sampled_already: int, ndims: int, budget_info: BudgetInfo +def _pibo_acq_beta_and_n( + n_sampled_already: int, + ndims: int, + budget_info: BudgetInfo, ) -> tuple[float, float]: if budget_info.max_evaluations is not None: # From the PIBO paper (Section 4.1) @@ -56,10 +58,47 @@ def pibo_acq_beta_and_n( return n_sampled_already, beta +# TODO: This needs to be moved to the search space class, however +# to not break the current prior based APIs used elsewhere, we can +# just manually create this here. +# We use confidence here where `0` means no confidence and `1` means +# absolute confidence. 
This gets translated in to std's and weights +# accordingly in a `CenteredPrior` +def _make_prior( + parameters: dict[str, CategoricalParameter | FloatParameter | IntegerParameter], +) -> Prior: + _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} + + domains: dict[str, Domain] = {} + centers: dict[str, tuple[Any, float]] = {} + categoricals: set[str] = set() + for name, hp in parameters.items(): + domains[name] = hp.domain # type: ignore + + if isinstance(hp, CategoricalParameter): + categoricals.add(name) + + if hp.default is None: + continue + + confidence_str = hp.default_confidence_choice + confidence_score = _mapping[confidence_str] + center = hp._default_index if isinstance(hp, CategoricalParameter) else hp.default + + centers[name] = (center, confidence_score) + + # Uses truncnorms for numerical and weighted choices categoricals + return Prior.make_centered( + domains=domains, + centers=centers, + categoricals=categoricals, + ) + + class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" - def __init__( + def __init__( # noqa: D417 self, pipeline_space: SearchSpace, *, @@ -97,83 +136,30 @@ def __init__( ValueError: if no kernel is provided """ if any(pipeline_space.graphs): - raise ValueError( - "BayesianOptimization currently only supports flat search spaces" - ) + raise NotImplementedError("Only supports flat search spaces for now!") + super().__init__(pipeline_space=pipeline_space) if initial_design_size is None: N = len(pipeline_space.hyperparameters) initial_design_size = int(max(1, math.log(N) ** 2)) elif initial_design_size < 1: - raise ValueError( - "BayesianOptimization needs initial_design_size to be at least 1" - ) - - super().__init__(pipeline_space=pipeline_space) + raise ValueError("Initial_design_size to be at least 1") - if encoder is None: - parameters: dict[ - str, - CategoricalParameter | FloatParameter | IntegerParameter, - ] = { - **pipeline_space.numerical, - **pipeline_space.categoricals, - } - if treat_fidelity_as_hyperparameters: - parameters.update(pipeline_space.fidelities) - - self.encoder = TensorEncoder.default(parameters) - else: - self.encoder = encoder - - # TODO: This needs to be moved to the search space class, however - # to not break the current prior based APIs used elsewhere, we can - # just manually create this here. - # We use confidence here where `0` means no confidence and `1` means - # absolute confidence. 
This gets translated in to std's and weights - # accordingly in a `CenteredPrior` - self.prior: Prior | None = None - if use_priors: - _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} - - domains: dict[str, Domain] = {} - centers: dict[str, tuple[Any, float]] = {} - categoricals: set[str] = set() - for name, hp in parameters.items(): - domains[name] = hp.domain # type: ignore - - if isinstance(hp, CategoricalParameter): - categoricals.add(name) - - if hp.default is None: - continue - - confidence_str = hp.default_confidence_choice - confidence_score = _mapping[confidence_str] - center = ( - hp._default_index - if isinstance(hp, CategoricalParameter) - else hp.default - ) - - centers[name] = (center, confidence_score) - - # Uses truncnorms for numerical and weighted choices categoricals - self.prior = Prior.make_centered( - domains=domains, - centers=centers, - categoricals=categoricals, - ) - else: - self.prior = None + params: dict[str, CategoricalParameter | FloatParameter | IntegerParameter] = { + **pipeline_space.numerical, + **pipeline_space.categoricals, + } + if treat_fidelity_as_hyperparameters: + params.update(pipeline_space.fidelities) + self.encoder = TensorEncoder.default(params) if encoder is None else encoder + self.prior = _make_prior(params) if use_priors is True else None self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - if surrogate_model == "gp": - self._get_fitted_model = default_single_obj_gp - else: - self._get_fitted_model = surrogate_model + self._get_fitted_model = ( + default_single_obj_gp if surrogate_model == "gp" else surrogate_model + ) self.initial_design_: list[dict[str, Any]] | None = None @@ -182,25 +168,12 @@ def ask( trials: Mapping[str, Trial], budget_info: BudgetInfo, optimizer_state: dict[str, Any], + seed: int | None = None, ) -> tuple[SampledConfig, dict[str, Any]]: - # TODO: Lift this into runtime, let the - # optimizer advertise the encoding wants... - completed = [ - t - for t in trials.values() - if t.report is not None and t.report.loss is not None - ] - x_configs = [t.config for t in completed] - y = torch.as_tensor( - [t.report.loss for t in completed], - dtype=torch.float64, - device=self.device, - ) # type: ignore - - if y.ndim == 1: - y = y.unsqueeze(1) - - pending = [t.config for t in trials.values() if t.state.pending()] + if seed is not None: + raise NotImplementedError( + "Seed is not yet implemented for BayesianOptimization" + ) space = self.pipeline_space config_id = str(len(trials) + 1) @@ -209,51 +182,55 @@ def ask( if self.initial_design_ is None: self.initial_design_ = [] - # Add the default configuration first (maybe) if self.sample_default_first: config = space.sample_default_configuration() self.initial_design_.append(config.hp_values()) - if self.prior: - sampler = self.prior - else: - sampler = Sampler.sobol(ndim=self.encoder.ncols, seed=0, scramble=True) - + sampler = ( + self.prior if self.prior else Sampler.sobol(self.encoder.ncols, seed=seed) + ) n_samples = self.n_initial_design - len(self.initial_design_) - # We add a buffer of 2x the samples to help ensure - # we get get enough after removing duplicates. 
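# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): after this refactor a Prior and
# a Sobol sampler share the same Sampler interface, which is why the initial
# design can draw from either one interchangeably. A rough sketch with made-up
# domains, assuming Sampler.sobol(...) behaves as used elsewhere in this patch:
from neps.sampling import Sampler
from neps.search_spaces.domain import Domain

domains = [Domain.float(1e-4, 1e-1, log=True), Domain.int(1, 8)]
sobol = Sampler.sobol(ndim=len(domains), seed=0)
x = sobol.sample(16, to=domains, seed=None)  # (16, 2), already in the target domains
# A Prior built via Prior.make_centered(...) could be passed in the same way,
# which is what ask() does when user priors are configured.
# ---------------------------------------------------------------------------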
- x = sampler.sample(n_samples * 2) - x = Domain.translate( - x, - to=self.encoder.domains.values(), - frm=sampler.sample_domain, + x = sampler.sample( + n_samples * 2, + to=self.encoder.domains, + seed=seed, + device=self.device, ) uniq_x = torch.unique(x, dim=0) - configs = self.encoder.decode_dicts(uniq_x[:n_samples]) + configs = self.encoder.unpack(uniq_x[:n_samples]) self.initial_design_.extend(configs) - # If we havn't passed the intial design phase, just return - # the next one. + # If we havn't passed the intial design phase if len(trials) < len(self.initial_design_): - config = self.initial_design_[len(trials)] - return ( - SampledConfig(id=config_id, config=config, previous_config_id=None), - optimizer_state, - ) + config = self.initial_design_[len(trials) - 1] + sample = SampledConfig(id=config_id, config=config, previous_config_id=None) + return sample, optimizer_state # Now we actually do the BO loop, start by encoding the data + # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... + x_configs: list[dict[str, Any]] = [] + ys: list[float] = [] + pending: list[dict[str, Any]] = [] + for trial in trials.values(): + if trial.state.pending(): + pending.append(trial.config) + else: + assert trial.report is not None + assert trial.report.loss is not None + x_configs.append(trial.config) + ys.append(trial.report.loss) + x = self.encoder.pack(x_configs, device=self.device) - x_pending = None - if any(pending): - x_pending = self.encoder.pack(pending, device=self.device) + x_pending = ( + None if len(pending) == 0 else self.encoder.pack(pending, device=self.device) + ) + y = torch.tensor(ys, dtype=torch.float64, device=self.device) + if y.ndim == 1: + y = y.unsqueeze(1) - # Get our fitted model model = self._get_fitted_model(x, y) - # Build our acquisition function. This takes care of pending - # configs through x_pending. - # TODO: We should evaluate whether LogNoisyEI is better than LogEI acq = qLogExpectedImprovement( model, best_f=y.min(), @@ -263,24 +240,14 @@ def ask( # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 objective=LinearMCObjective(weights=torch.tensor([-1.0])), ) - - # If we have a prior, then we use it with PiBO if self.prior: - n, beta = pibo_acq_beta_and_n( - n_sampled_already=len(trials), - ndims=self.encoder.ncols, - budget_info=budget_info, - ) + n, beta = _pibo_acq_beta_and_n(len(trials), self.encoder.ncols, budget_info) acq = PiboAcquisition(acq, prior=self.prior, n=n, beta=beta) - # Optimize it candidates, _eis = optimize_acq(acq_fn=acq, encoder=self.encoder, acq_options={}) - # Take the first (and only?) candidate - assert len(candidates) == 1 - config = self.encoder.decode_dicts(candidates)[0] + assert len(candidates) == 1, "Expected only one candidate!" 
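# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): _pibo_acq_beta_and_n above sets
# beta = max_evaluations / 10 (following the PiBO paper, Section 4.1) and n to
# the number of configurations sampled so far. Assuming PiboAcquisition applies
# the PiBO rule acq(x) * prior(x) ** (beta / n) (an assumption; its
# implementation is not shown in this diff), the prior's influence decays as
# data accumulates:
beta = 100 / 10                    # e.g. max_evaluations = 100
for n_seen in (1, 5, 10, 50, 100):
    print(n_seen, beta / n_seen)   # 10.0, 2.0, 1.0, 0.2, 0.1
# ---------------------------------------------------------------------------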
+ config = self.encoder.unpack(candidates)[0] - return ( - SampledConfig(id=config_id, config=config, previous_config_id=None), - optimizer_state, - ) + sample = SampledConfig(id=config_id, config=config, previous_config_id=None) + return sample, optimizer_state diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index ad3b00ae..71555fef 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -190,22 +190,24 @@ def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: class TensorEncoder: transformers: dict[str, TensorTransformer] index_of: dict[str, int] = field(init=False) + domain_of: dict[str, Domain] = field(init=False) n_numerical: int = field(init=False) n_categorical: int = field(init=False) def __post_init__(self): transformers = sorted(self.transformers.items(), key=lambda t: t[0]) self.transformers = dict(transformers) - self.index_of: dict[str, int] = {} + n_numerical = 0 n_categorical = 0 - for i, (name, transformer) in enumerate(self.transformers.items()): - self.index_of[name] = i + for _, transformer in transformers: if isinstance(transformer, CategoricalToIntegerTransformer): n_categorical += 1 else: n_numerical += 1 + self.index_of = {name: i for i, name in enumerate(self.transformers.keys())} + self.domain_of = {name: t.domain for name, t in self.transformers.items()} self.n_numerical = n_numerical self.n_categorical = n_categorical @@ -214,10 +216,8 @@ def ncols(self) -> int: return len(self.transformers) @property - def domains(self) -> dict[str, Domain]: - return { - name: transformer.domain for name, transformer in self.transformers.items() - } + def domains(self) -> list[Domain]: + return list(self.domain_of.values()) def names(self) -> list[str]: return list(self.transformers.keys()) @@ -256,7 +256,7 @@ def pack( ) -> TensorPack: return TensorPack(self.encode(x, device=device), self) - def decode_dicts(self, x: torch.Tensor) -> list[dict[str, Any]]: + def unpack(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): lookup = self.index_of[hp_name] @@ -311,7 +311,7 @@ def names(self) -> list[str]: return self.encoder.names() def to_dicts(self) -> list[dict[str, Any]]: - return self.encoder.decode_dicts(self.tensor) + return self.encoder.unpack(self.tensor) def split(self, index: int) -> tuple[TensorPack, TensorPack]: left = TensorPack(self.encoder, tensor=self.tensor[:index]) From 8658d70f088b1b86be965e05b8b0bc3b2204889b Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 14:24:06 +0200 Subject: [PATCH 20/63] fix: Need at least 2 points for intial design --- neps/optimizers/bayesian_optimization/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 700ce673..9499926c 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -141,7 +141,7 @@ def __init__( # noqa: D417 if initial_design_size is None: N = len(pipeline_space.hyperparameters) - initial_design_size = int(max(1, math.log(N) ** 2)) + initial_design_size = int(max(2, math.log(N) ** 2)) elif initial_design_size < 1: raise ValueError("Initial_design_size to be at least 1") From b135c748062df903d073bc2cdde320b9a4236da6 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 14:25:33 +0200 Subject: [PATCH 21/63] fix: Ensure we do the last 
intial design --- neps/optimizers/bayesian_optimization/optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 9499926c..139de839 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -202,7 +202,7 @@ def ask( self.initial_design_.extend(configs) # If we havn't passed the intial design phase - if len(trials) < len(self.initial_design_): + if len(trials) <= len(self.initial_design_): config = self.initial_design_[len(trials) - 1] sample = SampledConfig(id=config_id, config=config, previous_config_id=None) return sample, optimizer_state From 6fd4a2914b84cbf77ee5a08cb90d1ecf224c061a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 28 Aug 2024 14:27:46 +0200 Subject: [PATCH 22/63] doc: Add todo note on no reported loss --- neps/optimizers/bayesian_optimization/optimizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 139de839..355fc39a 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -217,6 +217,7 @@ def ask( pending.append(trial.config) else: assert trial.report is not None + # TODO: Figure out what to do if there's no reported loss value. assert trial.report.loss is not None x_configs.append(trial.config) ys.append(trial.report.loss) From bb0eb38dac025145ff1b569f41771733c306a6fe Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 11:13:08 +0200 Subject: [PATCH 23/63] fix: Add in the GP optimization --- neps/optimizers/bayesian_optimization/optimizer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 355fc39a..d9f322aa 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -8,6 +8,7 @@ LinearMCObjective, qLogExpectedImprovement, ) +from gpytorch import ExactMarginalLogLikelihood from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( @@ -157,7 +158,7 @@ def __init__( # noqa: D417 self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - self._get_fitted_model = ( + self._get_model = ( default_single_obj_gp if surrogate_model == "gp" else surrogate_model ) @@ -230,7 +231,12 @@ def ask( if y.ndim == 1: y = y.unsqueeze(1) - model = self._get_fitted_model(x, y) + model = self._get_model(x, y) + + from botorch.fit import fit_gpytorch_mll + + mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) + _fit_mll = fit_gpytorch_mll(mll) acq = qLogExpectedImprovement( model, From 41deea6f6cac8602f95fda3b732ea80038a39726 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 11:14:01 +0200 Subject: [PATCH 24/63] fix: Memory efficient log_prior for CenteredPrior --- neps/sampling/priors.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index f7976bf4..2deda010 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -284,14 +284,15 @@ def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> 
torch.Tens # Calculate the log probabilities of the sample domain tensors under their # respective distributions. - log_probs = torch.cat( - [ - dist.distribution.log_prob(sample_domain_tensor[:, i]) - for i, dist in enumerate(self.distributions) - ], - dim=-1, - ) - return torch.sum(log_probs, dim=-1) + itr = enumerate(self.distributions) + first_i, first_dist = next(itr) + + log_probs = first_dist.distribution.log_prob(sample_domain_tensor[..., first_i]) + for i, dist in itr: + log_probs = log_probs + dist.distribution.log_prob( + sample_domain_tensor[..., i] + ) + return log_probs @override def sample( @@ -398,7 +399,7 @@ def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tens weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) for prob, prior in itr: - weighted_probs += prob * prior.log_prob(x, frm=frm) + weighted_probs = weighted_probs + prob * prior.log_prob(x, frm=frm) return weighted_probs From 205f503b4b9fb4603b7b3a7975a684716f733da9 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 18:35:55 +0200 Subject: [PATCH 25/63] feat: pibo and cost cooling --- .../acquisition_functions/_ehvi.py | 213 ------ .../acquisition_functions/base_acquisition.py | 17 - .../acquisition_functions/cost_cooling.py | 99 +-- .../acquisition_functions/ei.py | 120 --- .../acquisition_functions/mf_ei.py | 35 +- .../acquisition_functions/pibo.py | 63 ++ .../acquisition_functions/prior_weighted.py | 111 --- .../acquisition_functions/ucb.py | 60 -- .../weighted_acquisition.py | 147 ++++ .../bayesian_optimization/mf_tpe.py | 719 ------------------ .../bayesian_optimization/models/gp.py | 23 +- .../bayesian_optimization/optimizer.py | 187 ++++- .../optimizers/bayesian_optimization/sobol.py | 0 neps/optimizers/initial_design.py | 34 - neps/{ => sampling}/distributions.py | 2 +- neps/sampling/priors.py | 12 +- neps/sampling/samplers.py | 39 +- neps/search_spaces/distributions/__init__.py | 16 - .../distributions/distribution.py | 21 - neps/search_spaces/distributions/truncnorm.py | 112 --- .../distributions/uniform_float.py | 47 -- .../distributions/uniform_int.py | 46 -- .../distributions/weighted_ints.py | 91 --- neps/search_spaces/samplers/__init__.py | 9 - neps/search_spaces/samplers/model.py | 186 ----- neps/search_spaces/samplers/prior.py | 110 --- neps/search_spaces/samplers/sampler.py | 22 - neps/search_spaces/samplers/uniform.py | 79 -- .../samplers/weighted_sampler.py | 51 -- 29 files changed, 476 insertions(+), 2195 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/ei.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py delete mode 100644 neps/optimizers/bayesian_optimization/mf_tpe.py delete mode 100644 neps/optimizers/bayesian_optimization/sobol.py delete mode 100644 neps/optimizers/initial_design.py rename neps/{ => sampling}/distributions.py (99%) delete mode 100644 neps/search_spaces/distributions/__init__.py delete mode 100644 neps/search_spaces/distributions/distribution.py delete mode 100644 
neps/search_spaces/distributions/truncnorm.py delete mode 100644 neps/search_spaces/distributions/uniform_float.py delete mode 100644 neps/search_spaces/distributions/uniform_int.py delete mode 100644 neps/search_spaces/distributions/weighted_ints.py delete mode 100644 neps/search_spaces/samplers/__init__.py delete mode 100644 neps/search_spaces/samplers/model.py delete mode 100644 neps/search_spaces/samplers/prior.py delete mode 100644 neps/search_spaces/samplers/sampler.py delete mode 100644 neps/search_spaces/samplers/uniform.py delete mode 100644 neps/search_spaces/samplers/weighted_sampler.py diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py deleted file mode 100644 index 8722c545..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py +++ /dev/null @@ -1,213 +0,0 @@ -# from abc import ABC, abstractmethod -from itertools import product - -import torch -from torch import Tensor -from torch.distributions import Normal -from torch.nn import Module - -# class MultiObjectiveBaseAcqusition(ABC): -# def __init__(self, surrogate_models: dict): -# self.surrogate_models = surrogate_models -# -# def propose_location(self, *args): -# """Propose new locations for subsequent sampling -# This method should be overriden by respective acquisition function implementations.""" -# raise NotImplementedError -# -# def optimize(self): -# """This is the method that user should call for the Bayesian optimisation main loop.""" -# raise NotImplementedError -# -# @abstractmethod -# def eval(self, x, asscalar: bool = False): -# """Evaluate the acquisition function at point x2. This should be overridden by respective acquisition -# function implementations""" -# raise NotImplementedError -# -# def __call__(self, *args, **kwargs): -# return self.eval(*args, **kwargs) -# -# def reset_surrogate_model(self, surrogate_models: dict): -# for objective, surrogate_model in surrogate_models.items(): -# self.surrogate_models[objective] = surrogate_model -# - - -class ExpectedHypervolumeImprovement(Module): # , MultiObjectiveBaseAcqusition): - def __init__( - self, - model, - ref_point, - partitioning, - ) -> None: - r"""Expected Hypervolume Improvement supporting m>=2 outcomes. - - Implementation from BOtorch, adapted from - https://github.com/pytorch/botorch/blob/353f37649fa8d90d881e8ea20c11986b15723ef1/botorch/acquisition/multi_objective/analytic.py#L78 - - This implements the computes EHVI using the algorithm from [Yang2019]_, but - additionally computes gradients via auto-differentiation as proposed by - [Daulton2020qehvi]_. - - Note: this is currently inefficient in two ways due to the binary partitioning - algorithm that we use for the box decomposition: - - - We have more boxes in our decomposition - - If we used a box decomposition that used `inf` as the upper bound for - the last dimension *in all hypercells*, then we could reduce the number - of terms we need to compute from 2^m to 2^(m-1). [Yang2019]_ do this - by using DKLV17 and LKF17 for the box decomposition. - - TODO: Use DKLV17 and LKF17 for the box decomposition as in [Yang2019]_ for - greater efficiency. - - TODO: Add support for outcome constraints. - - Example: - >>> model = SingleTaskGP(train_X, train_Y) - >>> ref_point = [0.0, 0.0] - >>> EHVI = ExpectedHypervolumeImprovement(model, ref_point, partitioning) - >>> ehvi = EHVI(test_X) - - Args: - model: A fitted model. 
- ref_point: A list with `m` elements representing the reference point (in the - outcome space) w.r.t. to which compute the hypervolume. This is a - reference point for the objective values (i.e. after applying - `objective` to the samples). - partitioning: A `NondominatedPartitioning` module that provides the non- - dominated front and a partitioning of the non-dominated space in hyper- - rectangles. - objective: An `AnalyticMultiOutputObjective`. - """ - # TODO: we could refactor this __init__ logic into a - # HypervolumeAcquisitionFunction Mixin - if len(ref_point) != partitioning.num_outcomes: - raise ValueError( - "The length of the reference point must match the number of outcomes. " - f"Got ref_point with {len(ref_point)} elements, but expected " - f"{partitioning.num_outcomes}." - ) - ref_point = torch.tensor( - ref_point, - dtype=partitioning.pareto_Y.dtype, - device=partitioning.pareto_Y.device, - ) - better_than_ref = (partitioning.pareto_Y > ref_point).all(dim=1) - if not better_than_ref.any() and partitioning.pareto_Y.shape[0] > 0: - raise ValueError( - "At least one pareto point must be better than the reference point." - ) - super().__init__() - self.model = model - self.register_buffer("ref_point", ref_point) - self.partitioning = partitioning - cell_bounds = self.partitioning.get_hypercell_bounds() - self.register_buffer("cell_lower_bounds", cell_bounds[0]) - self.register_buffer("cell_upper_bounds", cell_bounds[1]) - # create indexing tensor of shape `2^m x m` - self._cross_product_indices = torch.tensor( - list(product(*[[0, 1] for _ in range(ref_point.shape[0])])), - dtype=torch.long, - device=ref_point.device, - ) - self.normal = Normal(0, 1) - - def psi(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> None: - r"""Compute Psi function. - - For each cell i and outcome k: - - Psi(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = ( - sigma_k * PDF((upper_{i,k} - mu_k) / sigma_k) + ( - mu_k - lower_{i,k} - ) * (1 - CDF(upper_{i,k} - mu_k) / sigma_k) - ) - - See Equation 19 in [Yang2019]_ for more details. - - Args: - lower: A `num_cells x m`-dim tensor of lower cell bounds - upper: A `num_cells x m`-dim tensor of upper cell bounds - mu: A `batch_shape x 1 x m`-dim tensor of means - sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped). - - Returns: - A `batch_shape x num_cells x m`-dim tensor of values. - """ - u = (upper - mu) / sigma - return sigma * self.normal.log_prob(u).exp() + (mu - lower) * ( - 1 - self.normal.cdf(u) - ) - - def nu(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> None: - r"""Compute Nu function. - - For each cell i and outcome k: - - nu(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = ( - upper_{i,k} - lower_{i,k} - ) * (1 - CDF((upper_{i,k} - mu_k) / sigma_k)) - - See Equation 25 in [Yang2019]_ for more details. - - Args: - lower: A `num_cells x m`-dim tensor of lower cell bounds - upper: A `num_cells x m`-dim tensor of upper cell bounds - mu: A `batch_shape x 1 x m`-dim tensor of means - sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped). - - Returns: - A `batch_shape x num_cells x m`-dim tensor of values. 
- """ - return (upper - lower) * (1 - self.normal.cdf((upper - mu) / sigma)) - - def forward(self, X: Tensor) -> Tensor: - posterior = [[_m.predict(_x) for _m in self.model] for _x in X] - mu = torch.tensor([[_m[0].item() for _m in _p] for _p in posterior])[:, None, :] - sigma = torch.tensor([[_s[1].item() for _s in _p] for _p in posterior])[ - :, None, : - ] - - # clamp here, since upper_bounds will contain `inf`s, which - # are not differentiable - cell_upper_bounds = self.cell_upper_bounds.clamp_max(1e8) - # Compute psi(lower_i, upper_i, mu_i, sigma_i) for i=0, ... m-2 - psi_lu = self.psi( - lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma - ) - # Compute psi(lower_m, lower_m, mu_m, sigma_m) - psi_ll = self.psi( - lower=self.cell_lower_bounds, - upper=self.cell_lower_bounds, - mu=mu, - sigma=sigma, - ) - # Compute nu(lower_m, upper_m, mu_m, sigma_m) - nu = self.nu( - lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma - ) - # compute the difference psi_ll - psi_lu - psi_diff = psi_ll - psi_lu - - # this is batch_shape x num_cells x 2 x (m-1) - stacked_factors = torch.stack([psi_diff, nu], dim=-2) - - # Take the cross product of psi_diff and nu across all outcomes - # e.g. for m = 2 - # for each batch and cell, compute - # [psi_diff_0, psi_diff_1] - # [nu_0, psi_diff_1] - # [psi_diff_0, nu_1] - # [nu_0, nu_1] - # this tensor has shape: `batch_shape x num_cells x 2^m x m` - all_factors_up_to_last = stacked_factors.gather( - dim=-2, - index=self._cross_product_indices.expand( - stacked_factors.shape[:-2] + self._cross_product_indices.shape - ), - ) - # compute product for all 2^m terms, - # sum across all terms and hypercells - return all_factors_up_to_last.prod(dim=-1).sum(dim=-1).sum(dim=-1) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py deleted file mode 100644 index 7249c0fd..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py +++ /dev/null @@ -1,17 +0,0 @@ -from abc import ABC, abstractmethod - - -class BaseAcquisition(ABC): - def __init__(self): - self.surrogate_model = None - - @abstractmethod - def eval(self, x, asscalar: bool = False): - """Evaluate the acquisition function at point x2.""" - raise NotImplementedError - - def __call__(self, *args, **kwargs): - return self.eval(*args, **kwargs) - - def set_state(self, surrogate_model, **kwargs): - self.surrogate_model = surrogate_model diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py index a45cd051..4741705f 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py @@ -1,46 +1,53 @@ -from typing import Iterable, Union - -import numpy as np -import torch - -from .base_acquisition import BaseAcquisition -from .ei import ComprehensiveExpectedImprovement - - -class CostCooler(BaseAcquisition): - def __init__( - self, - base_acquisition: BaseAcquisition = ComprehensiveExpectedImprovement, - ): - self.base_acquisition = base_acquisition - self.cost_model = None - self.alpha = None - - def eval( - self, - x: Iterable, - **base_acquisition_kwargs, - ) -> Union[np.ndarray, torch.Tensor, float]: - base_acquisition_value = self.base_acquisition.eval( - x=x, **base_acquisition_kwargs - ) - costs, _ = 
self.cost_model.predict(x) - # if costs < 0.001: - # costs = 1 - if torch.is_tensor(costs): - cost_cooled = torch.zeros_like(costs) - index = 0 - for _, y in enumerate(costs.detach().numpy()): - if y < 0.0001: - cost_cooled[index] = base_acquisition_value[index] - else: - cost_cooled[index] = base_acquisition_value[index] / (y**self.alpha) - index += 1 - # return base_acquisition_value # / (costs**self.alpha).detach().numpy() - return cost_cooled - - def set_state(self, surrogate_model, alpha, cost_model, **kwargs): - super().set_state(surrogate_model=surrogate_model) - self.base_acquisition.set_state(surrogate_model=surrogate_model, **kwargs) - self.alpha = alpha - self.cost_model = cost_model +from __future__ import annotations + +from typing import TYPE_CHECKING + +from botorch.acquisition.logei import partial + +from neps.optimizers.bayesian_optimization.acquisition_functions.weighted_acquisition import ( + WeightedAcquisition, +) + +if TYPE_CHECKING: + import torch + from botorch.acquisition import AcquisitionFunction + from botorch.models.gp_regression import Likelihood + from botorch.models.model import Model + from torch import Tensor + + +def apply_cost_cooling( + acq_values: Tensor, + X: Tensor, + acq: AcquisitionFunction, + cost_model: Model, + likelihood: Likelihood, + alpha: float, +) -> Tensor: + posterior = likelihood(cost_model(X)) + cost = posterior.mean + + if acq._log: + # can derive from eq log(x) = log(acq / cost^alpha) + return acq_values - alpha * cost.log() + return acq_values / cost.pow(alpha) + + +def cost_cooled_acq( + acq_fn: AcquisitionFunction, + model: Model, + likelihood: Likelihood, + used_budget_percentage: float, + X_pending: torch.Tensor | None = None, +) -> WeightedAcquisition: + assert 0 <= used_budget_percentage <= 1 + return WeightedAcquisition( + acq=acq_fn, + apply_weight=partial( + apply_cost_cooling, + cost_model=model, + likelihood=likelihood, + alpha=1 - used_budget_percentage, + ), + X_pending=X_pending, + ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py deleted file mode 100644 index 1a4e24d0..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py +++ /dev/null @@ -1,120 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Sequence - -import torch -from torch.distributions import Normal - -from .base_acquisition import BaseAcquisition - -if TYPE_CHECKING: - import numpy as np - - from neps.search_spaces import SearchSpace - - -class ComprehensiveExpectedImprovement(BaseAcquisition): - def __init__( - self, - augmented_ei: bool = False, - xi: float = 0.0, - in_fill: str = "best", - log_ei: bool = False, - optimize_on_max_fidelity: bool = True, - ): - """This is the graph BO version of the expected improvement - key differences are: - - 1. The input x2 is a networkx graph instead of a vectorial input - - 2. The search space (a collection of x1_graphs) is discrete, so there is no - gradient-based optimisation. Instead, we compute the EI at all candidate points - and empirically select the best position during optimisation - - Args: - augmented_ei: Using the Augmented EI heuristic modification to the standard - expected improvement algorithm according to Huang (2006). - xi: manual exploration-exploitation trade-off parameter. 
- in_fill: the criterion to be used for in-fill for the determination of mu_star - 'best' means the empirical best observation so far (but could be - susceptible to noise), 'posterior' means the best *posterior GP mean* - encountered so far, and is recommended for optimization of more noisy - functions. Defaults to "best". - log_ei: log-EI if true otherwise usual EI. - """ - super().__init__() - - if in_fill not in ["best", "posterior"]: - raise ValueError(f"Invalid value for in_fill ({in_fill})") - self.augmented_ei = augmented_ei - self.xi = xi - self.in_fill = in_fill - self.log_ei = log_ei - self.incumbent = None - self.optimize_on_max_fidelity = optimize_on_max_fidelity - - def eval( - self, - x: Sequence[SearchSpace], - asscalar: bool = False, - ) -> np.ndarray | torch.Tensor | float: - """Return the negative expected improvement at the query point x2.""" - assert self.incumbent is not None, "EI function not fitted on model" - - if x[0].has_fidelity and self.optimize_on_max_fidelity: - _x = [e.clone() for e in x] - for e in _x: - e.set_to_max_fidelity() - else: - _x = x - - mu, cov = self.surrogate_model.predict(_x) - - std = torch.sqrt(torch.diag(cov)) - mu_star = self.incumbent - - gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device)) - # u = (mu - mu_star - self.xi) / std - # ei = std * updf + (mu - mu_star - self.xi) * ucdf - if self.log_ei: - # we expect that f_min is in log-space - f_min = mu_star - self.xi - v = (f_min - mu) / std - ei = torch.exp(f_min) * gauss.cdf(v) - torch.exp( - 0.5 * torch.diag(cov) + mu - ) * gauss.cdf(v - std) - else: - u = (mu_star - mu - self.xi) / std - try: - ucdf = gauss.cdf(u) - except ValueError as e: - print(f"u: {u}") # noqa: T201 - print(f"mu_star: {mu_star}") # noqa: T201 - print(f"mu: {mu}") # noqa: T201 - print(f"std: {std}") # noqa: T201 - print(f"diag: {cov.diag()}") # noqa: T201 - raise e - updf = torch.exp(gauss.log_prob(u)) - ei = std * updf + (mu_star - mu - self.xi) * ucdf - if self.augmented_ei: - sigma_n = self.surrogate_model.likelihood - ei *= 1.0 - torch.sqrt(torch.tensor(sigma_n, device=mu.device)) / torch.sqrt( - sigma_n + torch.diag(cov) - ) - if isinstance(_x, list) and asscalar: - return ei.detach().numpy() - if asscalar: - ei = ei.detach().numpy().item() - return ei - - def set_state(self, surrogate_model, **kwargs): - super().set_state(surrogate_model, **kwargs) - - # Compute incumbent - if self.in_fill == "best": - self.incumbent = torch.min(self.surrogate_model.y_) - else: - x = self.surrogate_model.x - mu_train, _ = self.surrogate_model.predict(x) - incumbent_idx = torch.argmin(mu_train) - self.incumbent = self.surrogate_model.y_[incumbent_idx] diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py index 3d19040d..c8502ca1 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py @@ -1,22 +1,28 @@ +# Left in as reference for now. 
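The quantity that `ComprehensiveExpectedImprovement` above evaluates (and that `MFEI` in this file still subclasses) is the standard closed-form EI for minimization, EI(x) = sigma(x) * pdf(u) + (mu* - mu(x) - xi) * cdf(u) with u = (mu* - mu(x) - xi) / sigma(x). A minimal standalone sketch with hypothetical posterior values, mirroring the removed implementation:

import torch
from torch.distributions import Normal

# Hypothetical posterior means/variances for three candidates and an incumbent mu_star.
mu = torch.tensor([0.40, 0.55, 0.30])
var = torch.tensor([0.04, 0.09, 0.01])
mu_star, xi = 0.35, 0.0

std = var.sqrt()
gauss = Normal(torch.zeros(1), torch.ones(1))
u = (mu_star - mu - xi) / std
# EI for minimization: std * pdf(u) + (mu_star - mu - xi) * cdf(u)
ei = std * gauss.log_prob(u).exp() + (mu_star - mu - xi) * gauss.cdf(u)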
# type: ignore -from typing import Any, Iterable, Tuple, Union +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Iterable import numpy as np import pandas as pd import torch from torch.distributions import Normal -from ....optimizers.utils import map_real_hyperparameters_from_tabular_ids -from ....search_spaces.search_space import SearchSpace -from ...multi_fidelity.utils import MFObservedData +from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids + from .ei import ComprehensiveExpectedImprovement +if TYPE_CHECKING: + from neps.optimizers.multi_fidelity.utils import MFObservedData + from neps.search_spaces.search_space import SearchSpace + class MFEI(ComprehensiveExpectedImprovement): def __init__( self, pipeline_space: SearchSpace, - surrogate_model_name: str = None, + surrogate_model_name: str | None = None, augmented_ei: bool = False, xi: float = 0.0, in_fill: str = "best", @@ -32,7 +38,7 @@ def __init__( def get_budget_level(self, config) -> int: return int((config.fidelity.value - config.fidelity.lower) / self.b_step) - def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]: + def preprocess(self, x: pd.Series) -> tuple[Iterable, Iterable]: """Prepares the configurations for appropriate EI calculation. Takes a set of points and computes the budget and incumbent for each point, as @@ -65,7 +71,7 @@ def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]: budget_list.append(self.get_budget_level(config)) # Drop unused configs - x.drop(labels=indices_to_drop, inplace=True) + x = x.drop(labels=indices_to_drop) performances = self.observations.get_best_performance_for_each_budget() inc_list = [] @@ -78,11 +84,11 @@ def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]: return x, torch.Tensor(inc_list) - def preprocess_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: + def preprocess_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: x, inc_list = self.preprocess(x) return x.values.tolist(), inc_list - def preprocess_deep_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: + def preprocess_deep_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: x, inc_list = self.preprocess(x) x_lcs = [] for idx in x.index: @@ -97,7 +103,7 @@ def preprocess_deep_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: self.surrogate_model.set_prediction_learning_curves(x_lcs) return x.values.tolist(), inc_list - def preprocess_pfn(self, x: Iterable) -> Tuple[Iterable, Iterable, Iterable]: + def preprocess_pfn(self, x: Iterable) -> tuple[Iterable, Iterable, Iterable]: """Prepares the configurations for appropriate EI calculation. 
Takes a set of points and computes the budget and incumbent for each point, as @@ -114,7 +120,7 @@ def preprocess_pfn(self, x: Iterable) -> Tuple[Iterable, Iterable, Iterable]: ) / self.b_step return _x_tok, _x, inc_list - def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Series]: + def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Series]: # _x = x.copy() # preprocessing needs to change the reference x Series so we don't copy here if self.surrogate_model_name == "pfn": _x_tok, _x, inc_list = self.preprocess_pfn( @@ -143,7 +149,7 @@ def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Ser def eval_pfn_ei( self, x: Iterable, inc_list: Iterable - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: """PFN-EI modified to preprocess samples and accept list of incumbents.""" # x, inc_list = self.preprocess(x) # IMPORTANT change from vanilla-EI # _x = x.copy() @@ -154,7 +160,7 @@ def eval_pfn_ei( def eval_gp_ei( self, x: Iterable, inc_list: Iterable - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: """Vanilla-EI modified to preprocess samples and accept list of incumbents.""" # x, inc_list = self.preprocess(x) # IMPORTANT change from vanilla-EI _x = x.copy() @@ -194,7 +200,7 @@ def set_state( pipeline_space: SearchSpace, surrogate_model: Any, observations: MFObservedData, - b_step: Union[int, float], + b_step: int | float, **kwargs, ): # overload to select incumbent differently through observations @@ -202,4 +208,3 @@ def set_state( self.surrogate_model = surrogate_model self.observations = observations self.b_step = b_step - return diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py new file mode 100644 index 00000000..0f1668f1 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py @@ -0,0 +1,63 @@ +"""# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +Prior-Guided Acquisition Functions + +References: + +.. [Hvarfner2022] + C. Hvarfner, D. Stoll, A. Souza, M. Lindauer, F. Hutter, L. Nardi. PiBO: + Augmenting Acquisition Functions with User Beliefs for Bayesian Optimization. + ICLR 2022. 
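+
+The helpers below weight a base acquisition alpha(x) by pi(x)**gamma (or, for log
+acquisitions, add gamma * log pi(x)), where pi is the user prior and gamma is the
+prior exponent supplied by the optimizer.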
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from botorch.acquisition.logei import partial + +from neps.optimizers.bayesian_optimization.acquisition_functions.weighted_acquisition import ( + WeightedAcquisition, +) + +if TYPE_CHECKING: + from botorch.acquisition.acquisition import AcquisitionFunction + from torch import Tensor + + from neps.sampling.priors import Prior + from neps.search_spaces.domain import Domain + + +def apply_pibo_acquisition_weight( + acq_values: Tensor, + X: Tensor, + acq: AcquisitionFunction, + *, + prior: Prior, + x_domain: Domain | list[Domain], + prior_exponent: float, +): + if acq._log: + return acq_values + prior.log_prob(X, frm=x_domain) * prior_exponent + return acq_values * prior.prob(X, frm=x_domain).pow(prior_exponent) + + +def pibo_acquisition( + acq_fn: AcquisitionFunction, + prior: Prior, + prior_exponent: float, + x_domain: Domain | list[Domain], + X_pending: Tensor | None = None, +) -> WeightedAcquisition: + return WeightedAcquisition( + acq=acq_fn, + apply_weight=partial( + apply_pibo_acquisition_weight, + prior=prior, + x_domain=x_domain, + prior_exponent=prior_exponent, + ), + X_pending=X_pending, + ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py deleted file mode 100644 index 8a735d58..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py +++ /dev/null @@ -1,111 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Iterable -from typing_extensions import override - -import numpy as np -import torch -from botorch.acquisition import MCAcquisitionFunction - -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) - -if TYPE_CHECKING: - from neps.priors import Prior - - -class PiboAcquisition(MCAcquisitionFunction): - """Compute a prior weighted acquisition function according to PiBO. - - * https://arxiv.org/pdf/2204.11051 - """ - - def __init__( - self, - acq_fn: MCAcquisitionFunction, - prior: Prior, - beta: float, - n: float, - ): - """Initialize the acquisition function. - - Args: - acq_fn: The acquisition function to be weighted. - prior: The prior distribution to be used for weighting. - beta: The beta parameter for weighting. - n: The denominator for the beta parameter. - """ - self._log = self.acq_fn._log - self.acq_fn = acq_fn - - self.beta = beta - self.n = n - self.prior = prior - - @override - def forward(self, X: torch.Tensor) -> torch.Tensor: - weight = self.beta / self.n - acq = self.acq_fn(X) - - # The weight is shown as being applied to the pdf and not the log_pdf - values = acq * self.prior.prob(X) * weight - - # However, if the base acq function advertises as being log, - # i.e. 
self._log, then we should return the log of the values - return torch.log(values) if self._log else values - - -class DecayingPriorWeightedAcquisition(BaseAcquisition): - def __init__( - self, - base_acquisition, - pibo_beta=10, - log: bool = False, - ): - super().__init__() - self.pibo_beta = pibo_beta - self.base_acquisition = base_acquisition - self.log = log - self.decay_t = 0.0 - - def eval( - self, - x: Iterable, - **base_acquisition_kwargs, - ) -> np.ndarray | torch.Tensor | float: - acquisition = self.base_acquisition(x, **base_acquisition_kwargs) - - if self.log: - min_acq_val = abs(min(acquisition)) if min(acquisition) < 0 else 0 - - for i, candidate in enumerate(x): - prior_weight = candidate.compute_prior(log=self.log) - if prior_weight != 1.0: - if self.log: - # for log -> the smaller the prior_weight, - # the more unlikely it is from the prior - # also shift acquisition values to avoid negativ values - acquisition[i] = ( - np.log(acquisition[i] + min_acq_val + 1e-12) - + (self.pibo_beta / self.decay_t) * prior_weight - ) - else: - acquisition[i] *= np.power( - prior_weight + 1e-12, self.pibo_beta / self.decay_t - ) - return acquisition - - def set_state(self, surrogate_model, **kwargs): - if "decay_t" in kwargs: - decay_t = kwargs.pop("decay_t") - else: - train_x = surrogate_model.x - if train_x[0].has_fidelity: - decay_t = np.sum( - [float(_x.fidelity.value >= _x.fidelity.upper) for _x in train_x] - ) - else: - decay_t = len(train_x) - self.decay_t = decay_t - self.base_acquisition.set_state(surrogate_model, **kwargs) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py deleted file mode 100644 index adf57266..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py +++ /dev/null @@ -1,60 +0,0 @@ -from typing import Iterable, Union - -import numpy as np -import torch - -from .base_acquisition import BaseAcquisition - - -class UpperConfidenceBound(BaseAcquisition): - def __init__(self, beta: float=1.0, maximize: bool=False): - """Upper Confidence Bound (UCB) acquisition function. - - Args: - beta: Controls the balance between exploration and exploitation. - maximize: If True, maximize the given model, else minimize. - DEFAULT=False, assumes minimzation. 
- """ - super().__init__() - self.beta = beta # can be updated as part of the state for dynamism or a schedule - self.maximize = maximize - - # to be initialized as part of the state - self.surrogate_model = None - - def set_state(self, surrogate_model, **kwargs): - super().set_state(surrogate_model) - self.surrogate_model = surrogate_model - if "beta" in kwargs: - if not isinstance(kwargs["beta"], (list, np.array)): - self.beta = kwargs["beta"] - else: - self.logger.warning("Beta is a list, not updating beta value!") - - def eval( - self, x: Iterable, asscalar: bool = False - ) -> Union[np.ndarray, torch.Tensor, float]: - try: - mu, cov = self.surrogate_model.predict(x) - std = torch.sqrt(torch.diag(cov)) - except ValueError as e: - raise e - sign = 1 if self.maximize else -1 # LCB is performed if minimize=True - ucb_scores = mu + sign * np.sqrt(self.beta) * std - # if LCB, minimize acquisition, or maximize -acquisition - ucb_scores = ucb_scores.detach().numpy() * sign - - return ucb_scores - - -class MF_UCB(UpperConfidenceBound): - - def preprocess(self, x: Iterable) -> Iterable: - performances = self.observations.get_best_performance_for_each_budget() - pass - - def eval( - self, x: Iterable, asscalar: bool = False - ) -> Union[np.ndarray, torch.Tensor, float]: - x = self.preprocess(x) - return self.eval(x, asscalar=asscalar) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py new file mode 100644 index 00000000..488c57f4 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -0,0 +1,147 @@ +"""This module provides most of the functionality we require in NePS for now, +i.e., we need the ability to apply an arbitrary weight to an acquisition function. + +I spent some time understanding the meaning of the various dimensions of botorch/gpytorch. + +The two primary dimensions to consider are: + +* `d` - The dimensionality of the design space, i.e. how many hyperparameters. +* `batch` - The number of independent evaluations to make, i.e. how many times to + evaluate the acquisition function. + +There are two extra dimensions which are special cases and need to be accounted for. + +* `q` - Comes from the `qXXX` variants of acquisition, these will add an extra dimension + `q` to each `batch`, where instead of a `batch` representing a single config to get + the acquisition of, we might instead be getting the acquisition of 5 configs together, + representing the joint utility of evaluating these 5 configs, relative to other sets + of 5 configs. This dimension is _reduced_ away in the final step of the acquisition + when suggesting which set of group of 5 configs to suggest. + +* `mc_samples` - Comes from the `SampleReducdingXXX` variants of acquisition, will add an + extra dimension `mc_samples` which represent the amount of Monte Carlo samples used + to estimate the acquisition. These will eventually be _reduced_ away but are present + in the intermediate steps. These variants also seem to have `q` variants implicitly + and so you are likely to see the `q` dimension whever you see the `mc_samples` + dimension, even if it is just `q=1`. + +* `m` - The number of objectives in the multi-objective case. We will + specifically ignore this for now, however it exists as the last dimension (after `d`) + and is the first to be reduced away. They are also used in _constrainted_ settings + which we will also ignore for now. 
+ +The most expanded tensor shape is the following, with the usual order of reduction being +the following below. If you are not using a SamplingReducing variant, you will not see +`mc_samples` and if you are not using a `q` variant, you will not see `q`. The simplest +case then being `acq(tensor: batch x d)`. + +* `batch x q x d`. + reduce(..., d) = Config -> Single number (!!!Acq applies here!!!) +* `batch x q`. + expand(mc_samples , ...) = MC Sampling from posterior (I think) +* `mc_samples x batch x q`. + reduce(..., q) = Joint-Config-Group -> Single number. +* `mc_samples x batch` + reduce(mc_samples, ...) = MC-samples -> statistical estimate +* `batch` + +Finally we get out a batch of values we can argmax over, used to index into either a +single configuration or a single index into a joint-group of `q` configurations. + +!!! tip + + The `mc_samples` is not of concern to the `WeightedAcquisition` below, and + broadcasting can be used, as a result, the `apply_weight` function only needs + to be able to handle: + + * (X: batch x q x d, acq_values: batch x q, acq: A) -> batch x q + + If utilizing the configurations `X` for weighting, you effectively will want + to reduce the `d` dimension. + +As a result of this, acquisition functions need to be able to handle arbitrary dimensions +and act accordingly. + +This module mostly follows the structure of the +`PriorGuidedAcquisitionFunction` which weights the acquisition function by a prior. + +* https://botorch.org/api/_modules/botorch/acquisition/prior_guided.html#PriorGuidedAcquisitionFunction + +We use this to create a more generic `WeightedAcquisition` which follows the required +structure to make new weightings easier to implement, but also to serve as an educational +reference. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable, TypeVar + +from botorch.acquisition import SampleReducingMCAcquisitionFunction +from botorch.acquisition.analytic import AcquisitionFunction, t_batch_mode_transform +from botorch.acquisition.monte_carlo import concatenate_pending_points + +if TYPE_CHECKING: + from torch import Tensor + +A = TypeVar("A", bound=AcquisitionFunction) + + +class WeightedAcquisition(AcquisitionFunction): + """Class for weighting acquisition functions. + + Please see module docstring for more information. + """ + + def __init__( + self, + acq: A, + apply_weight: Callable[[Tensor, Tensor, A], Tensor], + X_pending: Tensor | None = None, + ) -> None: + """Initialize the weighted acquisition function. + + Args: + acq: The base acquisition function. + apply_weight: A function that takes the acquisition function values, the + design points and the acquisition function itself and returns the + weighted acquisition function values. + + Please see the module docstring for more information on the dimensions + and how to handle them. + X_pending: `n x d` Tensor with `n` `d`-dim design points that have + been submitted for evaluation but have not yet been evaluated. + """ + super().__init__(model=acq.model) + # NOTE: We remove the X_pending from the base acquisition function as we will get + # it in our own forward with `@concatenate_pending_points` and pass that forward. + # This avoids possible duplicates + self.acq.set_X_pending(None) + self.set_X_pending(X_pending) + self.apply_weight = apply_weight + self.acq = acq + + # Taken from PiBO implementation in botorch (PriorGuidedAcquisitionFunction). 
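As a usage sketch of `WeightedAcquisition` (hypothetical names such as `apply_locality_weight`, `base_acq` and `ref` are placeholders, not part of this patch): any callable matching the `(acq_values, X, acq)` contract described in the module docstring can be bound with `functools.partial` and passed as `apply_weight`, for example a weight that favours candidates near a reference point:

import torch
from functools import partial

def apply_locality_weight(acq_values, X, acq, *, ref: torch.Tensor, scale: float = 1.0):
    # X: `batch x q x d`; reducing `d` away gives a `batch x q` weight, which
    # broadcasts against acq_values (possibly `mc_samples x batch x q`).
    weight = torch.exp(-scale * (X - ref).norm(dim=-1))
    if acq._log:
        return acq_values + weight.log()
    return acq_values * weight

# weighted = WeightedAcquisition(
#     acq=base_acq,  # any botorch AcquisitionFunction; `base_acq` and `ref` are placeholders
#     apply_weight=partial(apply_locality_weight, ref=ref),
# )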
+ @concatenate_pending_points + @t_batch_mode_transform() # type: ignore + def forward(self, X: Tensor) -> Tensor: + """Evaluate a weighted acquisition function on the candidate set X. + + Args: + X: A tensor of size `batch_shape x q x d`-dim tensor of `q` `d`-dim + design points. + + Returns: + A tensor with the `d` dimension reduced away, representing the + weighted acquisition function values at the given design points `X`. + """ + if isinstance(self.acq, SampleReducingMCAcquisitionFunction): + # shape: mc_samples x batch x q-candidates + acq_values = self.acq._non_reduce_forward(X) + weighted_acq_values = self.apply_weight(acq_values, X, self.acq) + vals = self.acq._sample_reduction(self.acq._q_reduction(weighted_acq_values)) + return vals.squeeze(-1) + + # shape: batch x q-candidates + acq_values = self.acq(X).unsqueeze(-1) + weighted_acq_values = self.apply_weight(acq_values, X, self.acq) + return weighted_acq_values.squeeze(-1) diff --git a/neps/optimizers/bayesian_optimization/mf_tpe.py b/neps/optimizers/bayesian_optimization/mf_tpe.py deleted file mode 100644 index 45e4adc4..00000000 --- a/neps/optimizers/bayesian_optimization/mf_tpe.py +++ /dev/null @@ -1,719 +0,0 @@ -from __future__ import annotations - -import random -from copy import deepcopy -from typing import Any, Iterable - -import numpy as np -import torch -from scipy.stats import spearmanr -from typing_extensions import Literal, override - -from neps.state.optimizer import BudgetInfo, OptimizationState -from neps.utils.types import ConfigResult, RawConfig -from neps.utils.common import instance_from_map -from neps.search_spaces import ( - CategoricalParameter, - ConstantParameter, - FloatParameter, - IntegerParameter, - SearchSpace, -) -from neps.optimizers.base_optimizer import BaseOptimizer -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping - -CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) -CUSTOM_FLOAT_CONFIDENCE_SCORES.update({"ultra": 0.05}) - -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES = dict( - CategoricalParameter.DEFAULT_CONFIDENCE_SCORES -) -CUSTOM_CATEGORICAL_CONFIDENCE_SCORES.update({"ultra": 8}) - - -class MultiFidelityPriorWeightedTreeParzenEstimator(BaseOptimizer): - def __init__( - self, - pipeline_space: SearchSpace, - use_priors: bool = True, - prior_num_evals: float = 2.5, - good_fraction: float = 0.3334, - random_interleave_prob: float = 0.0, - initial_design_size: int = 0, - prior_as_samples: bool = True, - pending_as_bad: bool = True, - fidelity_weighting: Literal["linear", "spearman"] = "spearman", - surrogate_model: str = "kde", - good_model_bw_factor: int = 1.5, - joint_kde_modelling: bool = False, - threshold_improvement: bool = True, - promote_from_acq: bool = True, - acquisition_sampler: str | AcquisitionSampler = "mutation", - prior_draws: int = 1000, - prior_confidence: Literal["low", "medium", "high"] = "medium", - surrogate_model_args: dict = None, - soft_promotion: bool = True, - patience: int = 50, - logger=None, - budget: None | int | float = None, - loss_value_on_error: None | float = None, - cost_value_on_error: None | float = None, - ): - """[summary] - - Args: - pipeline_space: Space in which to search - prior_num_evals (float, optional): [description]. Defaults to 2.5. 
- good_fraction (float, optional): [description]. Defaults to 0.333. - random_interleave_prob: Frequency at which random configurations are sampled - instead of configurations from the acquisition strategy. - initial_design_size: Number of 'x' samples that are to be evaluated before - selecting a sample using a strategy instead of randomly. If there is a - user prior, we can rely on the model from the very first iteration. - prior_as_samples: Whether to sample from the KDE and incorporate that way, or - just have the distribution be an linear combination of the KDE and the prior. - Should be True if the prior happens to be unnormalized. - pending_as_bad: Whether to treat pending observations as bad, assigning them to - the bad KDE to encourage diversity among samples queried in parallel - prior_draws: The number of samples drawn from the prior if there is one. This - # does not affect the strength of the prior, just how accurately it - # is reconstructed by the KDE. - patience: How many times we try something that fails before giving up. - budget: Maximum budget - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error during bayesian optimization and will use given loss - value instead. default: None - cost_value_on_error: Setting this and loss_value_on_error to any float will - supress any error during bayesian optimization and will use given cost - value instead. default: None - logger: logger object, or None to use the neps logger - """ - super().__init__( - pipeline_space=pipeline_space, - patience=patience, - logger=logger, - budget=budget, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ) - self.pipeline_space = pipeline_space - self.good_fraction = good_fraction - if self.pipeline_space.has_fidelity: - self.min_fidelity = pipeline_space.fidelity.lower - self.max_fidelity = pipeline_space.fidelity.upper - self.rung_map, self.inverse_rung_map = self._get_rung_maps() - self.min_rung = 0 - self.max_rung = len(self.rung_map) - 1 - - else: - self.min_rung = 0 - self.max_rung = 0 - self.min_fidelity = 1 - self.max_fidelity = 1 - self.rung_map, self.inverse_rung_map = self._get_rung_maps() - - if initial_design_size == 0: - self._initial_design_size = len(self.pipeline_space) * np.round( - 1 / self.good_fraction - ).astype(int) - else: - self._initial_design_size = initial_design_size - self.promote_from_acq = promote_from_acq - - self.num_rungs = len(self.rung_map) - self.use_priors = use_priors - self.prior_num_evals = prior_num_evals - self._random_interleave_prob = random_interleave_prob - self._pending_as_bad = pending_as_bad - self.prior_draws = prior_draws - self._has_promotable_configs = False - self.soft_promotion = soft_promotion - self.joint_kde_modelling = joint_kde_modelling - # if we use priors, we don't add conigurations as good until is is within the top fraction - # This heuristic has not been tried further, but makes sense in the context when we have priors - self.round_up = not use_priors - self.fidelity_weighting = fidelity_weighting - self.threshold_improvement = threshold_improvement - # TODO have this read in as part of load_results - it cannot be saved as an attribute when - # running parallel instances of the algorithm (since the old configs are shared, not instance-specific) - self.old_configs_per_fid = [[] for i in range(self.num_rungs)] - # We assume that the information conveyed per fidelity (and the cost) is linear in the - # fidelity levels if nothing else is specified - if 
surrogate_model != "kde": - raise NotImplementedError( - "Only supports KDEs for now. Could (maybe?) support binary classification in the future." - ) - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, - ) - self.prior_confidence = prior_confidence - self._enhance_priors() - surrogate_model_args = surrogate_model_args or {} - - param_types, num_options, logged_params, is_fidelity = self._get_types() - surrogate_model_args["param_types"] = param_types - surrogate_model_args["num_options"] = num_options - surrogate_model_args["is_fidelity"] = is_fidelity - surrogate_model_args["logged_params"] = logged_params - good_model_args = deepcopy(surrogate_model_args) - good_model_args["bandwidth_factor"] = good_model_bw_factor - if self.pipeline_space.has_prior and use_priors: - if prior_as_samples: - self.prior_samples = [ - self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - for idx in range(self.prior_draws) - ] - else: - pass - # TODO work out affine combination - else: - self.prior_samples = [] - - self.surrogate_models = { - "good": instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=good_model_args, - ), - "bad": instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ), - "all": instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ), - } - self.acquisition = self - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": self.patience, "pipeline_space": self.pipeline_space}, - ) - - def _enhance_priors(self): - """Only applicable when priors are given along with a confidence.""" - if not self.use_priors and self.prior_confidence is None: - return - for k in self.pipeline_space.keys(): - if self.pipeline_space[k].is_fidelity: - continue - elif isinstance(self.pipeline_space[k], (FloatParameter, IntegerParameter)): - confidence = CUSTOM_FLOAT_CONFIDENCE_SCORES[self.prior_confidence] - self.pipeline_space[k].default_confidence_score = confidence - elif isinstance(self.pipeline_space[k], CategoricalParameter): - confidence = CUSTOM_CATEGORICAL_CONFIDENCE_SCORES[self.prior_confidence] - self.pipeline_space[k].default_confidence_score = confidence - - def _get_rung_maps(self, s: int = 0) -> dict: - """Maps rungs (0,1,...,k) to a fidelity value based on fidelity bounds, eta, s.""" - eta = round(1 / self.good_fraction) - new_min_budget = self.min_fidelity * (1 / eta**s) - nrungs = ( - np.floor(np.log(self.max_fidelity / new_min_budget) / np.log(eta)).astype(int) - + 1 - ) - _max_budget = self.max_fidelity - rung_map = dict() - inverse_rung_map = dict() - for i in reversed(range(nrungs)): - # TODO: add +s to keys and TEST - rung_value = ( - int(_max_budget) - if isinstance(self.pipeline_space.fidelity, IntegerParameter) - else _max_budget - ) - - rung_map[i + s] = rung_value - inverse_rung_map[rung_value] = i + s - _max_budget /= eta - return rung_map, inverse_rung_map - - def _get_types(self): - """extracts the needed types from the configspace for faster retrival later - - type = 0 - numerical (continuous or integer) parameter - type >=1 - categorical parameter - - TODO: figure out a way to properly handle 
ordinal parameters - - """ - types = [] - num_values = [] - logs = [] - is_fidelity = [] - for _, hp in self.pipeline_space.items(): - is_fidelity.append(hp.is_fidelity) - if isinstance(hp, CategoricalParameter): - # u as in unordered - used to play nice with the statsmodels KDE implementation - types.append("u") - logs.append(False) - num_values.append(len(hp.choices)) - elif isinstance(hp, IntegerParameter): - # o as in ordered - types.append("o") - logs.append(False) - num_values.append(hp.upper - hp.lower + 1) - elif isinstance(hp, FloatParameter): - # c as in continous - types.append("f") - logs.append(hp.log) - num_values.append(np.inf) - elif isinstance(hp, ConstantParameter): - # c as in continous - types.append("c") - logs.append(False) - num_values.append(1) - - else: - raise ValueError("Unsupported Parametertype %s" % type(hp)) - - return types, num_values, logs, is_fidelity - - def __call__( - self, - x: Iterable, - asscalar: bool = False, - only_lowest_fidelity=True, - only_good=False, - ) -> np.ndarray | torch.Tensor | float: - """ - Return the negative expected improvement at the query point - """ - # this is to only make the lowest fidelity viable - # TODO have this as a setting in the acq_sampler instead - if only_lowest_fidelity: - is_lowest_fidelity = ( - np.array([x_.fidelity.value for x_ in x]) == self.rung_map[self.min_rung] - ) - return np.log(self.surrogate_models["good"].pdf(x)) - np.log( - self.surrogate_models["bad"].pdf(x) - ) - else: - return np.log(self.surrogate_models["good"].pdf(x)) - np.log( - self.surrogate_models["bad"].pdf(x) - ) - - def _split_by_fidelity(self, configs, losses): - if self.pipeline_space.has_fidelity: - configs_per_fidelity = [[] for i in range(self.num_rungs)] - losses_per_fidelity = [[] for i in range(self.num_rungs)] - # per fidelity, add a list to make it a nested list of lists - # [[config_A at fid1, config_B at fid1], [config_C at fid2], ...] - for config, loss in zip(configs, losses): - rung = self.inverse_rung_map[int(config.fidelity.value)] - configs_per_fidelity[rung].append(config) - losses_per_fidelity[rung].append(loss) - return configs_per_fidelity, losses_per_fidelity - else: - return [configs], [losses] - - def _split_configs( - self, configs_per_fid, losses_per_fid, weight_per_fidelity, good_fraction=None - ): - """Splits configs into good and bad for the KDEs. - - Args: - configs ([type]): [description] - losses ([type]): [description] - round_up (bool, optional): [description]. Defaults to True. 
- - Returns: - [type]: [description] - """ - if good_fraction is None: - good_fraction = self.good_fraction - - good_configs, bad_configs = [], [] - good_configs_weights, bad_configs_weights = [], [] - - for fid, (configs_fid, losses_fid) in enumerate( - zip(configs_per_fid, losses_per_fid) - ): - if self.round_up: - num_good_configs = np.ceil(len(configs_fid) * good_fraction).astype(int) - else: - num_good_configs = np.floor(len(configs_fid) * good_fraction).astype(int) - - ordered_loss_indices = np.argsort(losses_fid) - good_indices = ordered_loss_indices[0:num_good_configs] - bad_indices = ordered_loss_indices[num_good_configs:] - good_configs_fid = [configs_fid[idx] for idx in good_indices] - bad_configs_fid = [configs_fid[idx] for idx in bad_indices] - good_configs.extend(good_configs_fid) - bad_configs.extend(bad_configs_fid) - - if self.threshold_improvement: - good_configs_weights.extend( - self._compute_improvement_weights( - losses_fid, num_good_configs, weight_per_fidelity[fid] - ) - ) - else: - good_configs_weights.extend( - [weight_per_fidelity[fid]] * len(good_configs_fid) - ) - bad_configs_weights.extend([weight_per_fidelity[fid]] * len(bad_configs_fid)) - return good_configs, bad_configs, good_configs_weights, bad_configs_weights - - def _compute_improvement_weights(self, losses, num_good_configs, max_weight): - if num_good_configs == 0: - return [] - - ordered_losses = np.sort(losses) - best_bad_loss = ordered_losses[num_good_configs] - good_losses = ordered_losses[0:num_good_configs] - relative_improvements = (best_bad_loss - good_losses) / ( - best_bad_loss - good_losses.min() - ) - improvement_weights = max_weight * relative_improvements - return improvement_weights - - def compute_fidelity_weights(self, configs_per_fid, losses_per_fid) -> list: - # TODO consider pending configurations - will default to a linear weighting - # which is not necessarily correct - if self.fidelity_weighting == "linear": - weight_per_fidelity = self._compute_linear_weights() - elif self.fidelity_weighting == "spearman": - weight_per_fidelity = self._compute_spearman_weights( - configs_per_fid, losses_per_fid - ) - else: - raise ValueError( - f"No weighting scheme {self.fidelity_weighting} is available." 
- ) - return weight_per_fidelity - - def _compute_linear_weights(self): - return (1 + np.arange(self.min_rung, self.max_rung + 1)) / self.num_rungs - - def _compute_spearman_weights(self, configs_per_fid, losses_per_fid) -> list: - min_number_samples = np.round(1 / self.good_fraction).astype(int) - samples_per_fid = np.array([len(cfgs_fid) for cfgs_fid in configs_per_fid]) - max_comparable_fid = ( - self.max_rung - np.argmax(np.flip(samples_per_fid) >= min_number_samples) - ).astype(int) - if max_comparable_fid == 0: - # if we cannot compare to any otḧer fidelity, return default - return self._compute_linear_weights() - else: - # compare the rankings of the existing configurations to the ranking - # of the same configurations at lower rungs - spearman = np.ones(self.num_rungs) - for fid_idx, (cfgs, losses) in enumerate( - zip(configs_per_fid, losses_per_fid) - ): - if fid_idx >= max_comparable_fid: - spearman[fid_idx] = 1 - - else: - comp_losses = losses_per_fid[fid_idx + 1] - comp_configs = configs_per_fid[fid_idx + 1] - - lower_fid_configs = [None] * len(comp_configs) - lower_fid_losses = [None] * len(comp_configs) - for cfg, loss in zip(cfgs, losses): - # check if the config at the lower fidelity level is in the comparison set - # TODO make this more efficient - probably embarrasingly slof for now - # with the triple-nested loop (although number of configs per level is pretty low) - is_equal_config = [ - cfg.is_equal_value(comp_cfg, include_fidelity=False) - for comp_cfg in comp_configs - ] - if any(is_equal_config): - equal_index = np.argmax(is_equal_config) - lower_fid_configs[equal_index] = cfg - lower_fid_losses[equal_index] = loss - - spearman[fid_idx] = spearmanr( - lower_fid_losses, comp_losses - ).correlation - - spearman = np.clip(spearman, a_min=0, a_max=1) - # The correlation with Z_max at fidelity Z-k cannot be larger than at Z-k+1 - spearman = np.flip(np.multiply.accumulate(np.flip(spearman))) - fidelity_weights = spearman * (max_comparable_fid + 1) / (self.max_rung + 1) - return fidelity_weights - - def is_init_phase(self) -> bool: - """Decides if optimization is still under the warmstart phase/model-based search.""" - if self._num_train_x >= self._initial_design_size: - return False - return True - - @override - def load_optimization_state( - self, - previous_results: dict[str, ConfigResult], - pending_evaluations: dict[str, SearchSpace], - budget_info: BudgetInfo | None, - optimizer_state: dict[str, Any], - ) -> None: - # TODO remove doubles from previous results - train_y = [self.get_loss(el.result) for el in previous_results.values()] - - train_x_configs = [el.config for el in previous_results.values()] - pending_configs = list(pending_evaluations.values()) - - filtered_configs, filtered_indices = self._filter_old_configs(train_x_configs) - filtered_y = np.array(train_y)[filtered_indices].tolist() - - self.train_x_configs = train_x_configs - self.train_y = train_y - - self._pending_evaluations = pending_evaluations - self._num_train_x = len(self.train_x_configs) - if not self.is_init_phase(): - # This is to extract the configurations as numpy arrays on the format num_data x num_dim - # TODO when a config is removed in the filtering process, that means that some other - # configuration at the lower fidelity will become good, that was previously bad. This - # may be good or bad, but I'm not sure. 
/ Carl - configs_per_fid, losses_per_fid = self._split_by_fidelity( - train_x_configs, train_y - ) - filtered_configs_per_fid, filtered_losses_per_fid = self._split_by_fidelity( - filtered_configs, filtered_y - ) - weight_per_fidelity = self.compute_fidelity_weights( - configs_per_fid, losses_per_fid - ) - - good_configs, bad_configs, good_weights, bad_weights = self._split_configs( - filtered_configs_per_fid, filtered_losses_per_fid, weight_per_fidelity - ) - if self.use_priors: - num_prior_configs = len(self.prior_samples) - good_configs.extend(self.prior_samples) - prior_sample_constant = self.prior_num_evals / num_prior_configs - good_weights.extend([prior_sample_constant] * num_prior_configs) - - fixed_bw = None - self.surrogate_models["all"].fit(filtered_configs) - if self.joint_kde_modelling: - fixed_bw = self.surrogate_models["all"].bw - - self.surrogate_models["good"].fit( - good_configs, fixed_bw=fixed_bw, config_weights=good_weights - ) - if self._pending_as_bad: - # This is only to compute the weights of the pending configs - _, pending_configs, _, pending_weights = self._split_configs( - pending_configs, - [np.inf] * len(pending_configs), - weight_per_fidelity, - good_fraction=0.0, - ) - bad_configs.extend(pending_configs) - bad_weights.extend(pending_weights) - - self.surrogate_models["bad"].fit( - bad_configs, fixed_bw=fixed_bw, config_weights=bad_weights - ) - # self.visualize_acq(previous_results, weight_per_fidelity) - - def _filter_old_configs(self, configs): - new_configs = [] - new_indices = [] - old_configs_flat = [] - for cfgs in self.old_configs_per_fid: - old_configs_flat.extend(cfgs) - - for idx, cfg in enumerate(configs): - if any([cfg.is_equal_value(old_cfg) for old_cfg in old_configs_flat]): - # If true, configs are equal and shouldn't be added - continue - else: - new_configs.append(cfg) - new_indices.append(idx) - return new_configs, new_indices - - def _get_promotable_configs(self, configs): - if self.soft_promotion: - configs_for_promotion = self._get_soft_promotable(configs) - else: - configs_for_promotion = self._get_hard_promotable(configs) - return configs_for_promotion - - def _get_hard_promotable(self, configs): - # count the number of configs that are at or above any given rung - configs_per_rung = np.zeros(self.num_rungs) - # check the number of configs per fidelity level - for config in configs: - rung = self.inverse_rung_map[int(config.fidelity.value)] - configs_per_rung[rung] += 1 - - cumulative_per_rung = np.flip(np.cumsum(np.flip(configs_per_rung))) - cumulative_above = np.append(np.flip(np.cumsum(np.flip(configs_per_rung[1:]))), 0) - # then check which one can make the most informed decision on promotions - rungs_to_promote = cumulative_per_rung * self.good_fraction - cumulative_above - - # this defaults to max_fidelity if there is no promotable config (cannot promote from) - # the top fidelity anyway - fid_to_promote = self.num_rungs - np.argmax(np.flip(rungs_to_promote) > 1) - - # TODO check if this returns empty when it needs to - if fid_to_promote == self.max_rung: - return [] - return [cfg for cfg in configs if cfg.fidelity.value == fid_to_promote] - - def _get_soft_promotable(self, configs): - # TODO implement - # count the number of configs that are at or above any given rung - new_configs, _ = self._filter_old_configs(configs) - configs_per_rung = np.zeros(self.num_rungs) - - # check the number of configs per fidelity level - for config in new_configs: - rung = self.inverse_rung_map[int(config.fidelity.value)] - configs_per_rung[rung] 
+= 1 - - # The square root means that we keep the approximate distribution between - # rungs as HyperBand - rungs_to_promote = configs_per_rung * np.power( - self.good_fraction, np.flip(np.sqrt(np.arange(self.num_rungs))) - ) - rungs_to_promote[-1] = 0 - next_rung_to_promote = np.arange(self.num_rungs)[rungs_to_promote > 1] - if len(next_rung_to_promote) == 0: - return [] - - next_fid_to_promote = self.rung_map[next_rung_to_promote[0]] - return [cfg for cfg in new_configs if cfg.fidelity.value == next_fid_to_promote] - - def _promote_existing(self, configs_for_promotion): - # TODO we still need to REMOVE the observation at the lower fidelity - # i.e. give it zero weight in the KDE, and ensure the count is correct - assert len(configs_for_promotion) > 0, "No promotable configurations" - if self.promote_from_acq: - acq_values = self.__call__(configs_for_promotion, only_lowest_fidelity=False) - else: - acq_values = self.__call__( - configs_for_promotion, only_lowest_fidelity=False, only_good=True - ) - - next_config = configs_for_promotion[np.argmax(acq_values)] - current_rung = self.inverse_rung_map[next_config.fidelity.value] - self.old_configs_per_fid[current_rung].append(next_config.copy()) - new_fidelity = self.rung_map[current_rung + 1] - next_config.fidelity.set_value(new_fidelity) - return next_config - - def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: - if self._num_train_x == 0 and self._initial_design_size >= 1: - # TODO only at lowest fidelity - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - config.fidelity.set_value(self.rung_map[self.min_rung]) - - elif self.is_init_phase(): - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=True - ) - config.fidelity.set_value(self.rung_map[self.min_rung]) - - elif random.random() < self._random_interleave_prob: - # TODO only at lowest fidelity - config = self.pipeline_space.sample( - patience=self.patience, ignore_fidelity=False, user_priors=False - ) - config.fidelity.set_vlaue(self.rung_map[self.min_rung]) - elif len(self._get_promotable_configs(self.train_x_configs)) > 0: - configs_for_promotion = self._get_promotable_configs(self.train_x_configs) - config = self._promote_existing(configs_for_promotion) - - else: - config = self.acquisition_sampler.sample(self.acquisition) - config.fidelity.set_value(self.rung_map[self.min_rung]) - - config_id = str(self._num_train_x + len(self._pending_evaluations) + 1) - return config.hp_values(), config_id, None - - def visualize_2d( - self, ax, previous_results, grid_points: int = 101, color: str = "k" - ): - X1 = np.linspace(0, 1, grid_points) - X2 = np.linspace(0, 1, grid_points) - X1, X2 = np.meshgrid(X1, X2) - X = np.append(X1.reshape(-1, 1), X2.reshape(-1, 1), axis=1) - Z = self.surrogate_models["good"]._pdf(X) / self.surrogate_models["bad"]._pdf(X) - Z_min, Z_max = -np.abs(Z).max(), np.abs(Z).max() - - Z = Z.reshape(grid_points, grid_points) - - c = ax.pcolormesh(X1, X2, Z, cmap=color, vmin=Z_min, vmax=Z_max) - ax.set_title("pcolormesh") - # set the limits of the plot to the limits of the data - ax.axis([0, 1, 0, 1]) - train_x_configs = [el.config for el in previous_results.values()] - np_X = self.surrogate_models["good"]._convert_configs_to_numpy(train_x_configs) - ax.scatter(np_X[:, 0], np_X[:, 1], s=100) - # ax.scatter(np_X[-1, 0], np_X[-1, 1], s=100, c='yellow') - - return ax - - def visualize_acq(self, previous_results, weights_per_fidelity): - import 
matplotlib.pyplot as plt - - train_x_configs = [el.config for el in previous_results.values()] - train_y = [self.get_loss(el.result) for el in previous_results.values()] - - filtered_configs, filtered_indices = self._filter_old_configs(train_x_configs) - configs_per_fid, losses_per_fid = self._split_by_fidelity( - train_x_configs, train_y - ) - filtered_y = np.array(train_y)[filtered_indices].tolist() - filtered_configs_per_fid, filtered_losses_per_fid = self._split_by_fidelity( - filtered_configs, filtered_y - ) - weight_per_fidelity = self.compute_fidelity_weights( - configs_per_fid, losses_per_fid - ) - good_configs, bad_configs, good_weights, bad_weights = self._split_configs( - filtered_configs_per_fid, filtered_losses_per_fid, weight_per_fidelity - ) - good_configs_np = self.surrogate_models["all"]._convert_configs_to_numpy( - good_configs - ) - bad_configs_np = self.surrogate_models["all"]._convert_configs_to_numpy( - bad_configs - ) - - fig, axes = plt.subplots(1, 3, figsize=(16, 9)) - axes[0] = self.surrogate_models["good"].visualize_2d(axes[0], color="RdBu") - axes[0].scatter( - good_configs_np[:, 0], - good_configs_np[:, 1], - c=good_weights, - cmap="spring", - s=50, - marker="x", - ) - axes[1] = self.surrogate_models["bad"].visualize_2d(axes[1], color="RdBu_r") - axes[1].scatter( - bad_configs_np[:, 0], - bad_configs_np[:, 1], - c=bad_weights, - s=50, - cmap="spring", - marker="x", - ) - axes[2] = self.visualize_2d(axes[2], previous_results, color="BrBG") - plt.show() diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 206078cb..307f806b 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -10,7 +10,7 @@ import torch from botorch.acquisition.analytic import SingleTaskGP from botorch.models import MixedSingleTaskGP -from botorch.models.gp_regression_mixed import CategoricalKernel +from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import MaternKernel, ScaleKernel @@ -149,7 +149,9 @@ def default_categorical_kernel( def default_single_obj_gp( x: TensorPack, y: torch.Tensor, -) -> SingleTaskGP: +) -> tuple[SingleTaskGP, Likelihood]: + if y.ndim == 1: + y = y.unsqueeze(-1) encoder = x.encoder numerics: list[int] = [] categoricals: list[int] = [] @@ -159,29 +161,33 @@ def default_single_obj_gp( else: numerics.append(encoder.index_of[hp_name]) + likelihood = default_likelihood_with_prior() + # Purely vectorial if len(categoricals) == 0: - return SingleTaskGP( + gp = SingleTaskGP( train_X=x.tensor, train_Y=y, mean_module=default_mean(), - likelihood=default_likelihood_with_prior(), + likelihood=likelihood, # Only matern kernel covar_module=default_matern_kernel(len(numerics)), outcome_transform=Standardize(m=1), ) + return gp, likelihood # Purely categorical if len(numerics) == 0: - return SingleTaskGP( + gp = SingleTaskGP( train_X=x.tensor, train_Y=y, mean_module=default_mean(), - likelihood=default_likelihood_with_prior(), + likelihood=likelihood, # Only categorical kernel covar_module=default_categorical_kernel(len(categoricals)), outcome_transform=Standardize(m=1), ) + return gp, likelihood # Mixed def cont_kernel_factory( @@ -203,14 +209,15 @@ def cont_kernel_factory( ), ) - return MixedSingleTaskGP( + gp = MixedSingleTaskGP( train_X=x.tensor, train_Y=y, cat_dims=categoricals, 
- likelihood=default_likelihood_with_prior(), + likelihood=likelihood, cont_kernel_factory=cont_kernel_factory, outcome_transform=Standardize(m=1), ) + return gp, likelihood def optimize_acq( diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index d9f322aa..2c4e5eeb 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -8,11 +8,15 @@ LinearMCObjective, qLogExpectedImprovement, ) +from botorch.fit import fit_gpytorch_mll from gpytorch import ExactMarginalLogLikelihood from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( - PiboAcquisition, +from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( + cost_cooled_acq, +) +from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( + pibo_acquisition, ) from neps.optimizers.bayesian_optimization.models.gp import ( default_single_obj_gp, @@ -23,6 +27,7 @@ from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: + from botorch.models.gp_regression_mixed import Likelihood from botorch.models.model import Model from neps.search_spaces import ( @@ -34,29 +39,88 @@ from neps.state import BudgetInfo, Trial -def _pibo_acq_beta_and_n( - n_sampled_already: int, - ndims: int, - budget_info: BudgetInfo, -) -> tuple[float, float]: +def _missing_fill_strategy( + y: torch.Tensor, + strategy: Literal["mean", "worst", "3std", "nan"], + *, + lower_is_better: bool, +) -> torch.Tensor: + # Assumes minimization + if y.ndim != 1: + raise ValueError("Only supports single objective optimization for now!") + + match strategy: + case "nan": + return y + case "mean": + return torch.nan_to_num(y, nan=y.mean().item()) + case "worst": + worst = y.min() if lower_is_better else y.max() + return torch.nan_to_num(y, nan=worst.item()) + case "3std": + sign = 1 if lower_is_better else -1 + std = y.std() + return torch.nan_to_num(y, nan=y.mean().item() + sign * 3 * std.item()) + case _: + raise ValueError(f"Unknown strategy: {strategy}") + + +def _missing_y_strategy(y: torch.Tensor) -> torch.Tensor: + # TODO: Figure out what to do if there's no reported loss value. + # Some strategies: + # 1. Replace with NaN, in which case GPYtorch ignores it + # * Good if crash is random crash, in which case we do not wish to model + # a performance because of it. + # 2. Replace with worst value seen so far + # * Good if crash is systematic, in which case we wish to model it as + # basically, "don't go here" while remaining in the range of possible + # values for the GP. + # 3. Replace with mean + # * Same as above but keeps the optimization of the GP landscape + # smoother. Good if we have a mix of non-systematic and systematic + # crashed. Likely the safest option as GP will likely be unconfident in + # unsystematic crash cases, especially if it seems like a rare-event. + # Will also unlikely be a candidate region if systematic and we observe + # a few crashes there. However would take longer to learn of systematic + # crash regions. 
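+    # The default below corresponds to option 3 above: fill missing losses with the mean.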
+ return _missing_fill_strategy(y, strategy="mean", lower_is_better=True) + + +def _missing_cost_strategy(cost: torch.Tensor) -> torch.Tensor: + # TODO: Figure out what to do if there's no reported cost value + # Likely best to just fill in worst cost seen so far as this crash + # cost us a lot of time and we do not want to waste time on this + # region again. However if the crash was random, we might enter some + # issues. + return _missing_fill_strategy(cost, strategy="3std", lower_is_better=True) + + +def _pibo_exp_term(n_sampled_already: int, ndims: int, budget_info: BudgetInfo) -> float: if budget_info.max_evaluations is not None: # From the PIBO paper (Section 4.1) # https://arxiv.org/pdf/2204.11051 + n = n_sampled_already beta = budget_info.max_evaluations / 10 - return n_sampled_already, beta - - if budget_info.max_cost_budget is not None: + elif budget_info.max_cost_budget is not None: # This might not work well if cost number is high # early on, but it will start to normalize. n = budget_info.used_cost_budget beta = budget_info.max_cost_budget / 10 - return n, beta + else: + # Otherwise, just some random heuristic based on the number + # of trials and dimensionality of the search space + # TODO: Think about and evaluate this more. + n = n_sampled_already + beta = ndims**2 / 10 - # Otherwise, just some random heuristic based on the number - # of trials and dimensionality of the search space - # TODO: Think about and evaluate this more. - beta = ndims**2 / 10 - return n_sampled_already, beta + return beta / n + + +def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float: + if budget_info.max_cost_budget is not None: + return budget_info.used_cost_budget / budget_info.max_cost_budget + + raise ValueError("No cost budget provided!") # TODO: This needs to be moved to the search space class, however @@ -105,9 +169,10 @@ def __init__( # noqa: D417 *, initial_design_size: int | None = None, surrogate_model: ( - Literal["gp"] | Callable[[TensorPack, torch.Tensor], Model] + Literal["gp"] | Callable[[TensorPack, torch.Tensor], tuple[Model, Likelihood]] ) = "gp", use_priors: bool = False, + use_cost: bool = False, sample_default_first: bool = False, device: torch.device | None = None, encoder: TensorEncoder | None = None, @@ -124,6 +189,15 @@ def __init__( # noqa: D417 surrogate_model: Surrogate model, either a known model str or a callable that takes in the training data and returns a model fitted to (X, y). use_priors: Whether to use priors set on the hyperparameters during search. + use_cost: Whether to consider reported "cost" from configurations in decision + making. If True, the optimizer will weigh potential candidates by how much + they cost, incentivising the optimizer to explore cheap, good performing + configurations. This amount is modified over time + + !!! warning + + If using `cost`, cost must be provided in the reports of the trials. + sample_default_first: Whether to sample the default configuration first. device: Device to use for the optimization. encoder: Encoder to use for encoding the configurations. 
If None, it will @@ -154,6 +228,7 @@ def __init__( # noqa: D417 params.update(pipeline_space.fidelities) self.encoder = TensorEncoder.default(params) if encoder is None else encoder + self.use_cost = use_cost self.prior = _make_prior(params) if use_priors is True else None self.device = device self.sample_default_first = sample_default_first @@ -176,8 +251,9 @@ def ask( "Seed is not yet implemented for BayesianOptimization" ) + n_trials = len(trials) space = self.pipeline_space - config_id = str(len(trials) + 1) + config_id = str(n_trials + 1) # Fill intitial design data if we don't have any... if self.initial_design_ is None: @@ -203,8 +279,8 @@ def ask( self.initial_design_.extend(configs) # If we havn't passed the intial design phase - if len(trials) <= len(self.initial_design_): - config = self.initial_design_[len(trials) - 1] + if n_trials <= len(self.initial_design_): + config = self.initial_design_[n_trials - 1] sample = SampledConfig(id=config_id, config=config, previous_config_id=None) return sample, optimizer_state @@ -212,45 +288,80 @@ def ask( # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... x_configs: list[dict[str, Any]] = [] ys: list[float] = [] + costs: list[float] = [] pending: list[dict[str, Any]] = [] for trial in trials.values(): if trial.state.pending(): pending.append(trial.config) else: assert trial.report is not None - # TODO: Figure out what to do if there's no reported loss value. - assert trial.report.loss is not None x_configs.append(trial.config) - ys.append(trial.report.loss) + ys.append( + trial.report.loss if trial.report.loss is not None else torch.nan + ) + if self.use_cost: + cost = trial.report.cost + costs.append(cost if cost is not None else torch.nan) x = self.encoder.pack(x_configs, device=self.device) - x_pending = ( - None if len(pending) == 0 else self.encoder.pack(pending, device=self.device) - ) - y = torch.tensor(ys, dtype=torch.float64, device=self.device) - if y.ndim == 1: - y = y.unsqueeze(1) - - model = self._get_model(x, y) + maybe_x_pending_tensor = None + if len(pending) > 0: + x_pending = self.encoder.pack(pending, device=self.device) + maybe_x_pending_tensor = x_pending.tensor - from botorch.fit import fit_gpytorch_mll + y = torch.tensor(ys, dtype=torch.float64, device=self.device) + y = _missing_y_strategy(y) - mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model) - _fit_mll = fit_gpytorch_mll(mll) + # Now fit our model + y_model, y_likelihood = self._get_model(x, y) + fit_gpytorch_mll( + ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) + ) acq = qLogExpectedImprovement( - model, + y_model, best_f=y.min(), - X_pending=None if x_pending is None else x_pending.tensor, + X_pending=maybe_x_pending_tensor, # Unfortunatly, there's no option to indicate that we minimize # the AcqFunction so we need to do some kind of transformation. # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 objective=LinearMCObjective(weights=torch.tensor([-1.0])), ) + + # If we should use the prior, weight the acquisition function by + # the probability of it being sampled from the prior. 
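The hunk above follows the usual BoTorch recipe: fit the surrogate by maximizing an ExactMarginalLogLikelihood with fit_gpytorch_mll, then hand the fitted model to qLogExpectedImprovement together with a negated LinearMCObjective so that a maximizing acquisition function effectively minimizes the loss. A minimal, self-contained sketch of that recipe, with SingleTaskGP and toy data standing in for the package's own surrogate factory:

import torch
from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_x = torch.rand(20, 3, dtype=torch.float64)            # encoded configs in [0, 1]^d
train_y = (train_x - 0.5).pow(2).sum(dim=-1, keepdim=True)  # observed losses, shape (n, 1)

gp = SingleTaskGP(train_x, train_y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(gp.likelihood, gp))

acq = qLogExpectedImprovement(
    model=gp,
    # In this sketch the incumbent is expressed in the negated objective's space.
    best_f=(-train_y).max(),
    # BoTorch acquisition functions maximize, so minimization is encoded by negation.
    objective=LinearMCObjective(weights=torch.tensor([-1.0], dtype=torch.float64)),
)

candidates = torch.rand(128, 1, 3, dtype=torch.float64)     # shape (batch, q=1, d)
print(acq(candidates).topk(3).indices)                      # most promising candidate indices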
if self.prior: - n, beta = _pibo_acq_beta_and_n(len(trials), self.encoder.ncols, budget_info) - acq = PiboAcquisition(acq, prior=self.prior, n=n, beta=beta) + acq = pibo_acquisition( + acq, + prior=self.prior, + prior_exponent=_pibo_exp_term(n_trials, self.encoder.ncols, budget_info), + x_domain=self.encoder.domains, + X_pending=maybe_x_pending_tensor, + ) + + # If we should use cost, weight the acquisition function by the cost + # of the configurations. + if self.use_cost: + cost = torch.tensor(costs, dtype=torch.float64, device=self.device) + cost = _missing_cost_strategy(cost) + + # TODO: We might want a different model for cost estimation... one reason + # is that cost estimates are likely to be a lot noisier than the likelihood + # we have by default. + cost_model, cost_likelihood = self._get_model(x, cost) + + # Optimize the cost model + fit_gpytorch_mll( + ExactMarginalLogLikelihood(likelihood=cost_likelihood, model=cost_model) + ) + acq = cost_cooled_acq( + acq_fn=acq, + model=cost_model, + likelihood=cost_likelihood, + used_budget_percentage=_cost_used_budget_percentage(budget_info), + ) + # Finally, optimize the acquisition function to get a configuration candidates, _eis = optimize_acq(acq_fn=acq, encoder=self.encoder, acq_options={}) assert len(candidates) == 1, "Expected only one candidate!" diff --git a/neps/optimizers/bayesian_optimization/sobol.py b/neps/optimizers/bayesian_optimization/sobol.py deleted file mode 100644 index e69de29b..00000000 diff --git a/neps/optimizers/initial_design.py b/neps/optimizers/initial_design.py deleted file mode 100644 index c8039c6a..00000000 --- a/neps/optimizers/initial_design.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Initial design of points for optimization.""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING -from typing_extensions import override - -if TYPE_CHECKING: - import torch - - from neps.priors import Prior - from neps.search_spaces.domain import Domain - - -@dataclass -class PriorInitialDesign(InitialDesign): - """Sample from a prior distribution.""" - - prior: Prior - """The prior to sample from.""" - - # TODO: Right now we don't have a way to set the seed temporarily - seed: int | None = None - """The seed for sampling.""" - - @override - def sample(self, n: int) -> torch.Tensor: - return self.prior.sample(n) - - @property - @override - def sample_domain(self) -> list[Domain]: - return self.prior.domains diff --git a/neps/distributions.py b/neps/sampling/distributions.py similarity index 99% rename from neps/distributions.py rename to neps/sampling/distributions.py index 2361e191..fb552949 100644 --- a/neps/distributions.py +++ b/neps/sampling/distributions.py @@ -225,6 +225,6 @@ def log_prob(self, value): @dataclass -class DistributionOverDomain: +class TorchDistributionWithDomain: distribution: Distribution domain: Domain diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 2deda010..03b64122 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -15,7 +15,7 @@ import torch -from neps.distributions import DistributionOverDomain, TruncatedNormal +from neps.sampling.distributions import TorchDistributionWithDomain, TruncatedNormal from neps.sampling.samplers import Sampler, WeightedSampler from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain @@ -167,11 +167,11 @@ def make_centered( f"Please provide a center for all domains." 
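The cost_cooled_acq helper used above is defined elsewhere in this series; as a rough sketch of the idea only (not of that helper's implementation), cost cooling divides the raw acquisition value by the predicted cost raised to an exponent that shrinks as the budget is consumed, so cheap configurations are favoured early on and the penalty fades towards plain acquisition later. The toy function below assumes plain (non-log) acquisition values and strictly positive cost predictions.

import torch

def cost_cooled(acq_values: torch.Tensor, predicted_cost: torch.Tensor,
                used_budget_fraction: float) -> torch.Tensor:
    """Toy cost cooling: acq / cost**alpha, with alpha decaying from 1 to 0."""
    alpha = 1.0 - used_budget_fraction        # early: strong cost penalty, late: almost none
    cost = predicted_cost.clamp_min(1e-6)     # guard against zero or negative predictions
    return acq_values / cost.pow(alpha)

acq = torch.tensor([0.9, 0.8, 0.5])
cost = torch.tensor([10.0, 1.0, 0.1])
print(cost_cooled(acq, cost, used_budget_fraction=0.1))   # cheap configurations dominate early
print(cost_cooled(acq, cost, used_budget_fraction=0.9))   # cost matters far less late in the run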
) - distributions: list[DistributionOverDomain] = [] + distributions: list[TorchDistributionWithDomain] = [] for name, domain in domains.items(): center_confidence = centers.get(name) if center_confidence is None: - dist = DistributionOverDomain( + dist = TorchDistributionWithDomain( distribution=torch.distributions.Uniform(0.0, 1.0), domain=UNIT_FLOAT_DOMAIN, ) @@ -202,7 +202,7 @@ def make_centered( weights[center] = confidence - dist = DistributionOverDomain( + dist = TorchDistributionWithDomain( distribution=torch.distributions.Categorical(probs=weights), domain=domain, ) @@ -213,7 +213,7 @@ def make_centered( unit_center = domain.to_unit( torch.tensor(center, device=device, dtype=torch.float64) ) - dist = DistributionOverDomain( + dist = TorchDistributionWithDomain( distribution=TruncatedNormal( loc=unit_center, scale=(1 - confidence), @@ -254,7 +254,7 @@ class CenteredPrior(Prior): [`Prior.make_centered()`][neps.priors.Prior.make_centered]. """ - distributions: list[DistributionOverDomain] + distributions: list[TorchDistributionWithDomain] """Distributions along with the corresponding domains they sample from.""" _distribution_domains: list[Domain] = field(init=False, repr=False) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index 6802f6d7..c7456155 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -40,11 +40,9 @@ def sample( will be added with [`.ncols`][neps.samplers.Sampler.ncols]. For example, if `n = 5`, the output will be `(5, ncols)`. If `n = (5, 3)`, the output will be `(5, 3, ncols)`. - to: The domain or list of domains to cast the points to. - If a single domain, all points are cast to that domain, otherwise - each column `ndim_i` in (n, ndim) is cast to the corresponding domain - in `to`. As a result, the length of `to` must match the number of columns - from [`.ncols`][neps.samplers.Sampler.ncols]. + to: If a single domain, `.ncols` columns will be produced form that one + domain. If a list of domains, then it must have the same length as the + number of columns, with each column being in the corresponding domain. seed: The seed for the random number generator. device: The device to cast the samples to. @@ -58,7 +56,7 @@ def sobol(cls, ndim: int, *, scramble: bool = True, seed: int | None = None) -> """Create a Sobol sampler. Args: - ndim: The number of dimensions to sample for. + ndim: The number of columns to sample. scramble: Whether to scramble the Sobol sequence. seed: The seed for the Sobol sequence. @@ -82,6 +80,13 @@ class Sobol(Sampler): scramble: bool = True """Whether to scramble the Sobol sequence.""" + def __post_init__(self): + if self.ndim < 1: + raise ValueError( + "The number of dimensions must be at least 1." + f" Got {self.ndim} dimensions." 
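A rough standalone illustration of the centered construction above: a numerical default becomes a truncated normal on the unit interval whose spread is 1 - confidence, while a categorical default becomes a weight vector that places `confidence` mass on the favoured index. scipy.stats.truncnorm and the equal split of the remaining categorical mass are assumptions of this sketch, not the package's API.

import torch
from scipy.stats import truncnorm

confidence = 0.75

# Numerical parameter: default mapped to the unit interval, spread = 1 - confidence.
unit_center, scale = 0.6, 1.0 - confidence
numeric_prior = truncnorm(a=(0.0 - unit_center) / scale, b=(1.0 - unit_center) / scale,
                          loc=unit_center, scale=scale)
print(numeric_prior.rvs(size=5))   # samples concentrated around 0.6, truncated to [0, 1]

# Categorical parameter with four choices, favouring index 2.
n_choices, favoured = 4, 2
weights = torch.full((n_choices,), (1.0 - confidence) / (n_choices - 1))
weights[favoured] = confidence
print(torch.distributions.Categorical(probs=weights).sample((5,)))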
+ ) + @property @override def ncols(self) -> int: @@ -180,13 +185,13 @@ def sample( total_samples = reduce(lambda x, y: x * y, n) output_shape = (*n, self.ncols) - # Randomly select which prior to sample from for each of the total_samples - chosen_priors = torch.empty((total_samples,), device=device, dtype=torch.int64) - chosen_priors = torch.multinomial( + # Randomly select which sampler to sample from for each of the total_samples + chosen_samplers = torch.empty((total_samples,), device=device, dtype=torch.int64) + chosen_samplers = torch.multinomial( self.probabilities, total_samples, replacement=True, - out=chosen_priors, + out=chosen_samplers, ) # Create an empty tensor to hold all samples @@ -194,16 +199,16 @@ def sample( (total_samples, self.ncols), device=device, dtype=torch.float64 ) - # Loop through each prior and its associated indices - for i, prior in enumerate(self.samplers): - # Find indices where the chosen prior is i + # Loop through each sampler and its associated indices + for i, sampler in enumerate(self.samplers): + # Find indices where the chosen sampler is i _i = torch.tensor(i, dtype=torch.int64, device=device) - indices = torch.where(chosen_priors == _i)[0] + indices = torch.where(chosen_samplers == _i)[0] if len(indices) > 0: - # Sample from the prior for the required number of indices - samples_from_prior = prior.sample(len(indices), to=to, device=device) - output_samples[indices] = samples_from_prior + # Sample from the sampler for the required number of indices + samples_from_sampler = sampler.sample(len(indices), to=to, device=device) + output_samples[indices] = samples_from_sampler # Reshape to the output shape including ncols dimension output_samples = output_samples.view(output_shape) diff --git a/neps/search_spaces/distributions/__init__.py b/neps/search_spaces/distributions/__init__.py deleted file mode 100644 index 65151e66..00000000 --- a/neps/search_spaces/distributions/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.distributions.truncnorm import TruncNormDistribution -from neps.search_spaces.distributions.uniform_float import UniformFloatDistribution -from neps.search_spaces.distributions.uniform_int import UniformIntDistribution -from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution - -UNIT_UNIFORM = UniformFloatDistribution.new(0.0, 1.0) - -__all__ = [ - "Distribution", - "TruncNormDistribution", - "UniformFloatDistribution", - "UniformIntDistribution", - "UNIT_UNIFORM", - "WeightedIntsDistribution", -] diff --git a/neps/search_spaces/distributions/distribution.py b/neps/search_spaces/distributions/distribution.py deleted file mode 100644 index 7ab4dd6f..00000000 --- a/neps/search_spaces/distributions/distribution.py +++ /dev/null @@ -1,21 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, TypeVar -from typing_extensions import Protocol - -V = TypeVar("V", int, float) - - -if TYPE_CHECKING: - from torch import Generator, Tensor - - from neps.search_spaces.domain import Domain - - -class Distribution(Protocol[V]): - @property - def domain(self) -> Domain[V]: ... - - def sample(self, n: int, to: Domain, *, seed: Generator) -> Tensor: ... - - def likelihood(self, value: Tensor) -> Tensor: ... 
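The WeightedSampler.sample hunk above draws, for each requested point, which member sampler to use via torch.multinomial and then fills the output tensor row by row. A compact standalone sketch of that pattern; the two lambda samplers are purely illustrative:

import torch

def sample_mixture(samplers, probabilities: torch.Tensor, n: int, ncols: int) -> torch.Tensor:
    """Draw n points, each from one member sampler chosen with the given probabilities."""
    chosen = torch.multinomial(probabilities, n, replacement=True)  # (n,) sampler indices
    out = torch.empty((n, ncols), dtype=torch.float64)
    for i, sampler in enumerate(samplers):
        idx = torch.where(chosen == i)[0]
        if len(idx) > 0:
            out[idx] = sampler(len(idx))                            # fill only those rows
    return out

samplers = [
    lambda k: torch.rand(k, 2, dtype=torch.float64),                              # "uniform" member
    lambda k: (0.5 + 0.05 * torch.randn(k, 2, dtype=torch.float64)).clamp(0, 1),  # "prior-like" member
]
points = sample_mixture(samplers, torch.tensor([0.3, 0.7]), n=8, ncols=2)
print(points.shape)                                                 # torch.Size([8, 2])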
diff --git a/neps/search_spaces/distributions/truncnorm.py b/neps/search_spaces/distributions/truncnorm.py deleted file mode 100644 index 3938cf1c..00000000 --- a/neps/search_spaces/distributions/truncnorm.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from functools import lru_cache -from typing import TYPE_CHECKING, Any -from typing_extensions import override - -import torch -from torch import Tensor - -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.domain import Domain - -if TYPE_CHECKING: - from neps.utils.types import Number - -INT_HIGH = 1_000_000 - - -@lru_cache -def _truncnorm(a: float, b: float, loc: float, scale: float) -> Any: - from scipy.stats import truncnorm - - return truncnorm(a=a, b=b, loc=loc, scale=scale) - - -@dataclass(frozen=True) -class TruncNormDistribution(Distribution[float]): - domain: Domain[float] - center: float - std: float - truncnorm: Any - - @override - def sample(self, n: int, seed: torch.Generator) -> Tensor: - random_state = torch.randint(INT_HIGH, size=(1,), generator=seed) - rv = self.truncnorm.rvs(size=n, random_state=random_state.item()) - return torch.tensor(rv, dtype=self.domain.dtype) - - @override - def likelihood(self, value: Tensor) -> Tensor: - return self.truncnorm.pdf(value.numpy()) - - def normalize(self) -> TruncNormDistribution: - # Send to unit domain - center = float(self.domain.from_unit(torch.tensor(self.center)).item()) - std = self.std / self.domain.length - - return TruncNormDistribution( - domain=Domain.unit_float(), - center=center, - std=std, - truncnorm=_truncnorm( - a=(0 - center) / std, - b=(1 - center) / std, - loc=center, - scale=std, - ), - ) - - def with_center_and_confidence( - self, - center: Number, - confidence: float, - ) -> TruncNormDistribution: - assert 0 <= confidence <= 1 - assert self.domain.lower <= center <= self.domain.upper - std = 1 - confidence - center = float(center) - return TruncNormDistribution( - domain=self.domain, - center=center, - std=std, - truncnorm=_truncnorm( - a=(self.domain.lower - center) / std, - b=(self.domain.upper - center) / std, - loc=center, - scale=std, - ), - ) - - @classmethod - def new( - cls, - lower: Number, - center: Number, - upper: Number, - *, - std: Number, - std_is_normalized: bool, - ) -> TruncNormDistribution: - assert lower <= center <= upper, f"{lower} <= {center} <= {upper}" - center = float(center) - - if std_is_normalized: - assert 0 <= std <= 1 - std = float((upper - lower) * std) - else: - assert std > 0 - std = float(std) - - return cls( - domain=Domain.float(float(lower), float(upper)), - center=center, - std=std, - truncnorm=_truncnorm( - a=(lower - center) / std, - b=(upper - center) / std, - loc=center, - scale=std, - ), - ) diff --git a/neps/search_spaces/distributions/uniform_float.py b/neps/search_spaces/distributions/uniform_float.py deleted file mode 100644 index bdb43ee8..00000000 --- a/neps/search_spaces/distributions/uniform_float.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing_extensions import override - -import torch -from torch import Tensor - -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain - -INT_HIGH = 1_000_000 - - -@dataclass(frozen=True) -class UniformFloatDistribution(Distribution[float]): - domain: Domain[float] - _pdf: float = field(repr=False) - - @override - 
def sample(self, n: int, to: Domain, seed: torch.Generator) -> Tensor: - # This creates samples in a unit float domain, rather than - # the `.domain` attribute of this distribution. Rather than scale - # up twice, we just scale directly form the UNIT_FLOAT_DOMAIN - # We still however need the `.domain` attribute for `likelihood` - unit_samples = torch.rand(n, generator=seed) - return to.cast(unit_samples, UNIT_FLOAT_DOMAIN) - - @override - def likelihood(self, value: Tensor) -> Tensor: - return torch.where( - (value >= self.domain.lower) & (value <= self.domain.upper), - self._pdf, - 0.0, - ) - - @classmethod - def new(cls, lower: int | float, upper: int | float) -> UniformFloatDistribution: - _pdf = 1.0 / (upper - lower) - return cls(Domain.float(lower, upper), _pdf=_pdf) - - @classmethod - def unit_distribution(cls) -> UniformFloatDistribution: - return UNIT_UNIFORM_FLOAT - - -UNIT_UNIFORM_FLOAT = UniformFloatDistribution.new(0.0, 1.0) diff --git a/neps/search_spaces/distributions/uniform_int.py b/neps/search_spaces/distributions/uniform_int.py deleted file mode 100644 index 8fd7b043..00000000 --- a/neps/search_spaces/distributions/uniform_int.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import TYPE_CHECKING -from typing_extensions import override - -import torch -from torch import Tensor - -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.domain import Domain - -if TYPE_CHECKING: - from neps.utils.types import Number - - -@dataclass(frozen=True) -class UniformIntDistribution(Distribution[int]): - domain: Domain[int] - _pdf: float = field(repr=False) - - @override - def sample(self, n: int, to: Domain, *, seed: torch.Generator) -> Tensor: - samples = torch.randint( - self.domain.lower, - self.domain.upper, - size=(n,), - generator=seed, - ) - return to.cast(samples, frm=self.domain) - - @override - def likelihood(self, value: Tensor) -> Tensor: - return torch.where( - (value >= self.domain.lower) & (value <= self.domain.upper), - self._pdf, - 0.0, - ) - - @classmethod - def indices(cls, n: int) -> UniformIntDistribution: - return cls(Domain.int(0, n - 1), _pdf=1.0 / n) - - @classmethod - def new(cls, lower: Number, upper: Number) -> UniformIntDistribution: - return cls(Domain.int(lower, upper), _pdf=1.0 / (upper - lower)) diff --git a/neps/search_spaces/distributions/weighted_ints.py b/neps/search_spaces/distributions/weighted_ints.py deleted file mode 100644 index 3c8c60c5..00000000 --- a/neps/search_spaces/distributions/weighted_ints.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import annotations - -import warnings -from dataclasses import dataclass -from typing import TYPE_CHECKING, ClassVar, Sequence -from typing_extensions import override - -import torch -from torch import Tensor - -from neps.search_spaces.distributions.distribution import Distribution -from neps.search_spaces.domain import Domain - -if TYPE_CHECKING: - from neps.utils.types import Number - - -@dataclass(frozen=True) -class WeightedIntsDistribution(Distribution[int]): - # NOTE: Having a Million weights is very resource intense and super slow - # for sampling, especially given our common use case is to have only one weight - # with the rest being uniform. 100 is well out of scope for what was intended, - # as this is mostly intended for categoricals. 
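The deleted UniformFloatDistribution.sample above leans on a common trick: draw once in the unit interval and rescale into the requested domain rather than sampling in the parameter's own range. A two-line sketch of that cast, where the linear rescaling stands in for Domain.cast:

import torch

unit = torch.rand(5, generator=torch.Generator().manual_seed(0))  # samples in [0, 1)
lower, upper = 1e-4, 1e-1                                         # illustrative target domain
print(lower + (upper - lower) * unit)                             # linear cast into [lower, upper)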
- # If we need this, then we should make a more efficient implementation, - # such as one that uniform samples and then with probability `weight` - # replaces the value with the favoured value. - LIMIT_FOR_WEIGHTED_INTS: ClassVar[int] = 200 - - domain: Domain[int] - weights: Tensor - - @override - def sample(self, n: int, to: Domain, *, seed: torch.Generator) -> Tensor: - rand_tensor = torch.multinomial( - self.weights, - n, - replacement=True, - generator=seed, - ) - return to.cast(rand_tensor, frm=self.domain) - - @override - def likelihood(self, value: Tensor) -> Tensor: - valid_indices = torch.logical_and( - value >= self.domain.lower, value <= self.domain.upper - ) - psuedo_indices = torch.where(valid_indices, value, 0) - probs = self.weights[psuedo_indices] - return torch.where(valid_indices, probs, 0) - - @classmethod - def new(cls, weights: Sequence[Number] | Tensor) -> WeightedIntsDistribution: - if len(weights) > cls.LIMIT_FOR_WEIGHTED_INTS: - raise ValueError( - f"Having {len(weights)} weights is very resource intense and slow" - " for sampling. Consider using a more efficient implementation" - " if you need this many weights.", - ) - return cls( - weights=torch.as_tensor(weights, dtype=torch.float64), - domain=Domain.indices(len(weights)), - ) - - @classmethod - def with_favoured( - cls, - n: int, - favoured: int, - confidence: float, - ) -> WeightedIntsDistribution: - if n > cls.LIMIT_FOR_WEIGHTED_INTS: - raise ValueError( - f"Having {n} weights is very resource intense and slow" - " for sampling. Consider using a more efficient implementation" - " if you need this many weights.", - ) - - assert 0.0 <= confidence <= 1.0 - remaining = 1.0 - confidence - rest = remaining / (n - 1) - if confidence < rest: - warnings.warn( - f"Weight {confidence} is less than the rest {rest}." 
- " This will make the favoured value less likely to be sampled" - " than the rest of the values.", - UserWarning, - stacklevel=2, - ) - dist = torch.full(size=(n,), fill_value=rest, dtype=torch.float64) - dist[favoured] = confidence - return cls(weights=dist, domain=Domain.indices(n)) diff --git a/neps/search_spaces/samplers/__init__.py b/neps/search_spaces/samplers/__init__.py deleted file mode 100644 index 784b5aa4..00000000 --- a/neps/search_spaces/samplers/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from neps.search_spaces.samplers.prior import PriorSampler -from neps.search_spaces.samplers.sampler import Sampler -from neps.search_spaces.samplers.uniform import UniformSampler - -__all__ = [ - "Sampler", - "UniformSampler", - "PriorSampler", -] diff --git a/neps/search_spaces/samplers/model.py b/neps/search_spaces/samplers/model.py deleted file mode 100644 index c413b6bf..00000000 --- a/neps/search_spaces/samplers/model.py +++ /dev/null @@ -1,186 +0,0 @@ -from __future__ import annotations - -import logging -from typing import TYPE_CHECKING, Any, Mapping - -import numpy as np - -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, -) -from neps.optimizers.bayesian_optimization.kernels.get_kernels import get_kernels -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping -from neps.search_spaces.samplers.sampler import Sampler -from neps.search_spaces.samplers.uniform import UniformSampler -from neps.utils.common import instance_from_map - -logger = logging.getLogger(__name__) - -if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) - from neps.search_spaces import SearchSpace - from neps.utils.types import Number - - -class ModelPolicy(Sampler): - """A policy for sampling configuration, i.e. the default for SH / hyperband. - - Args: - SamplingPolicy ([type]): [description] - """ - - def __init__( - self, - *, - space: SearchSpace, - surrogate_model: str | Any = "gp", - surrogate_model_args: Mapping[str, Any] | None = None, - domain_se_kernel: str | None = None, - graph_kernels: list | None = None, - hp_kernels: list | None = None, - acquisition: str | BaseAcquisition | type[BaseAcquisition] = "EI", - acquisition_sampler: ( - str | AcquisitionSampler | type[AcquisitionSampler] - ) = "random", - patience: int = 100, - ): - surrogate_model_args = dict(surrogate_model_args) if surrogate_model_args else {} - - graph_kernels, hp_kernels = get_kernels( - pipeline_space=space, - domain_se_kernel=domain_se_kernel, - graph_kernels=graph_kernels, - hp_kernels=hp_kernels, - optimal_assignment=False, - ) - - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = None - - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - - if not surrogate_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - - if "vectorial_features" not in surrogate_model_args: - # TODO: Graph gets ignored? 
- surrogate_model_args["vectorial_features"] = { - "continuous": len(space.numericals), - "categorical": len(space.categoricals), - } - - # TODO: What the hell type is this - self.surrogate_model: Any = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) - - self.acquisition: BaseAcquisition = instance_from_map( - AcquisitionMapping, - acquisition, # type: ignore - name="acquisition function", - ) - - self.acquisition_sampler: AcquisitionSampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, # type: ignore - name="acquisition sampler function", - kwargs={"patience": patience, "pipeline_space": space}, - ) - self.uniform_sampler = UniformSampler.new(space) - - def _fantasize_pending(self, train_x, train_y, pending_x): - if len(pending_x) == 0: - return train_x, train_y - - self.surrogate_model.fit(train_x, train_y) - # hallucinating: predict for the pending evaluations - _y, _ = self.surrogate_model.predict(pending_x) - _y = _y.detach().numpy().tolist() - # appending to training data - train_x.extend(pending_x) - train_y.extend(_y) - return train_x, train_y - - def update_model(self, train_x, train_y, pending_x, decay_t=None): - if decay_t is None: - decay_t = len(train_x) - train_x, train_y = self._fantasize_pending(train_x, train_y, pending_x) - self.surrogate_model.fit(train_x, train_y) - self.acquisition.set_state(self.surrogate_model, decay_t=decay_t) - # TODO: set_state should generalize to all options - # no needed to set state of sampler when using `random` - # self.acquisition_sampler.set_state(x=train_x, y=train_y) - - def sample( - self, - n: int, - *, - active_max_fidelity: Mapping[str, Number] | None = None, - fidelity: Mapping[str, Number] | None = None, - seed: np.random.Generator, - ) -> SearchSpace: - """Performs the equivalent of optimizing the acquisition function. - - Performs 2 strategies as per the arguments passed: - * If fidelity is not None, triggers the case when the surrogate has been - trained jointly with the fidelity dimension, i.e., all observations ever - recorded. In this case, the EI for random samples is evaluated at the - `fidelity` where the new sample will be evaluated. The top-10 are selected, - and the EI for them is evaluated at the target/mmax fidelity. - * If active_max_fidelity is not None, triggers the case when a surrogate is - trained per fidelity. In this case, all samples have their fidelity - variable set to the same value. This value is same as that of the fidelity - value of the configs in the training data. 
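The _fantasize_pending method above is the usual "hallucinate pending points" trick: fit the surrogate on finished trials, predict the still-running ones, and append those predictions as if they were real observations before refitting, so the next acquisition step avoids re-proposing the same regions. A hedged BoTorch sketch of the same idea, with SingleTaskGP and toy arrays standing in for the policy's surrogate:

import torch
from botorch.fit import fit_gpytorch_mll
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

train_x = torch.rand(12, 2, dtype=torch.float64)
train_y = train_x.sum(dim=-1, keepdim=True)
pending_x = torch.rand(3, 2, dtype=torch.float64)   # configs still being evaluated

gp = SingleTaskGP(train_x, train_y)
fit_gpytorch_mll(ExactMarginalLogLikelihood(gp.likelihood, gp))

# Hallucinate outcomes for the pending configs with the posterior mean, then refit
# on the augmented data.
with torch.no_grad():
    fantasy_y = gp.posterior(pending_x).mean
gp_aug = SingleTaskGP(torch.cat([train_x, pending_x]), torch.cat([train_y, fantasy_y]))
fit_gpytorch_mll(ExactMarginalLogLikelihood(gp_aug.likelihood, gp_aug))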
- """ - logger.info("Acquiring...") - - # sampling random configurations - samples = [ - self.space.sample(user_priors=False, ignore_fidelity=True) - for _ in range(SAMPLE_THRESHOLD) - ] - - if fidelity is not None: - # w/o setting this flag, the AF eval will set all fidelities to max - self.acquisition.optimize_on_max_fidelity = False - _inc_copy = self.acquisition.incumbent - # TODO: better design required, for example, not import torch - # right now this case handles the 2-step acquisition in `sample` - if "incumbent" in kwargs: - # sets the incumbent to the best score at the required fidelity for - # correct computation of EI scores - self.acquisition.incumbent = torch.tensor(kwargs["incumbent"]) - # updating the fidelity of the sampled configurations - samples = list(map(update_fidelity, samples, [fidelity] * len(samples))) - # computing EI at the given `fidelity` - eis = self.acquisition.eval(x=samples, asscalar=True) - # extracting the 10 highest scores - _ids = np.argsort(eis)[-TOP_EI_SAMPLE_COUNT:] - samples = pd.Series(samples).iloc[_ids].values.tolist() - # setting the fidelity to the maximum fidelity - self.acquisition.optimize_on_max_fidelity = True - self.acquisition.incumbent = _inc_copy - - if active_max_fidelity is not None: - # w/o setting this flag, the AF eval will set all fidelities to max - self.acquisition.optimize_on_max_fidelity = False - fidelity = active_max_fidelity - samples = list(map(update_fidelity, samples, [fidelity] * len(samples))) - - # computes the EI for all `samples` - eis = self.acquisition.eval(x=samples, asscalar=True) - # extracting the highest scored sample - return samples[np.argmax(eis)] - # TODO: can generalize s.t. sampler works for all types, currently, - # random sampler in NePS does not do what is required here - # return self.acquisition_sampler.sample(self.acquisition) diff --git a/neps/search_spaces/samplers/prior.py b/neps/search_spaces/samplers/prior.py deleted file mode 100644 index 65165cae..00000000 --- a/neps/search_spaces/samplers/prior.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Mapping -from typing_extensions import Self, override - -from neps.search_spaces.config import Config -from neps.search_spaces.distributions.uniform_int import UniformIntDistribution -from neps.search_spaces.distributions.weighted_ints import WeightedIntsDistribution -from neps.search_spaces.samplers.sampler import Sampler - -if TYPE_CHECKING: - import numpy as np - - from neps.search_spaces.distributions.distribution import Distribution - from neps.search_spaces.search_space import SearchSpace - - -@dataclass -class PriorSampler(Sampler): - search_space: SearchSpace - - _numerical_distributions: Mapping[str, Distribution] - _categorical_distributions: Mapping[str, Distribution] - - @override - def sample_configs( - self, - n: int, - *, - fidelity: Mapping[str, float] | None, - seed: np.random.Generator, - with_constants: bool = True, - ) -> list[Config]: - numerical_samples = {} - for k, dist in self._numerical_distributions.items(): - param = self.search_space.numericals[k] - numerical_samples[k] = dist.sample(n, to=param.domain, seed=seed) - - categorical_samples = {} - for k, dist in self._categorical_distributions.items(): - cat = self.search_space.categoricals[k] - domain = cat.domain - samples = dist.sample(n, to=domain, seed=seed) - choices = cat.lookup(samples) - categorical_samples[k] = choices - - graph_samples = {} - for k, v in 
self.search_space.graphs.items(): - graph_samples[k] = [v.sample() for _ in range(n)] - - _constants = self.search_space.constants if with_constants else {} - - return [ - Config( - values={ - **{k: samples[i] for k, samples in numerical_samples.items()}, - **{k: samples[i] for k, samples in categorical_samples.items()}, - **{k: samples[i] for k, samples in graph_samples.items()}, - **_constants, - }, - fidelity=fidelity, - ) - for i in range(n) - ] - - @classmethod - def new( - cls, - space: SearchSpace, - prior: Mapping[str, tuple[Any, float]], - *, - replace_missing_with_uniform: bool = True, - ) -> Self: - missing = set(space.hyperparameters) - set(prior.keys()) - if not replace_missing_with_uniform and any(missing): - raise ValueError( - "If `replace_missing_with_uniform` is False, the prior must be defined" - f" for all parameters. Missing prior for: {missing}" - ) - - numerical_distributions = { - hp_name: ( - hp.domain.truncnorm_distribution(center=p[0], confidence=p[1]) - if (p := prior.get(hp_name)) - else hp.domain.uniform_distribution() - ) - for hp_name, hp in space.numericals.items() - } - # NOTE: It would be nice to somehow check if the prior given for - # a categorical was an index or a value in the categorical. - # Since it's much more efficient to hold on to the index, we will - # assume that for now. - categorical_distribution = { - hp_name: ( - WeightedIntsDistribution.with_favoured( - n=cat.size, - favoured=cat.index(p[0]), - confidence=p[1], - ) - if (p := prior.get(hp_name)) - else UniformIntDistribution.indices(cat.size) - ) - for hp_name, cat in space.categoricals.items() - } - return cls( - space, - _numerical_distributions=numerical_distributions, - _categorical_distributions=categorical_distribution, - ) diff --git a/neps/search_spaces/samplers/sampler.py b/neps/search_spaces/samplers/sampler.py deleted file mode 100644 index f104a3a5..00000000 --- a/neps/search_spaces/samplers/sampler.py +++ /dev/null @@ -1,22 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING, Mapping -from typing_extensions import Protocol - -if TYPE_CHECKING: - import numpy as np - - from neps.search_spaces.config import Config - from neps.utils.types import Number - - -@dataclass -class Sampler(Protocol): - def sample_configs( - self, - n: int, - *, - fidelity: Mapping[str, Number] | None, - seed: np.random.Generator, - ) -> list[Config]: ... 
diff --git a/neps/search_spaces/samplers/uniform.py b/neps/search_spaces/samplers/uniform.py deleted file mode 100644 index 88060932..00000000 --- a/neps/search_spaces/samplers/uniform.py +++ /dev/null @@ -1,79 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING, Mapping -from typing_extensions import Self, override - -from neps.search_spaces.config import Config -from neps.search_spaces.distributions.uniform_int import UniformIntDistribution -from neps.search_spaces.samplers.sampler import Sampler - -if TYPE_CHECKING: - import numpy as np - - from neps.search_spaces.distributions.distribution import Distribution - from neps.search_spaces.search_space import SearchSpace - - -@dataclass -class UniformSampler(Sampler): - search_space: SearchSpace - - _numerical_distributions: Mapping[str, Distribution] - _categorical_distributions: Mapping[str, Distribution] - - @override - def sample_configs( - self, - n: int, - *, - fidelity: Mapping[str, float] | None = None, - seed: np.random.Generator, - with_constants: bool = True, - ) -> list[Config]: - numerical_samples = {} - for k, dist in self._numerical_distributions.items(): - param = self.search_space.numericals[k] - numerical_samples[k] = dist.sample(n, to=param.domain, seed=seed) - - categorical_samples = {} - for k, dist in self._categorical_distributions.items(): - cat = self.search_space.categoricals[k] - domain = cat.domain - samples = dist.sample(n, to=domain, seed=seed) - choices = cat.lookup(samples) - categorical_samples[k] = choices - - graph_samples = {} - for k, v in self.search_space.graphs.items(): - graph_samples[k] = [v.sample() for _ in range(n)] - - _constants = self.search_space.constants if with_constants else {} - - return [ - Config( - { - **{k: samples[i] for k, samples in numerical_samples.items()}, - **{k: samples[i] for k, samples in categorical_samples.items()}, - **{k: samples[i] for k, samples in graph_samples.items()}, - **_constants, - }, - fidelity=fidelity, - ) - for i in range(n) - ] - - @classmethod - def new(cls, space: SearchSpace) -> Self: - numerical_distributions = { - k: p.domain.uniform_distribution() for k, p in space.numericals.items() - } - categorical_distribution = { - k: UniformIntDistribution.indices(p.size) - for k, p in space.categoricals.items() - } - return cls( - space, - _numerical_distributions=numerical_distributions, - _categorical_distributions=categorical_distribution, - ) diff --git a/neps/search_spaces/samplers/weighted_sampler.py b/neps/search_spaces/samplers/weighted_sampler.py deleted file mode 100644 index 32e51908..00000000 --- a/neps/search_spaces/samplers/weighted_sampler.py +++ /dev/null @@ -1,51 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Mapping -from typing_extensions import Self, override - -import numpy as np - -from neps.search_spaces.samplers.sampler import Sampler -from neps.utils.types import Arr, Number, f64 - -if TYPE_CHECKING: - from neps.search_spaces.config import Config - - -@dataclass -class WeightedSampler(Sampler): - weights: dict[str, float] - samplers: dict[str, Sampler] - - _probabilities: Arr[f64] = field(init=False, repr=False, compare=False) - _samplers: Arr[np.str_] = field(init=False, repr=False, compare=False) - - def __post_init__(self): - probs = np.array(list(self.weights.values()), dtype=f64) - probs /= probs.sum() - self._probabilities = probs - self._samplers = np.asarray(sorted(self.samplers.keys()), 
dtype=np.str_) - - @override - def sample_configs( - self, - n: int, - *, - fidelity: Mapping[str, Number] | None, - seed: np.random.Generator, - ) -> list[Config]: - choices = seed.choice(self._samplers, size=n, p=self._probabilities) - keys, counts = np.unique(choices, return_counts=True) - - configs: list[Config] = [] - for key, count in zip(keys, counts): - sampler = self.samplers[key] - config_samples = sampler.sample_configs(count, fidelity=fidelity, seed=seed) - configs.extend(config_samples) - - return configs - - @classmethod - def equally_weighted(cls, samples: dict[str, Sampler]) -> Self: - return cls(weights={k: 1.0 for k in samples}, samplers=samples) From 03729ca18fe1e2940a5fb581139ca6b49d386522 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 18:56:32 +0200 Subject: [PATCH 26/63] fix: Return acquisition_functions still in use --- neps/{sampling => }/distributions.py | 2 +- .../acquisition_functions/_ehvi.py | 213 ++++++++++++++++++ .../acquisition_functions/base_acquisition.py | 17 ++ .../acquisition_functions/ei.py | 120 ++++++++++ .../acquisition_functions/mf_ei.py | 35 ++- .../acquisition_functions/prior_weighted.py | 111 +++++++++ .../acquisition_functions/ucb.py | 60 +++++ 7 files changed, 537 insertions(+), 21 deletions(-) rename neps/{sampling => }/distributions.py (99%) create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/ei.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py create mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py diff --git a/neps/sampling/distributions.py b/neps/distributions.py similarity index 99% rename from neps/sampling/distributions.py rename to neps/distributions.py index fb552949..2361e191 100644 --- a/neps/sampling/distributions.py +++ b/neps/distributions.py @@ -225,6 +225,6 @@ def log_prob(self, value): @dataclass -class TorchDistributionWithDomain: +class DistributionOverDomain: distribution: Distribution domain: Domain diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py new file mode 100644 index 00000000..8722c545 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py @@ -0,0 +1,213 @@ +# from abc import ABC, abstractmethod +from itertools import product + +import torch +from torch import Tensor +from torch.distributions import Normal +from torch.nn import Module + +# class MultiObjectiveBaseAcqusition(ABC): +# def __init__(self, surrogate_models: dict): +# self.surrogate_models = surrogate_models +# +# def propose_location(self, *args): +# """Propose new locations for subsequent sampling +# This method should be overriden by respective acquisition function implementations.""" +# raise NotImplementedError +# +# def optimize(self): +# """This is the method that user should call for the Bayesian optimisation main loop.""" +# raise NotImplementedError +# +# @abstractmethod +# def eval(self, x, asscalar: bool = False): +# """Evaluate the acquisition function at point x2. 
This should be overridden by respective acquisition +# function implementations""" +# raise NotImplementedError +# +# def __call__(self, *args, **kwargs): +# return self.eval(*args, **kwargs) +# +# def reset_surrogate_model(self, surrogate_models: dict): +# for objective, surrogate_model in surrogate_models.items(): +# self.surrogate_models[objective] = surrogate_model +# + + +class ExpectedHypervolumeImprovement(Module): # , MultiObjectiveBaseAcqusition): + def __init__( + self, + model, + ref_point, + partitioning, + ) -> None: + r"""Expected Hypervolume Improvement supporting m>=2 outcomes. + + Implementation from BOtorch, adapted from + https://github.com/pytorch/botorch/blob/353f37649fa8d90d881e8ea20c11986b15723ef1/botorch/acquisition/multi_objective/analytic.py#L78 + + This implements the computes EHVI using the algorithm from [Yang2019]_, but + additionally computes gradients via auto-differentiation as proposed by + [Daulton2020qehvi]_. + + Note: this is currently inefficient in two ways due to the binary partitioning + algorithm that we use for the box decomposition: + + - We have more boxes in our decomposition + - If we used a box decomposition that used `inf` as the upper bound for + the last dimension *in all hypercells*, then we could reduce the number + of terms we need to compute from 2^m to 2^(m-1). [Yang2019]_ do this + by using DKLV17 and LKF17 for the box decomposition. + + TODO: Use DKLV17 and LKF17 for the box decomposition as in [Yang2019]_ for + greater efficiency. + + TODO: Add support for outcome constraints. + + Example: + >>> model = SingleTaskGP(train_X, train_Y) + >>> ref_point = [0.0, 0.0] + >>> EHVI = ExpectedHypervolumeImprovement(model, ref_point, partitioning) + >>> ehvi = EHVI(test_X) + + Args: + model: A fitted model. + ref_point: A list with `m` elements representing the reference point (in the + outcome space) w.r.t. to which compute the hypervolume. This is a + reference point for the objective values (i.e. after applying + `objective` to the samples). + partitioning: A `NondominatedPartitioning` module that provides the non- + dominated front and a partitioning of the non-dominated space in hyper- + rectangles. + objective: An `AnalyticMultiOutputObjective`. + """ + # TODO: we could refactor this __init__ logic into a + # HypervolumeAcquisitionFunction Mixin + if len(ref_point) != partitioning.num_outcomes: + raise ValueError( + "The length of the reference point must match the number of outcomes. " + f"Got ref_point with {len(ref_point)} elements, but expected " + f"{partitioning.num_outcomes}." + ) + ref_point = torch.tensor( + ref_point, + dtype=partitioning.pareto_Y.dtype, + device=partitioning.pareto_Y.device, + ) + better_than_ref = (partitioning.pareto_Y > ref_point).all(dim=1) + if not better_than_ref.any() and partitioning.pareto_Y.shape[0] > 0: + raise ValueError( + "At least one pareto point must be better than the reference point." 
+ ) + super().__init__() + self.model = model + self.register_buffer("ref_point", ref_point) + self.partitioning = partitioning + cell_bounds = self.partitioning.get_hypercell_bounds() + self.register_buffer("cell_lower_bounds", cell_bounds[0]) + self.register_buffer("cell_upper_bounds", cell_bounds[1]) + # create indexing tensor of shape `2^m x m` + self._cross_product_indices = torch.tensor( + list(product(*[[0, 1] for _ in range(ref_point.shape[0])])), + dtype=torch.long, + device=ref_point.device, + ) + self.normal = Normal(0, 1) + + def psi(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> None: + r"""Compute Psi function. + + For each cell i and outcome k: + + Psi(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = ( + sigma_k * PDF((upper_{i,k} - mu_k) / sigma_k) + ( + mu_k - lower_{i,k} + ) * (1 - CDF(upper_{i,k} - mu_k) / sigma_k) + ) + + See Equation 19 in [Yang2019]_ for more details. + + Args: + lower: A `num_cells x m`-dim tensor of lower cell bounds + upper: A `num_cells x m`-dim tensor of upper cell bounds + mu: A `batch_shape x 1 x m`-dim tensor of means + sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped). + + Returns: + A `batch_shape x num_cells x m`-dim tensor of values. + """ + u = (upper - mu) / sigma + return sigma * self.normal.log_prob(u).exp() + (mu - lower) * ( + 1 - self.normal.cdf(u) + ) + + def nu(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> None: + r"""Compute Nu function. + + For each cell i and outcome k: + + nu(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) = ( + upper_{i,k} - lower_{i,k} + ) * (1 - CDF((upper_{i,k} - mu_k) / sigma_k)) + + See Equation 25 in [Yang2019]_ for more details. + + Args: + lower: A `num_cells x m`-dim tensor of lower cell bounds + upper: A `num_cells x m`-dim tensor of upper cell bounds + mu: A `batch_shape x 1 x m`-dim tensor of means + sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped). + + Returns: + A `batch_shape x num_cells x m`-dim tensor of values. + """ + return (upper - lower) * (1 - self.normal.cdf((upper - mu) / sigma)) + + def forward(self, X: Tensor) -> Tensor: + posterior = [[_m.predict(_x) for _m in self.model] for _x in X] + mu = torch.tensor([[_m[0].item() for _m in _p] for _p in posterior])[:, None, :] + sigma = torch.tensor([[_s[1].item() for _s in _p] for _p in posterior])[ + :, None, : + ] + + # clamp here, since upper_bounds will contain `inf`s, which + # are not differentiable + cell_upper_bounds = self.cell_upper_bounds.clamp_max(1e8) + # Compute psi(lower_i, upper_i, mu_i, sigma_i) for i=0, ... m-2 + psi_lu = self.psi( + lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma + ) + # Compute psi(lower_m, lower_m, mu_m, sigma_m) + psi_ll = self.psi( + lower=self.cell_lower_bounds, + upper=self.cell_lower_bounds, + mu=mu, + sigma=sigma, + ) + # Compute nu(lower_m, upper_m, mu_m, sigma_m) + nu = self.nu( + lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma + ) + # compute the difference psi_ll - psi_lu + psi_diff = psi_ll - psi_lu + + # this is batch_shape x num_cells x 2 x (m-1) + stacked_factors = torch.stack([psi_diff, nu], dim=-2) + + # Take the cross product of psi_diff and nu across all outcomes + # e.g. 
for m = 2 + # for each batch and cell, compute + # [psi_diff_0, psi_diff_1] + # [nu_0, psi_diff_1] + # [psi_diff_0, nu_1] + # [nu_0, nu_1] + # this tensor has shape: `batch_shape x num_cells x 2^m x m` + all_factors_up_to_last = stacked_factors.gather( + dim=-2, + index=self._cross_product_indices.expand( + stacked_factors.shape[:-2] + self._cross_product_indices.shape + ), + ) + # compute product for all 2^m terms, + # sum across all terms and hypercells + return all_factors_up_to_last.prod(dim=-1).sum(dim=-1).sum(dim=-1) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py new file mode 100644 index 00000000..7249c0fd --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py @@ -0,0 +1,17 @@ +from abc import ABC, abstractmethod + + +class BaseAcquisition(ABC): + def __init__(self): + self.surrogate_model = None + + @abstractmethod + def eval(self, x, asscalar: bool = False): + """Evaluate the acquisition function at point x2.""" + raise NotImplementedError + + def __call__(self, *args, **kwargs): + return self.eval(*args, **kwargs) + + def set_state(self, surrogate_model, **kwargs): + self.surrogate_model = surrogate_model diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py new file mode 100644 index 00000000..1a4e24d0 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Sequence + +import torch +from torch.distributions import Normal + +from .base_acquisition import BaseAcquisition + +if TYPE_CHECKING: + import numpy as np + + from neps.search_spaces import SearchSpace + + +class ComprehensiveExpectedImprovement(BaseAcquisition): + def __init__( + self, + augmented_ei: bool = False, + xi: float = 0.0, + in_fill: str = "best", + log_ei: bool = False, + optimize_on_max_fidelity: bool = True, + ): + """This is the graph BO version of the expected improvement + key differences are: + + 1. The input x2 is a networkx graph instead of a vectorial input + + 2. The search space (a collection of x1_graphs) is discrete, so there is no + gradient-based optimisation. Instead, we compute the EI at all candidate points + and empirically select the best position during optimisation + + Args: + augmented_ei: Using the Augmented EI heuristic modification to the standard + expected improvement algorithm according to Huang (2006). + xi: manual exploration-exploitation trade-off parameter. + in_fill: the criterion to be used for in-fill for the determination of mu_star + 'best' means the empirical best observation so far (but could be + susceptible to noise), 'posterior' means the best *posterior GP mean* + encountered so far, and is recommended for optimization of more noisy + functions. Defaults to "best". + log_ei: log-EI if true otherwise usual EI. 
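For reference, the closed-form criterion that the eval method below implements, written for minimization with incumbent f*, exploration offset xi, and phi / Phi the standard normal pdf and cdf:

\[
u(x) = \frac{f^{*} - \mu(x) - \xi}{\sigma(x)}, \qquad
\mathrm{EI}(x) = \sigma(x)\,\varphi\bigl(u(x)\bigr) + \bigl(f^{*} - \mu(x) - \xi\bigr)\,\Phi\bigl(u(x)\bigr)
\]

The log_ei branch evaluates the analogous quantity for targets modelled in log space, and the augmented variant scales the result by a factor involving the likelihood noise.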
+ """ + super().__init__() + + if in_fill not in ["best", "posterior"]: + raise ValueError(f"Invalid value for in_fill ({in_fill})") + self.augmented_ei = augmented_ei + self.xi = xi + self.in_fill = in_fill + self.log_ei = log_ei + self.incumbent = None + self.optimize_on_max_fidelity = optimize_on_max_fidelity + + def eval( + self, + x: Sequence[SearchSpace], + asscalar: bool = False, + ) -> np.ndarray | torch.Tensor | float: + """Return the negative expected improvement at the query point x2.""" + assert self.incumbent is not None, "EI function not fitted on model" + + if x[0].has_fidelity and self.optimize_on_max_fidelity: + _x = [e.clone() for e in x] + for e in _x: + e.set_to_max_fidelity() + else: + _x = x + + mu, cov = self.surrogate_model.predict(_x) + + std = torch.sqrt(torch.diag(cov)) + mu_star = self.incumbent + + gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device)) + # u = (mu - mu_star - self.xi) / std + # ei = std * updf + (mu - mu_star - self.xi) * ucdf + if self.log_ei: + # we expect that f_min is in log-space + f_min = mu_star - self.xi + v = (f_min - mu) / std + ei = torch.exp(f_min) * gauss.cdf(v) - torch.exp( + 0.5 * torch.diag(cov) + mu + ) * gauss.cdf(v - std) + else: + u = (mu_star - mu - self.xi) / std + try: + ucdf = gauss.cdf(u) + except ValueError as e: + print(f"u: {u}") # noqa: T201 + print(f"mu_star: {mu_star}") # noqa: T201 + print(f"mu: {mu}") # noqa: T201 + print(f"std: {std}") # noqa: T201 + print(f"diag: {cov.diag()}") # noqa: T201 + raise e + updf = torch.exp(gauss.log_prob(u)) + ei = std * updf + (mu_star - mu - self.xi) * ucdf + if self.augmented_ei: + sigma_n = self.surrogate_model.likelihood + ei *= 1.0 - torch.sqrt(torch.tensor(sigma_n, device=mu.device)) / torch.sqrt( + sigma_n + torch.diag(cov) + ) + if isinstance(_x, list) and asscalar: + return ei.detach().numpy() + if asscalar: + ei = ei.detach().numpy().item() + return ei + + def set_state(self, surrogate_model, **kwargs): + super().set_state(surrogate_model, **kwargs) + + # Compute incumbent + if self.in_fill == "best": + self.incumbent = torch.min(self.surrogate_model.y_) + else: + x = self.surrogate_model.x + mu_train, _ = self.surrogate_model.predict(x) + incumbent_idx = torch.argmin(mu_train) + self.incumbent = self.surrogate_model.y_[incumbent_idx] diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py index c8502ca1..3d19040d 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_ei.py @@ -1,28 +1,22 @@ -# Left in as reference for now. 
# type: ignore -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, Iterable +from typing import Any, Iterable, Tuple, Union import numpy as np import pandas as pd import torch from torch.distributions import Normal -from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids - +from ....optimizers.utils import map_real_hyperparameters_from_tabular_ids +from ....search_spaces.search_space import SearchSpace +from ...multi_fidelity.utils import MFObservedData from .ei import ComprehensiveExpectedImprovement -if TYPE_CHECKING: - from neps.optimizers.multi_fidelity.utils import MFObservedData - from neps.search_spaces.search_space import SearchSpace - class MFEI(ComprehensiveExpectedImprovement): def __init__( self, pipeline_space: SearchSpace, - surrogate_model_name: str | None = None, + surrogate_model_name: str = None, augmented_ei: bool = False, xi: float = 0.0, in_fill: str = "best", @@ -38,7 +32,7 @@ def __init__( def get_budget_level(self, config) -> int: return int((config.fidelity.value - config.fidelity.lower) / self.b_step) - def preprocess(self, x: pd.Series) -> tuple[Iterable, Iterable]: + def preprocess(self, x: pd.Series) -> Tuple[Iterable, Iterable]: """Prepares the configurations for appropriate EI calculation. Takes a set of points and computes the budget and incumbent for each point, as @@ -71,7 +65,7 @@ def preprocess(self, x: pd.Series) -> tuple[Iterable, Iterable]: budget_list.append(self.get_budget_level(config)) # Drop unused configs - x = x.drop(labels=indices_to_drop) + x.drop(labels=indices_to_drop, inplace=True) performances = self.observations.get_best_performance_for_each_budget() inc_list = [] @@ -84,11 +78,11 @@ def preprocess(self, x: pd.Series) -> tuple[Iterable, Iterable]: return x, torch.Tensor(inc_list) - def preprocess_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: + def preprocess_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: x, inc_list = self.preprocess(x) return x.values.tolist(), inc_list - def preprocess_deep_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: + def preprocess_deep_gp(self, x: Iterable) -> Tuple[Iterable, Iterable]: x, inc_list = self.preprocess(x) x_lcs = [] for idx in x.index: @@ -103,7 +97,7 @@ def preprocess_deep_gp(self, x: Iterable) -> tuple[Iterable, Iterable]: self.surrogate_model.set_prediction_learning_curves(x_lcs) return x.values.tolist(), inc_list - def preprocess_pfn(self, x: Iterable) -> tuple[Iterable, Iterable, Iterable]: + def preprocess_pfn(self, x: Iterable) -> Tuple[Iterable, Iterable, Iterable]: """Prepares the configurations for appropriate EI calculation. 
Takes a set of points and computes the budget and incumbent for each point, as @@ -120,7 +114,7 @@ def preprocess_pfn(self, x: Iterable) -> tuple[Iterable, Iterable, Iterable]: ) / self.b_step return _x_tok, _x, inc_list - def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Series]: + def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Series]: # _x = x.copy() # preprocessing needs to change the reference x Series so we don't copy here if self.surrogate_model_name == "pfn": _x_tok, _x, inc_list = self.preprocess_pfn( @@ -149,7 +143,7 @@ def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Ser def eval_pfn_ei( self, x: Iterable, inc_list: Iterable - ) -> np.ndarray | torch.Tensor | float: + ) -> Union[np.ndarray, torch.Tensor, float]: """PFN-EI modified to preprocess samples and accept list of incumbents.""" # x, inc_list = self.preprocess(x) # IMPORTANT change from vanilla-EI # _x = x.copy() @@ -160,7 +154,7 @@ def eval_pfn_ei( def eval_gp_ei( self, x: Iterable, inc_list: Iterable - ) -> np.ndarray | torch.Tensor | float: + ) -> Union[np.ndarray, torch.Tensor, float]: """Vanilla-EI modified to preprocess samples and accept list of incumbents.""" # x, inc_list = self.preprocess(x) # IMPORTANT change from vanilla-EI _x = x.copy() @@ -200,7 +194,7 @@ def set_state( pipeline_space: SearchSpace, surrogate_model: Any, observations: MFObservedData, - b_step: int | float, + b_step: Union[int, float], **kwargs, ): # overload to select incumbent differently through observations @@ -208,3 +202,4 @@ def set_state( self.surrogate_model = surrogate_model self.observations = observations self.b_step = b_step + return diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py new file mode 100644 index 00000000..8a735d58 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterable +from typing_extensions import override + +import numpy as np +import torch +from botorch.acquisition import MCAcquisitionFunction + +from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, +) + +if TYPE_CHECKING: + from neps.priors import Prior + + +class PiboAcquisition(MCAcquisitionFunction): + """Compute a prior weighted acquisition function according to PiBO. + + * https://arxiv.org/pdf/2204.11051 + """ + + def __init__( + self, + acq_fn: MCAcquisitionFunction, + prior: Prior, + beta: float, + n: float, + ): + """Initialize the acquisition function. + + Args: + acq_fn: The acquisition function to be weighted. + prior: The prior distribution to be used for weighting. + beta: The beta parameter for weighting. + n: The denominator for the beta parameter. + """ + self._log = self.acq_fn._log + self.acq_fn = acq_fn + + self.beta = beta + self.n = n + self.prior = prior + + @override + def forward(self, X: torch.Tensor) -> torch.Tensor: + weight = self.beta / self.n + acq = self.acq_fn(X) + + # The weight is shown as being applied to the pdf and not the log_pdf + values = acq * self.prior.prob(X) * weight + + # However, if the base acq function advertises as being log, + # i.e. 
self._log, then we should return the log of the values + return torch.log(values) if self._log else values + + +class DecayingPriorWeightedAcquisition(BaseAcquisition): + def __init__( + self, + base_acquisition, + pibo_beta=10, + log: bool = False, + ): + super().__init__() + self.pibo_beta = pibo_beta + self.base_acquisition = base_acquisition + self.log = log + self.decay_t = 0.0 + + def eval( + self, + x: Iterable, + **base_acquisition_kwargs, + ) -> np.ndarray | torch.Tensor | float: + acquisition = self.base_acquisition(x, **base_acquisition_kwargs) + + if self.log: + min_acq_val = abs(min(acquisition)) if min(acquisition) < 0 else 0 + + for i, candidate in enumerate(x): + prior_weight = candidate.compute_prior(log=self.log) + if prior_weight != 1.0: + if self.log: + # for log -> the smaller the prior_weight, + # the more unlikely it is from the prior + # also shift acquisition values to avoid negativ values + acquisition[i] = ( + np.log(acquisition[i] + min_acq_val + 1e-12) + + (self.pibo_beta / self.decay_t) * prior_weight + ) + else: + acquisition[i] *= np.power( + prior_weight + 1e-12, self.pibo_beta / self.decay_t + ) + return acquisition + + def set_state(self, surrogate_model, **kwargs): + if "decay_t" in kwargs: + decay_t = kwargs.pop("decay_t") + else: + train_x = surrogate_model.x + if train_x[0].has_fidelity: + decay_t = np.sum( + [float(_x.fidelity.value >= _x.fidelity.upper) for _x in train_x] + ) + else: + decay_t = len(train_x) + self.decay_t = decay_t + self.base_acquisition.set_state(surrogate_model, **kwargs) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py new file mode 100644 index 00000000..adf57266 --- /dev/null +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py @@ -0,0 +1,60 @@ +from typing import Iterable, Union + +import numpy as np +import torch + +from .base_acquisition import BaseAcquisition + + +class UpperConfidenceBound(BaseAcquisition): + def __init__(self, beta: float=1.0, maximize: bool=False): + """Upper Confidence Bound (UCB) acquisition function. + + Args: + beta: Controls the balance between exploration and exploitation. + maximize: If True, maximize the given model, else minimize. + DEFAULT=False, assumes minimzation. 
+ """ + super().__init__() + self.beta = beta # can be updated as part of the state for dynamism or a schedule + self.maximize = maximize + + # to be initialized as part of the state + self.surrogate_model = None + + def set_state(self, surrogate_model, **kwargs): + super().set_state(surrogate_model) + self.surrogate_model = surrogate_model + if "beta" in kwargs: + if not isinstance(kwargs["beta"], (list, np.array)): + self.beta = kwargs["beta"] + else: + self.logger.warning("Beta is a list, not updating beta value!") + + def eval( + self, x: Iterable, asscalar: bool = False + ) -> Union[np.ndarray, torch.Tensor, float]: + try: + mu, cov = self.surrogate_model.predict(x) + std = torch.sqrt(torch.diag(cov)) + except ValueError as e: + raise e + sign = 1 if self.maximize else -1 # LCB is performed if minimize=True + ucb_scores = mu + sign * np.sqrt(self.beta) * std + # if LCB, minimize acquisition, or maximize -acquisition + ucb_scores = ucb_scores.detach().numpy() * sign + + return ucb_scores + + +class MF_UCB(UpperConfidenceBound): + + def preprocess(self, x: Iterable) -> Iterable: + performances = self.observations.get_best_performance_for_each_budget() + pass + + def eval( + self, x: Iterable, asscalar: bool = False + ) -> Union[np.ndarray, torch.Tensor, float]: + x = self.preprocess(x) + return self.eval(x, asscalar=asscalar) From 16c27f8e518e584643d75f5c0d5bd729969acbec Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 19:40:40 +0200 Subject: [PATCH 27/63] fix: Some minor cleanup fixes --- neps/optimizers/__init__.py | 10 +- neps/{ => sampling}/distributions.py | 2 +- neps/state/neps_state.py | 140 +++++++++++++-------------- 3 files changed, 73 insertions(+), 79 deletions(-) rename neps/{ => sampling}/distributions.py (99%) diff --git a/neps/optimizers/__init__.py b/neps/optimizers/__init__.py index 31cb4c4a..518952cd 100644 --- a/neps/optimizers/__init__.py +++ b/neps/optimizers/__init__.py @@ -1,11 +1,9 @@ from __future__ import annotations from functools import partial -from typing import Callable, Mapping +from typing import TYPE_CHECKING, Callable, Mapping from .base_optimizer import BaseOptimizer -from .bayesian_optimization.cost_cooling import CostCooling -from .bayesian_optimization.mf_tpe import MultiFidelityPriorWeightedTreeParzenEstimator from .bayesian_optimization.optimizer import BayesianOptimization from .grid_search.optimizer import GridSearch from .multi_fidelity.dyhpo import MFEIBO @@ -26,13 +24,14 @@ from .random_search.optimizer import RandomSearch from .regularized_evolution.optimizer import RegularizedEvolution +if TYPE_CHECKING: + from .base_optimizer import BaseOptimizer + # TODO: Rename Searcher to Optimizer... 
SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = { "bayesian_optimization": BayesianOptimization, "pibo": partial(BayesianOptimization, disable_priors=False), - "cost_cooling_bayesian_optimization": CostCooling, "random_search": RandomSearch, - "cost_cooling": CostCooling, "regularized_evolution": RegularizedEvolution, "assisted_regularized_evolution": partial(RegularizedEvolution, assisted=True), "grid_search": GridSearch, @@ -41,7 +40,6 @@ "asha": AsynchronousSuccessiveHalving, "hyperband": Hyperband, "asha_prior": AsynchronousSuccessiveHalvingWithPriors, - "multifidelity_tpe": MultiFidelityPriorWeightedTreeParzenEstimator, "hyperband_custom_default": HyperbandCustomDefault, "priorband": PriorBand, "mobster": MOBSTER, diff --git a/neps/distributions.py b/neps/sampling/distributions.py similarity index 99% rename from neps/distributions.py rename to neps/sampling/distributions.py index 2361e191..fb552949 100644 --- a/neps/distributions.py +++ b/neps/sampling/distributions.py @@ -225,6 +225,6 @@ def log_prob(self, value): @dataclass -class DistributionOverDomain: +class TorchDistributionWithDomain: distribution: Distribution domain: Domain diff --git a/neps/state/neps_state.py b/neps/state/neps_state.py index 163679d8..8afaee62 100644 --- a/neps/state/neps_state.py +++ b/neps/state/neps_state.py @@ -32,75 +32,6 @@ Loc = TypeVar("Loc") T = TypeVar("T") -def sample_trial( - neps_state, - optimizer: BaseOptimizer, - *, - worker_id: str, - _sample_hooks: list[Callable] | None = None, -) -> Trial: - """Sample a new trial from the optimizer. - - Args: - optimizer: The optimizer to sample the trial from. - worker_id: The worker that is sampling the trial. - _sample_hooks: A list of hooks to apply to the optimizer before sampling. - - Returns: - The new trial. - """ - with neps_state._optimizer_state.acquire() as ( - opt_state, - put_opt, - ), neps_state._seed_state.acquire() as (seed_state, put_seed_state): - trials: dict[Trial.ID, Trial] = {} - for trial_id, shared_trial in neps_state._trials.all().items(): - trial = shared_trial.synced() - trials[trial_id] = trial - - seed_state.set_as_global_seed_state() - - # TODO: Not sure if any existing pre_load hooks required - # it to be done after `load_results`... I hope not. - if _sample_hooks is not None: - for hook in _sample_hooks: - optimizer = hook(optimizer) - - # NOTE: We don't want optimizers mutating this before serialization - budget = opt_state.budget.clone() if opt_state.budget is not None else None - sampled_config, new_opt_state = optimizer.ask( - trials=trials, - budget_info=budget, - optimizer_state=opt_state.shared_state, - ) - - if sampled_config.previous_config_id is not None: - previous_trial = trials.get(sampled_config.previous_config_id) - if previous_trial is None: - raise ValueError( - f"Previous trial '{sampled_config.previous_config_id}' not found." 
- ) - previous_trial_location = previous_trial.metadata.location - else: - previous_trial_location = None - - trial = Trial.new( - trial_id=sampled_config.id, - location="", # HACK: This will be set by the `TrialRepo` - config=sampled_config.config, - previous_trial=sampled_config.previous_config_id, - previous_trial_location=previous_trial_location, - time_sampled=time.time(), - worker_id=worker_id, - ) - shared_trial = neps_state._trials.put_new(trial) - seed_state.recapture() - put_seed_state(seed_state) - put_opt( - OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) - ) - - return trial @dataclass class NePSState(Generic[Loc]): @@ -140,10 +71,75 @@ def get_trials_by_ids(self, trial_ids: list[str], /) -> dict[str, Trial | None]: for _id, shared_trial in self._trials.get_by_ids(trial_ids).items() } - def get_optimizer_instance(self) -> BaseOptimizer: - """Get the optimizer instance.""" - raise NotImplementedError + def sample_trial( + self, + optimizer: BaseOptimizer, + *, + worker_id: str, + _sample_hooks: list[Callable] | None = None, + ) -> Trial: + """Sample a new trial from the optimizer. + + Args: + optimizer: The optimizer to sample the trial from. + worker_id: The worker that is sampling the trial. + _sample_hooks: A list of hooks to apply to the optimizer before sampling. + Returns: + The new trial. + """ + with self._optimizer_state.acquire() as ( + opt_state, + put_opt, + ), self._seed_state.acquire() as (seed_state, put_seed_state): + trials: dict[Trial.ID, Trial] = {} + for trial_id, shared_trial in self._trials.all().items(): + trial = shared_trial.synced() + trials[trial_id] = trial + + seed_state.set_as_global_seed_state() + + # TODO: Not sure if any existing pre_load hooks required + # it to be done after `load_results`... I hope not. + if _sample_hooks is not None: + for hook in _sample_hooks: + optimizer = hook(optimizer) + + # NOTE: We don't want optimizers mutating this before serialization + budget = opt_state.budget.clone() if opt_state.budget is not None else None + sampled_config, new_opt_state = optimizer.ask( + trials=trials, + budget_info=budget, + optimizer_state=opt_state.shared_state, + ) + + if sampled_config.previous_config_id is not None: + previous_trial = trials.get(sampled_config.previous_config_id) + if previous_trial is None: + raise ValueError( + f"Previous trial '{sampled_config.previous_config_id}' not found." 
+ ) + previous_trial_location = previous_trial.metadata.location + else: + previous_trial_location = None + + trial = Trial.new( + trial_id=sampled_config.id, + location="", # HACK: This will be set by the `TrialRepo` + config=sampled_config.config, + previous_trial=sampled_config.previous_config_id, + previous_trial_location=previous_trial_location, + time_sampled=time.time(), + worker_id=worker_id, + ) + shared_trial = self._trials.put_new(trial) + seed_state.recapture() + put_seed_state(seed_state) + put_opt( + OptimizationState(budget=opt_state.budget, shared_state=new_opt_state) + ) + + return trial def report_trial_evaluation( self, From da4f376e20c64fa338f6e28564ee17d12cf73ce2 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Thu, 29 Aug 2024 19:40:59 +0200 Subject: [PATCH 28/63] optim: Switch to just additive kernel --- .../bayesian_optimization/models/gp.py | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 307f806b..d8709c2a 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -5,12 +5,12 @@ from functools import reduce from typing import TYPE_CHECKING, Any, Mapping, TypeVar +from botorch.models import MultiTaskGP import gpytorch import gpytorch.constraints import torch from botorch.acquisition.analytic import SingleTaskGP -from botorch.models import MixedSingleTaskGP -from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood +from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood, MixedSingleTaskGP from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import MaternKernel, ScaleKernel @@ -190,31 +190,29 @@ def default_single_obj_gp( return gp, likelihood # Mixed - def cont_kernel_factory( - batch_shape: torch.Size, - ard_num_dims: int, - active_dims: list[int], - ) -> ScaleKernel: - lengthscale_prior, lengthscale_constraint = default_lengthscale_prior( - ard_num_dims - ) - return ScaleKernel( - MaternKernel( - nu=2.5, - batch_shape=batch_shape, - ard_num_dims=ard_num_dims, - active_dims=active_dims, - lengthscale_prior=lengthscale_prior, - lengthscale_constraint=lengthscale_constraint, - ), - ) + numeric_kernel = default_matern_kernel(len(numerics), active_dims=tuple(numerics)) + cat_kernel = default_categorical_kernel( + len(categoricals), active_dims=tuple(categoricals) + ) + + # WARNING: I previously tried SingleTaskMixedGp which does the following: + # + # x K((x1, c1), (x2, c2)) = + # x K_cont_1(x1, x2) + K_cat_1(c1, c2) + + # x K_cont_2(x1, x2) * K_cat_2(c1, c2) + # + # In a toy example with a single binary categorical which acted like F * {0, 1}, + # the model collapsed to always predicting `0`. Causing all parameters defining F + # to essentially be guess at random. This is a lot more stable while testing... + # TODO: Figure out why... 
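# Sketch of the contrast described in the warning above (notation illustrative only):
#   MixedSingleTaskGP:  K((x1, c1), (x2, c2)) = K_num(x1, x2) + K_cat(c1, c2)
#                                               + K_num'(x1, x2) * K_cat'(c1, c2)
#   this model:         K((x1, c1), (x2, c2)) = K_num(x1, x2) + K_cat(c1, c2)
# i.e. only the additive term is kept, which was observed to be more stable in testing.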
+ kernel = numeric_kernel + cat_kernel - gp = MixedSingleTaskGP( + gp = SingleTaskGP( train_X=x.tensor, train_Y=y, - cat_dims=categoricals, + mean_module=default_mean(), likelihood=likelihood, - cont_kernel_factory=cont_kernel_factory, + covar_module=kernel, outcome_transform=Standardize(m=1), ) return gp, likelihood @@ -232,8 +230,8 @@ def optimize_acq( ) -> tuple[torch.Tensor, torch.Tensor]: acq_options = acq_options or {} - lower = [domain.lower for domain in encoder.domains.values()] - upper = [domain.upper for domain in encoder.domains.values()] + lower = [domain.lower for domain in encoder.domains] + upper = [domain.upper for domain in encoder.domains] bounds = torch.tensor([lower, upper], dtype=torch.float) cat_transformers = { From 5f76a7e3ffe642cb8092c403721ce604a16dba66 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 30 Aug 2024 15:46:01 +0200 Subject: [PATCH 29/63] feat: Stable pibo implementation --- neps/optimizers/__init__.py | 5 +- .../acquisition_functions/pibo.py | 10 +- .../weighted_acquisition.py | 9 +- .../bayesian_optimization/models/gp.py | 6 +- .../bayesian_optimization/optimizer.py | 97 +++++++++++-------- .../bayesian_optimization.yaml | 1 - neps/optimizers/default_searchers/pibo.yaml | 1 - neps/runtime.py | 2 - neps/sampling/distributions.py | 64 ++++++++++-- neps/sampling/priors.py | 86 ++++++++++------ neps/search_spaces/domain.py | 2 +- .../hyperparameters/categorical.py | 2 + neps_examples/basic_usage/hyperparameters.py | 27 ++++-- 13 files changed, 210 insertions(+), 102 deletions(-) diff --git a/neps/optimizers/__init__.py b/neps/optimizers/__init__.py index 518952cd..74421687 100644 --- a/neps/optimizers/__init__.py +++ b/neps/optimizers/__init__.py @@ -19,7 +19,6 @@ SuccessiveHalving, SuccessiveHalvingWithPriors, ) -from .multi_fidelity_prior.async_priorband import PriorBandAsha, PriorBandAshaHB from .multi_fidelity_prior.priorband import PriorBand from .random_search.optimizer import RandomSearch from .regularized_evolution.optimizer import RegularizedEvolution @@ -29,8 +28,8 @@ # TODO: Rename Searcher to Optimizer... 
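# Usage sketch (hypothetical `space`): every value in the mapping defined just below
# is either an optimizer class or a `functools.partial` with pre-bound flags, so both
# are resolved and instantiated the same way.
builder = SearcherMapping["pibo"]           # partial(BayesianOptimization, use_priors=True)
optimizer = builder(pipeline_space=space)   # remaining arguments supplied by the caller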
SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = { - "bayesian_optimization": BayesianOptimization, - "pibo": partial(BayesianOptimization, disable_priors=False), + "bayesian_optimization": partial(BayesianOptimization, use_priors=False), + "pibo": partial(BayesianOptimization, use_priors=True), "random_search": RandomSearch, "regularized_evolution": RegularizedEvolution, "assisted_regularized_evolution": partial(RegularizedEvolution, assisted=True), diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py index 0f1668f1..76499ba1 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py @@ -39,9 +39,15 @@ def apply_pibo_acquisition_weight( x_domain: Domain | list[Domain], prior_exponent: float, ): + import rich + + rich.print(prior_exponent) if acq._log: - return acq_values + prior.log_prob(X, frm=x_domain) * prior_exponent - return acq_values * prior.prob(X, frm=x_domain).pow(prior_exponent) + weighted_log_probs = prior.log_prob(X, frm=x_domain) * prior_exponent + return acq_values + weighted_log_probs + + weighted_probs = prior.prob(X, frm=x_domain).pow(prior_exponent) + return acq_values * weighted_probs def pibo_acquisition( diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py index 488c57f4..eadf9207 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -115,7 +115,7 @@ def __init__( # NOTE: We remove the X_pending from the base acquisition function as we will get # it in our own forward with `@concatenate_pending_points` and pass that forward. 
# This avoids possible duplicates - self.acq.set_X_pending(None) + acq.set_X_pending(None) self.set_X_pending(X_pending) self.apply_weight = apply_weight self.acq = acq @@ -136,10 +136,11 @@ def forward(self, X: Tensor) -> Tensor: """ if isinstance(self.acq, SampleReducingMCAcquisitionFunction): # shape: mc_samples x batch x q-candidates - acq_values = self.acq._non_reduce_forward(X) + acq_values = self.acq._non_reduced_forward(X) weighted_acq_values = self.apply_weight(acq_values, X, self.acq) - vals = self.acq._sample_reduction(self.acq._q_reduction(weighted_acq_values)) - return vals.squeeze(-1) + q_reduced_acq = self.acq._q_reduction(weighted_acq_values) + sample_reduced_acq = self.acq._sample_reduction(q_reduced_acq) + return sample_reduced_acq.squeeze(-1) # shape: batch x q-candidates acq_values = self.acq(X).unsqueeze(-1) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index d8709c2a..39e81cba 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -5,12 +5,14 @@ from functools import reduce from typing import TYPE_CHECKING, Any, Mapping, TypeVar -from botorch.models import MultiTaskGP import gpytorch import gpytorch.constraints import torch from botorch.acquisition.analytic import SingleTaskGP -from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood, MixedSingleTaskGP +from botorch.models.gp_regression_mixed import ( + CategoricalKernel, + Likelihood, +) from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import MaternKernel, ScaleKernel diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 2c4e5eeb..7d57e936 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -4,10 +4,7 @@ from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping import torch -from botorch.acquisition import ( - LinearMCObjective, - qLogExpectedImprovement, -) +from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement from botorch.fit import fit_gpytorch_mll from gpytorch import ExactMarginalLogLikelihood @@ -95,25 +92,34 @@ def _missing_cost_strategy(cost: torch.Tensor) -> torch.Tensor: return _missing_fill_strategy(cost, strategy="3std", lower_is_better=True) -def _pibo_exp_term(n_sampled_already: int, ndims: int, budget_info: BudgetInfo) -> float: - if budget_info.max_evaluations is not None: - # From the PIBO paper (Section 4.1) - # https://arxiv.org/pdf/2204.11051 - n = n_sampled_already - beta = budget_info.max_evaluations / 10 - elif budget_info.max_cost_budget is not None: - # This might not work well if cost number is high - # early on, but it will start to normalize. - n = budget_info.used_cost_budget - beta = budget_info.max_cost_budget / 10 - else: - # Otherwise, just some random heuristic based on the number - # of trials and dimensionality of the search space - # TODO: Think about and evaluate this more. - n = n_sampled_already - beta = ndims**2 / 10 - - return beta / n +def _pibo_exp_term( + n_sampled_already: int, + ndims: int, + initial_design_size: int, +) -> float: + # pibo paper + # https://arxiv.org/pdf/2204.11051 + # + # they use some constant determined from max problem budget. seems impractical, + # given we might not know the final budget (i.e. 
imagine you iteratively increase + # the budget as you go along). + # + # instead, we base it on the fact that in lower dimensions, we don't to rely + # on the prior for too long as the amount of space you need to cover around the + # prior is fairly low. effectively, since the gp needs little samples to + # model pretty effectively in low dimension, we can derive the utility from + # the prior pretty quickly. + # + # however, for high dimensional settings, we want to rely longer on the prior + # for longer as the number of samples needed to model the area around the prior + # is much larger, and deriving the utility will take longer. + # + # in the end, we would like some curve going from 1->0 as n->inf, where `n` is + # the number of samples we have done so far. + # the easiest function that does this is `exp(-n)`, with some discounting of `n` + # dependant on the number of dimensions. + n_bo_samples = n_sampled_already - initial_design_size + return math.exp(-n_bo_samples / ndims) def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float: @@ -214,12 +220,6 @@ def __init__( # noqa: D417 raise NotImplementedError("Only supports flat search spaces for now!") super().__init__(pipeline_space=pipeline_space) - if initial_design_size is None: - N = len(pipeline_space.hyperparameters) - initial_design_size = int(max(2, math.log(N) ** 2)) - elif initial_design_size < 1: - raise ValueError("Initial_design_size to be at least 1") - params: dict[str, CategoricalParameter | FloatParameter | IntegerParameter] = { **pipeline_space.numerical, **pipeline_space.categoricals, @@ -227,6 +227,14 @@ def __init__( # noqa: D417 if treat_fidelity_as_hyperparameters: params.update(pipeline_space.fidelities) + if initial_design_size is None: + # As we have fairly regularized GPs, who start with a more smooth landscape + # model, we don't need a high level of initial samples. + ndims = len(params) + initial_design_size = max(2, int(math.log(ndims) ** 2)) + elif initial_design_size < 1: + raise ValueError("Initial_design_size to be at least 1") + self.encoder = TensorEncoder.default(params) if encoder is None else encoder self.use_cost = use_cost self.prior = _make_prior(params) if use_priors is True else None @@ -251,9 +259,9 @@ def ask( "Seed is not yet implemented for BayesianOptimization" ) - n_trials = len(trials) + n_trials_completed = len(trials) space = self.pipeline_space - config_id = str(n_trials + 1) + config_id = str(n_trials_completed + 1) # Fill intitial design data if we don't have any... if self.initial_design_ is None: @@ -279,8 +287,8 @@ def ask( self.initial_design_.extend(configs) # If we havn't passed the intial design phase - if n_trials <= len(self.initial_design_): - config = self.initial_design_[n_trials - 1] + if n_trials_completed < len(self.initial_design_): + config = self.initial_design_[n_trials_completed] sample = SampledConfig(id=config_id, config=config, previous_config_id=None) return sample, optimizer_state @@ -331,14 +339,25 @@ def ask( # If we should use the prior, weight the acquisition function by # the probability of it being sampled from the prior. 
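# Minimal worked sketch (hypothetical numbers, not from a real run): how the weight
# returned by `_pibo_exp_term` above decays once BO takes over from the initial
# design, here for a 5-dimensional space with an initial design of 5 configurations.
for n_completed in (5, 10, 20, 40):
    w = _pibo_exp_term(n_completed, ndims=5, initial_design_size=5)
    print(n_completed, round(w, 4))  # 5 -> 1.0, 10 -> 0.3679, 20 -> 0.0498, 40 -> 0.0009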
if self.prior: - acq = pibo_acquisition( - acq, - prior=self.prior, - prior_exponent=_pibo_exp_term(n_trials, self.encoder.ncols, budget_info), - x_domain=self.encoder.domains, - X_pending=maybe_x_pending_tensor, + pibo_exp_term = _pibo_exp_term( + n_trials_completed, + self.encoder.ncols, + self.n_initial_design, ) + # If the amount of weight derived from the pibo exponent becomes + # insignificant, we don't use it as it as it adds extra computational + # burden and introduces more chance of numerical instability. + significant_lower_bound = 1e-4 + if pibo_exp_term > significant_lower_bound: + acq = pibo_acquisition( + acq, + prior=self.prior, + prior_exponent=pibo_exp_term, + x_domain=self.encoder.domains, + X_pending=maybe_x_pending_tensor, + ) + # If we should use cost, weight the acquisition function by the cost # of the configurations. if self.use_cost: diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml index cf3717ab..fb43f97b 100644 --- a/neps/optimizers/default_searchers/bayesian_optimization.yaml +++ b/neps/optimizers/default_searchers/bayesian_optimization.yaml @@ -1,6 +1,5 @@ strategy: bayesian_optimization # Arguments that can be modified by the user -initial_design_size: 10 surrogate_model: gp # or {"gp_hierarchy"} acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml index 9c386069..8b514ba8 100644 --- a/neps/optimizers/default_searchers/pibo.yaml +++ b/neps/optimizers/default_searchers/pibo.yaml @@ -1,6 +1,5 @@ strategy: pibo # Arguments that can be modified by the user -initial_design_size: 10 surrogate_model: gp # or {"gp_hierarchy"} acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false diff --git a/neps/runtime.py b/neps/runtime.py index b102b153..b234a479 100644 --- a/neps/runtime.py +++ b/neps/runtime.py @@ -519,8 +519,6 @@ def _launch_runtime( # noqa: PLR0913 max_evaluations=max_evaluations_total, used_evaluations=0, ) - if max_cost_total is not None - else None ), shared_state={}, # TODO: Unused for the time being... 
), diff --git a/neps/sampling/distributions.py b/neps/sampling/distributions.py index fb552949..6b557e5a 100644 --- a/neps/sampling/distributions.py +++ b/neps/sampling/distributions.py @@ -9,18 +9,22 @@ from typing_extensions import override import torch -from torch.distributions import Distribution, constraints +from torch.distributions import Distribution, Uniform, constraints from torch.distributions.utils import broadcast_all +from neps.search_spaces.domain import Domain + if TYPE_CHECKING: from neps.search_spaces.architecture.cfg_variants.constrained_cfg import Constraint - from neps.search_spaces.domain import Domain -CONST_SQRT_2 = math.sqrt(2) -CONST_INV_SQRT_2PI = 1 / math.sqrt(2 * math.pi) -CONST_INV_SQRT_2 = 1 / math.sqrt(2) -CONST_LOG_INV_SQRT_2PI = math.log(CONST_INV_SQRT_2PI) -CONST_LOG_SQRT_2PI_E = 0.5 * math.log(2 * math.pi * math.e) +CONST_SQRT_2 = torch.tensor(math.sqrt(2), dtype=torch.float64) +CONST_INV_SQRT_2PI = torch.tensor(1 / math.sqrt(2 * math.pi), dtype=torch.float64) +CONST_INV_SQRT_2 = torch.tensor(1 / math.sqrt(2), dtype=torch.float64) +CONST_LOG_INV_SQRT_2PI = torch.tensor(math.log(CONST_INV_SQRT_2PI), dtype=torch.float64) +CONST_LOG_SQRT_2PI_E = torch.tensor( + 0.5 * math.log(2 * math.pi * math.e), + dtype=torch.float64, +) # from https://github.com/toshas/torch_truncnorm @@ -224,7 +228,53 @@ def log_prob(self, value): return super().log_prob(value) - self._log_scale +class UniformWithUpperBound(Uniform): + """Uniform distribution with upper bound inclusive. + + This is mostly a hack because torch's version of Uniform does not include + the upper bound which only causes a problem when considering the log_prob. + Otherwise the upper bound works with every other method. + """ + + # OPTIM: This could probably be optimized a lot but I'm not sure how it effects + # gradients. Could probably do a different path depending on if `value` requires + # gradients or not. + @override + def log_prob(self, value: torch.Tensor) -> torch.Tensor: + if self._validate_args: + self._validate_sample(value) + + lb = self.low.le(value).type_as(self.low) + ub = self.high.ge(value).type_as(self.low) # The main change, is `gt` in original + return torch.log(lb.mul(ub)) - torch.log(self.high - self.low) + + @dataclass class TorchDistributionWithDomain: distribution: Distribution domain: Domain + + +UNIT_UNIFORM_DIST = TorchDistributionWithDomain( + distribution=UniformWithUpperBound(0, 1), + domain=Domain.unit_float(), +) + +if __name__ == "__main__": + loc = 0.95 + for confidence in torch.linspace(0.0, 0.8, 8): + scale = 1 - confidence + dist = TruncatedNormal( + loc=loc, + scale=scale, + a=0.0, + b=1.0, + ) + xs = torch.linspace(0, 1, 100) + ys = dist.log_prob(xs) + import matplotlib.pyplot as plt + + plt.plot(xs, ys, label=f"confidence={confidence}") + plt.plot(loc, dist.log_prob(torch.tensor(loc)), "ro") + plt.legend() + plt.show() diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 03b64122..f2373a68 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -15,7 +15,11 @@ import torch -from neps.sampling.distributions import TorchDistributionWithDomain, TruncatedNormal +from neps.sampling.distributions import ( + UNIT_UNIFORM_DIST, + TorchDistributionWithDomain, + TruncatedNormal, +) from neps.sampling.samplers import Sampler, WeightedSampler from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain @@ -160,21 +164,11 @@ def make_centered( f" Got {confidence}." 
) - for name in domains: - if name not in centers: - raise ValueError( - f"Center for {name} is missing. " - f"Please provide a center for all domains." - ) - distributions: list[TorchDistributionWithDomain] = [] for name, domain in domains.items(): center_confidence = centers.get(name) if center_confidence is None: - dist = TorchDistributionWithDomain( - distribution=torch.distributions.Uniform(0.0, 1.0), - domain=UNIT_FLOAT_DOMAIN, - ) + distributions.append(UNIT_UNIFORM_DIST) continue center, confidence = center_confidence @@ -203,7 +197,9 @@ def make_centered( weights[center] = confidence dist = TorchDistributionWithDomain( - distribution=torch.distributions.Categorical(probs=weights), + distribution=torch.distributions.Categorical( + probs=weights, validate_args=False + ), domain=domain, ) distributions.append(dist) @@ -213,13 +209,17 @@ def make_centered( unit_center = domain.to_unit( torch.tensor(center, device=device, dtype=torch.float64) ) + scale = torch.tensor(1 - confidence, device=device, dtype=torch.float64) + a = torch.tensor(0.0, device=device, dtype=torch.float64) + b = torch.tensor(1.0, device=device, dtype=torch.float64) dist = TorchDistributionWithDomain( distribution=TruncatedNormal( loc=unit_center, - scale=(1 - confidence), - a=0.0, - b=1.0, + scale=scale, + a=a, + b=b, device=device, + validate_args=False, ), domain=UNIT_FLOAT_DOMAIN, ) @@ -257,11 +257,29 @@ class CenteredPrior(Prior): distributions: list[TorchDistributionWithDomain] """Distributions along with the corresponding domains they sample from.""" - _distribution_domains: list[Domain] = field(init=False, repr=False) + _distribution_domains: list[Domain] = field(init=False) + + # OPTIM: These are used for an optimization in `log_prob` + _meaningful_ixs: list[int] = field(init=False) + _meaningful_doms: list[Domain] = field(init=False) + _meaningful_dists: list[Distribution] = field(init=False) def __post_init__(self): self._distribution_domains = [dist.domain for dist in self.distributions] + rest: list[tuple[int, Domain, Distribution]] = [] + for i, dist in enumerate(self.distributions): + if dist != UNIT_UNIFORM_DIST: + rest.append((i, dist.domain, dist.distribution)) + + if len(rest) == 0: + self._meaningful_ixs = [] + self._meaningful_doms = [] + self._meaningful_dists = [] + return + + self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip(*rest) + @property @override def ncols(self) -> int: @@ -275,23 +293,31 @@ def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> torch.Tens if x.ndim == 1: x = x.unsqueeze(0) + # OPTIM: We can actually just skip elements that are distributed uniformly as + # **assuming** they are all correctly in bounds, their log_pdf will be 0 and + # contribute nothing. + # It also helps numeric stability to avoid useless computations. + if len(self._meaningful_ixs) == 0: + return torch.zeros(x.shape[:-1], dtype=torch.float64, device=x.device) + + frm = frm if isinstance(frm, Domain) else [frm[i] for i in self._meaningful_ixs] + # Cast all values from the value domains to the domain of the sampler. - sample_domain_tensor = Domain.translate( - x, + translated_x = Domain.translate( + x[..., self._meaningful_ixs], frm=frm, - to=self._distribution_domains, + to=self._meaningful_doms, ) # Calculate the log probabilities of the sample domain tensors under their # respective distributions. 
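# Quick numerical check of the shortcut described above (illustrative values): a
# unit-uniform log-pdf is 0 everywhere inside [0, 1], so dimensions backed by
# UNIT_UNIFORM_DIST add nothing to the summed log probability.
import torch
from torch.distributions import Uniform
x = torch.tensor([0.2, 0.7, 0.95])
assert torch.allclose(Uniform(0.0, 1.0).log_prob(x), torch.zeros(3))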
- itr = enumerate(self.distributions) + itr = iter(zip(self._meaningful_ixs, self._meaningful_dists)) first_i, first_dist = next(itr) + log_probs = first_dist.log_prob(translated_x[..., first_i]) - log_probs = first_dist.distribution.log_prob(sample_domain_tensor[..., first_i]) for i, dist in itr: - log_probs = log_probs + dist.distribution.log_prob( - sample_domain_tensor[..., i] - ) + log_probs = log_probs + dist.log_prob(translated_x[..., i]) + return log_probs @override @@ -330,15 +356,11 @@ class UniformPrior(Prior): ncols: int """The number of columns in the tensor to sample from.""" - _unit_uniform: Distribution = field(init=False, repr=False) - - def __post_init__(self): - self._unit_uniform = torch.distributions.Uniform(0.0, 1.0) - @override def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: - sample_domain_tensor = Domain.translate(x, frm=frm, to=UNIT_FLOAT_DOMAIN) - return torch.sum(self._unit_uniform.log_prob(sample_domain_tensor), dim=-1) + # NOTE: We just assume everything is in bounds... + shape = x.shape[:-1] + return torch.zeros(shape, dtype=torch.float64, device=x.device) @override def sample( diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 5d92fac3..7342a136 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -271,7 +271,7 @@ def cast(self, x: Tensor, frm: Domain) -> Tensor: if same_bounds and same_log_bounds and (self.bins is None or same_bins): if self.round: x = torch.round(x) - return x.type(self.dtype) + return x.type(self.dtype) if x.dtype != self.dtype else x # Shortcut 2. (From normalized) # The domain we are coming from is already normalized, we only need to lift diff --git a/neps/search_spaces/hyperparameters/categorical.py b/neps/search_spaces/hyperparameters/categorical.py index b6a1fe27..6756e828 100644 --- a/neps/search_spaces/hyperparameters/categorical.py +++ b/neps/search_spaces/hyperparameters/categorical.py @@ -17,6 +17,7 @@ import numpy.typing as npt from more_itertools import all_unique +from neps.search_spaces.domain import Domain from neps.search_spaces.parameter import MutatableParameter, ParameterWithPrior if TYPE_CHECKING: @@ -110,6 +111,7 @@ def __init__( self._default_index: int | None = ( self.choices.index(default) if default is not None else None ) + self.domain = Domain.indices(len(self.choices)) @override def clone(self) -> Self: diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index a89c9bcc..6345b1c5 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -7,12 +7,23 @@ import neps +from rich import print -def run_pipeline(float1, float2, float3, categorical, integer1, integer2): +PRINT = False + + +def run_pipeline(float1, float2, float3, integer1, integer2): + if PRINT: + print("float1:", float1) + print("float2:", float2) + print("float3:", float3) + # print("categorical:", categorical) + print("integer1:", integer1) + print("integer2:", integer2) loss = -float( np.sum( [ - (float1 * float2 / (float3 + 1)) * int(categorical), + (float1 * float2 / (float3 + 1)) * 1, # ,(int(categorical) + 1), integer1, math.log(integer2), ] @@ -23,12 +34,12 @@ def run_pipeline(float1, float2, float3, categorical, integer1, integer2): pipeline_space = dict( - float1=neps.FloatParameter(lower=0, upper=1), - float2=neps.FloatParameter(lower=0, upper=20), - float3=neps.FloatParameter(lower=0, upper=5), - categorical=neps.CategoricalParameter(choices=[0, 
1]), - integer1=neps.IntegerParameter(lower=0, upper=1), - integer2=neps.IntegerParameter(lower=1, upper=1000, log=True), + float1=neps.FloatParameter(lower=0, upper=1, default=0.95), + float2=neps.FloatParameter(lower=0, upper=20, default=19.5), + float3=neps.FloatParameter(lower=0, upper=5, default=0.5), + # categorical=neps.CategoricalParameter(choices=[0, 1]), + integer1=neps.IntegerParameter(lower=0, upper=1, default=1), + integer2=neps.IntegerParameter(lower=1, upper=1000, log=True, default=950), ) logging.basicConfig(level=logging.INFO) From b2d3e15af9829868741820a98d09f5114b6a4908 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 30 Aug 2024 15:59:46 +0200 Subject: [PATCH 30/63] fix: Remove stray prints --- .../bayesian_optimization/acquisition_functions/pibo.py | 3 --- neps_examples/basic_usage/hyperparameters.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py index 76499ba1..db3120e7 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py @@ -39,9 +39,6 @@ def apply_pibo_acquisition_weight( x_domain: Domain | list[Domain], prior_exponent: float, ): - import rich - - rich.print(prior_exponent) if acq._log: weighted_log_probs = prior.log_prob(X, frm=x_domain) * prior_exponent return acq_values + weighted_log_probs diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index 6345b1c5..a137fabf 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -30,7 +30,7 @@ def run_pipeline(float1, float2, float3, integer1, integer2): ) ) # Random noise # time.sleep(0.7) # For demonstration purposes - return loss + return {"loss": loss, "cost": math.log(integer2)} pipeline_space = dict( From f209f4773c315e99f8ce9cd7b2cc80824d4add79 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 30 Aug 2024 16:09:47 +0200 Subject: [PATCH 31/63] optim: Lengthscale has more wiggle-room in high d --- neps/optimizers/bayesian_optimization/models/gp.py | 4 ++-- neps/optimizers/bayesian_optimization/optimizer.py | 8 ++++---- neps_examples/basic_usage/hyperparameters.py | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 39e81cba..b3c50112 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -42,7 +42,7 @@ def default_likelihood_with_prior() -> gpytorch.likelihoods.GaussianLikelihood: # even a 0.01% noise, we need that all the way up to 1e-2. Hence # # If we had 10% noise and we allow the noise to easily optimize towards - # 1e-8, then the lengthscales are forced to beome very small, essentially + # 1e-8, then the lengthscales are forced to become very small, essentially # overfitting. If we have 0% noise and we don't allow it to easily get low # then we will drastically underfit. 
# A guiding principle here is that we should allow the noise to be just @@ -90,7 +90,7 @@ def default_lengthscale_prior( # of the dimension and number of samples lengthscale_prior = gpytorch.priors.LogNormalPrior( loc=math.sqrt(2.0) + math.log(N) / 2, - scale=math.sqrt(3.0), + scale=math.sqrt(3.0) * math.log(N), ) # NOTE: It's possible to just specify `GreaterThan`, however # digging through the code, if this ends up at botorch's optimize, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 7d57e936..eb916fb6 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -308,8 +308,8 @@ def ask( trial.report.loss if trial.report.loss is not None else torch.nan ) if self.use_cost: - cost = trial.report.cost - costs.append(cost if cost is not None else torch.nan) + cost_z_score = trial.report.cost + costs.append(cost_z_score if cost_z_score is not None else torch.nan) x = self.encoder.pack(x_configs, device=self.device) maybe_x_pending_tensor = None @@ -362,12 +362,12 @@ def ask( # of the configurations. if self.use_cost: cost = torch.tensor(costs, dtype=torch.float64, device=self.device) - cost = _missing_cost_strategy(cost) + cost_z_score = _missing_cost_strategy(cost) # TODO: We might want a different model for cost estimation... one reason # is that cost estimates are likely to be a lot noisier than the likelihood # we have by default. - cost_model, cost_likelihood = self._get_model(x, cost) + cost_model, cost_likelihood = self._get_model(x, cost_z_score) # Optimize the cost model fit_gpytorch_mll( diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index a137fabf..e28cf585 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -45,6 +45,7 @@ def run_pipeline(float1, float2, float3, integer1, integer2): logging.basicConfig(level=logging.INFO) neps.run( run_pipeline=run_pipeline, + searcher="pibo", pipeline_space=pipeline_space, root_directory="results/hyperparameters_example", post_run_summary=True, From c8151634e847da075c01c33c55f0ac3ab1362538 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Sun, 8 Sep 2024 18:43:25 +0200 Subject: [PATCH 32/63] feat: Cost cooling --- .../acquisition_functions/cost_cooling.py | 29 +++++----- .../weighted_acquisition.py | 1 + .../bayesian_optimization/models/gp.py | 54 ++++++++++++------- .../bayesian_optimization/optimizer.py | 51 ++++++++++-------- neps_examples/basic_usage/hyperparameters.py | 12 ++--- pyproject.toml | 2 +- 6 files changed, 87 insertions(+), 62 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py index 4741705f..a32baebe 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING +import torch from botorch.acquisition.logei import partial from neps.optimizers.bayesian_optimization.acquisition_functions.weighted_acquisition import ( @@ -9,10 +10,8 @@ ) if TYPE_CHECKING: - import torch from botorch.acquisition import AcquisitionFunction - from botorch.models.gp_regression import Likelihood - from botorch.models.model import Model + from botorch.acquisition.analytic import GPyTorchModel from torch import Tensor @@ 
-20,23 +19,28 @@ def apply_cost_cooling( acq_values: Tensor, X: Tensor, acq: AcquisitionFunction, - cost_model: Model, - likelihood: Likelihood, + cost_model: GPyTorchModel, alpha: float, ) -> Tensor: - posterior = likelihood(cost_model(X)) - cost = posterior.mean + # NOTE: We expect **positive** costs from model + cost = cost_model.posterior(X).mean + cost = cost.squeeze(dim=-1) if cost_model.num_outputs == 1 else cost.sum(dim=-1) if acq._log: - # can derive from eq log(x) = log(acq / cost^alpha) - return acq_values - alpha * cost.log() - return acq_values / cost.pow(alpha) + # Take log of both sides, acq is already log scaled + # -- x = acq / cost^alpha + # -- log(x) = log(acq) - alpha * log(cost) + w = alpha * cost.log() + return acq_values - w + + # https://github.com/pytorch/botorch/discussions/2194 + w = cost.pow(alpha) + return torch.where(acq_values > 0, acq_values / w, acq_values * w) def cost_cooled_acq( acq_fn: AcquisitionFunction, - model: Model, - likelihood: Likelihood, + model: GPyTorchModel, used_budget_percentage: float, X_pending: torch.Tensor | None = None, ) -> WeightedAcquisition: @@ -46,7 +50,6 @@ def cost_cooled_acq( apply_weight=partial( apply_cost_cooling, cost_model=model, - likelihood=likelihood, alpha=1 - used_budget_percentage, ), X_pending=X_pending, diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py index eadf9207..fa7ca176 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -119,6 +119,7 @@ def __init__( self.set_X_pending(X_pending) self.apply_weight = apply_weight self.acq = acq + self._log = acq._log # Taken from PiBO implementation in botorch (PriorGuidedAcquisitionFunction). @concatenate_pending_points diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index b3c50112..7dd29dd7 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -1,3 +1,5 @@ +"""Gaussian Process models for Bayesian Optimization.""" + from __future__ import annotations import logging @@ -12,6 +14,7 @@ from botorch.models.gp_regression_mixed import ( CategoricalKernel, Likelihood, + OutcomeTransform, ) from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed @@ -33,7 +36,12 @@ T = TypeVar("T") -def default_likelihood_with_prior() -> gpytorch.likelihoods.GaussianLikelihood: +def likelihood_with_prior_on_log_scale( + mean: float = 1e-2, + std: float = math.sqrt(3), + bounds: tuple[float, float] = (1e-6, 1), +) -> gpytorch.likelihoods.GaussianLikelihood: + """Default Gaussian likelihood with priors for the noise.""" # The effect of the likelihood of noise is pretty crucial w.r.t. 
# whether we are going to overfit every point by overfitting with # the lengthscale, or whether we smooth through and assume variation @@ -54,25 +62,21 @@ def default_likelihood_with_prior() -> gpytorch.likelihoods.GaussianLikelihood: # TOOD: We may want to move the likelihood inside the GP and decay the # amount the GP can attribute to noise (reduce std and mean) relative # to samples seen, effectively reducing the smoothness of the GP overtime - noise_mean = 1e-2 - noise_std = math.sqrt(3) - _noise_prior = gpytorch.priors.LogNormalPrior( - math.log(noise_mean) + noise_std**2, - noise_std, - ) + _noise_prior = gpytorch.priors.LogNormalPrior(math.log(mean) + std**2, std) return gpytorch.likelihoods.GaussianLikelihood( noise_prior=_noise_prior, # Going below 1e-6 could introduuce a lot of numerical instability in the # kernels, even if it's a noiseless function noise_constraint=gpytorch.constraints.Interval( - lower_bound=1e-6, - upper_bound=1, - initial_value=noise_mean, + lower_bound=bounds[0], + upper_bound=bounds[1], + initial_value=mean, ), ) def default_signal_variance_prior() -> gpytorch.priors.NormalPrior: + """Default prior for the signal variance.""" # The outputscale prior is a bit more tricky. Essentially # it describes how much we expect the function to move # around the mean (0 as we normalize the `ys`) @@ -85,6 +89,7 @@ def default_signal_variance_prior() -> gpytorch.priors.NormalPrior: def default_lengthscale_prior( N: int, ) -> tuple[gpytorch.priors.LogNormalPrior, gpytorch.constraints.Interval]: + """Default prior for the lengthscale.""" # Based on `Vanilla GP work great in High Dimensions` by Carl Hvafner # TODO: I'm not convinced entirely that the `std` is independant # of the dimension and number of samples @@ -107,6 +112,7 @@ def default_lengthscale_prior( def default_mean() -> gpytorch.means.ConstantMean: + """Default mean for the GP.""" return gpytorch.means.ConstantMean( constant_prior=gpytorch.priors.NormalPrior(0, 0.2), constant_constraint=gpytorch.constraints.Interval( @@ -121,6 +127,7 @@ def default_matern_kernel( N: int, active_dims: tuple[int, ...] | None = None, ) -> ScaleKernel: + """Default Matern kernel for the GP.""" lengthscale_prior, lengthscale_constraint = default_lengthscale_prior(N) return ScaleKernel( @@ -138,6 +145,7 @@ def default_categorical_kernel( N: int, active_dims: tuple[int, ...] | None = None, ) -> ScaleKernel: + """Default Categorical kernel for the GP.""" # Following BoTorches implementation of the MixedSingleTaskGP return ScaleKernel( CategoricalKernel( @@ -151,9 +159,16 @@ def default_categorical_kernel( def default_single_obj_gp( x: TensorPack, y: torch.Tensor, + *, + outcome_transform: OutcomeTransform | None = None, ) -> tuple[SingleTaskGP, Likelihood]: + """Default GP for single objective optimization.""" if y.ndim == 1: y = y.unsqueeze(-1) + + if outcome_transform is None: + outcome_transform = Standardize(m=1) + encoder = x.encoder numerics: list[int] = [] categoricals: list[int] = [] @@ -163,7 +178,9 @@ def default_single_obj_gp( else: numerics.append(encoder.index_of[hp_name]) - likelihood = default_likelihood_with_prior() + # TODO: If we have a low cardinality integer, we should consider + # just treating it as a categorical... 
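# Usage note (illustrative values): the call below relies on the factory's defaults
# (~1% expected noise, constrained to [1e-6, 1]); a known-noisier objective could
# instead widen the prior, e.g.
#   likelihood = likelihood_with_prior_on_log_scale(mean=1e-1, bounds=(1e-6, 1.0))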
+ likelihood = likelihood_with_prior_on_log_scale() # Purely vectorial if len(categoricals) == 0: @@ -174,7 +191,7 @@ def default_single_obj_gp( likelihood=likelihood, # Only matern kernel covar_module=default_matern_kernel(len(numerics)), - outcome_transform=Standardize(m=1), + outcome_transform=outcome_transform, ) return gp, likelihood @@ -187,7 +204,7 @@ def default_single_obj_gp( likelihood=likelihood, # Only categorical kernel covar_module=default_categorical_kernel(len(categoricals)), - outcome_transform=Standardize(m=1), + outcome_transform=outcome_transform, ) return gp, likelihood @@ -215,7 +232,7 @@ def default_single_obj_gp( mean_module=default_mean(), likelihood=likelihood, covar_module=kernel, - outcome_transform=Standardize(m=1), + outcome_transform=outcome_transform, ) return gp, likelihood @@ -226,15 +243,16 @@ def optimize_acq( *, n_candidates_required: int = 1, num_restarts: int = 20, - n_intial_start_points: int = 512, + n_intial_start_points: int = 256, acq_options: Mapping[str, Any] | None = None, maximum_allowed_categorical_combinations: int = 30, ) -> tuple[torch.Tensor, torch.Tensor]: + """Optimize the acquisition function.""" acq_options = acq_options or {} lower = [domain.lower for domain in encoder.domains] upper = [domain.upper for domain in encoder.domains] - bounds = torch.tensor([lower, upper], dtype=torch.float) + bounds = torch.tensor([lower, upper], dtype=torch.float64) cat_transformers = { name: t @@ -281,8 +299,8 @@ def optimize_acq( else: fixed_cats = [dict(zip(cats.keys(), combo)) for combo in product(*cats.values())] - # TODO: we should deterministicall shuffle the fixed_categoricals as the - # underlying function does not. + # TODO: we should deterministically shuffle the fixed_categoricals + # as the underlying function does not. 
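# Minimal sketch (hypothetical column indices and cardinalities) of how the
# `fixed_cats` list built above enumerates every categorical combination that
# `optimize_acqf_mixed` then receives as fixed features.
from itertools import product
cats = {2: [0, 1], 3: [0, 1, 2]}  # encoded column -> allowed category indices
fixed = [dict(zip(cats, combo)) for combo in product(*cats.values())]
# fixed == [{2: 0, 3: 0}, {2: 0, 3: 1}, {2: 0, 3: 2},
#           {2: 1, 3: 0}, {2: 1, 3: 1}, {2: 1, 3: 2}]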
return optimize_acqf_mixed( acq_function=acq_fn, bounds=bounds, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index eb916fb6..d2b58c19 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,11 +1,15 @@ from __future__ import annotations import math -from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal, Mapping import torch -from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement +from botorch.acquisition import ( + LinearMCObjective, +) +from botorch.acquisition.logei import qLogNoisyExpectedImprovement from botorch.fit import fit_gpytorch_mll +from botorch.models.transforms.outcome import ChainedOutcomeTransform, Log, Standardize from gpytorch import ExactMarginalLogLikelihood from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig @@ -20,13 +24,10 @@ optimize_acq, ) from neps.sampling import Prior, Sampler -from neps.search_spaces.encoding import TensorEncoder, TensorPack +from neps.search_spaces.encoding import TensorEncoder from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: - from botorch.models.gp_regression_mixed import Likelihood - from botorch.models.model import Model - from neps.search_spaces import ( SearchSpace, ) @@ -174,9 +175,6 @@ def __init__( # noqa: D417 pipeline_space: SearchSpace, *, initial_design_size: int | None = None, - surrogate_model: ( - Literal["gp"] | Callable[[TensorPack, torch.Tensor], tuple[Model, Likelihood]] - ) = "gp", use_priors: bool = False, use_cost: bool = False, sample_default_first: bool = False, @@ -192,8 +190,6 @@ def __init__( # noqa: D417 initial_design_size: Number of samples used before using the surrogate model. If None, it will take `int(log(N) ** 2)` samples where `N` is the number of parameters in the search space. - surrogate_model: Surrogate model, either a known model str or a callable - that takes in the training data and returns a model fitted to (X, y). use_priors: Whether to use priors set on the hyperparameters during search. use_cost: Whether to consider reported "cost" from configurations in decision making. If True, the optimizer will weigh potential candidates by how much @@ -241,10 +237,6 @@ def __init__( # noqa: D417 self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - self._get_model = ( - default_single_obj_gp if surrogate_model == "gp" else surrogate_model - ) - self.initial_design_: list[dict[str, Any]] | None = None def ask( @@ -321,14 +313,21 @@ def ask( y = _missing_y_strategy(y) # Now fit our model - y_model, y_likelihood = self._get_model(x, y) + y_model, y_likelihood = default_single_obj_gp(x, y) fit_gpytorch_mll( ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) ) - acq = qLogExpectedImprovement( + # NOTE: We use: + # * q - allows accounting for pending points, normally used to get a batch + # of points. + # * log - More numerically stable + # * Noisy - In Deep-Learning, we shouldn't take f.min() incase it was a noise + # spike. This accounts for noise in objective. + # * ExpectedImprovement - Cause ya know, the default. 
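# Sketch of the transformation mentioned just below (assumed form, not verified
# against the full diff): BoTorch acquisition functions maximise, so minimising the
# reported loss is done by negating it through the MC objective, e.g.
#   objective=LinearMCObjective(weights=torch.tensor([-1.0]))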
+ acq = qLogNoisyExpectedImprovement( y_model, - best_f=y.min(), + X_baseline=x.tensor, X_pending=maybe_x_pending_tensor, # Unfortunatly, there's no option to indicate that we minimize # the AcqFunction so we need to do some kind of transformation. @@ -364,10 +363,16 @@ def ask( cost = torch.tensor(costs, dtype=torch.float64, device=self.device) cost_z_score = _missing_cost_strategy(cost) - # TODO: We might want a different model for cost estimation... one reason - # is that cost estimates are likely to be a lot noisier than the likelihood - # we have by default. - cost_model, cost_likelihood = self._get_model(x, cost_z_score) + cost_model, cost_likelihood = default_single_obj_gp( + x, + cost_z_score, + outcome_transform=ChainedOutcomeTransform( + # TODO: Maybe some way for a user to specify their cost + # is on a log scale? + log=Log(), + standardize=Standardize(m=1), + ), + ) # Optimize the cost model fit_gpytorch_mll( @@ -376,8 +381,8 @@ def ask( acq = cost_cooled_acq( acq_fn=acq, model=cost_model, - likelihood=cost_likelihood, used_budget_percentage=_cost_used_budget_percentage(budget_info), + X_pending=maybe_x_pending_tensor, ) # Finally, optimize the acquisition function to get a configuration diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index e28cf585..eb7e095b 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -7,8 +7,6 @@ import neps -from rich import print - PRINT = False @@ -21,16 +19,16 @@ def run_pipeline(float1, float2, float3, integer1, integer2): print("integer1:", integer1) print("integer2:", integer2) loss = -float( - np.sum( + integer2 + * np.sum( [ - (float1 * float2 / (float3 + 1)) * 1, # ,(int(categorical) + 1), + (float1 * float2 / (float3 + 1)), # * (int(categorical) + 1), integer1, - math.log(integer2), ] ) ) # Random noise # time.sleep(0.7) # For demonstration purposes - return {"loss": loss, "cost": math.log(integer2)} + return {"loss": loss, "cost": float(integer2)} pipeline_space = dict( @@ -45,7 +43,7 @@ def run_pipeline(float1, float2, float3, integer1, integer2): logging.basicConfig(level=logging.INFO) neps.run( run_pipeline=run_pipeline, - searcher="pibo", + searcher="bayesian_optimization", pipeline_space=pipeline_space, root_directory="results/hyperparameters_example", post_run_summary=True, diff --git a/pyproject.toml b/pyproject.toml index 27e49fa2..cd808439 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ grakel = "^0.1" numpy = "^1" pandas = "^2" networkx = "^2.6.3" -nltk = "^3.6.4" +nltk = "^3" scipy = "^1" torch = ">1.7.0,!=2.0.1, !=2.1.0" matplotlib = "^3" From da47d701bf0f7907d447cc389ae3d0830ce32337 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Sun, 8 Sep 2024 18:55:53 +0200 Subject: [PATCH 33/63] optim: Scale initial sample points for acq. 
opt based on ndims --- neps/optimizers/bayesian_optimization/models/gp.py | 13 +++++++++++-- neps/optimizers/bayesian_optimization/optimizer.py | 8 +++++++- neps_examples/basic_usage/hyperparameters.py | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 7dd29dd7..8436310c 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -243,7 +243,7 @@ def optimize_acq( *, n_candidates_required: int = 1, num_restarts: int = 20, - n_intial_start_points: int = 256, + n_intial_start_points: int | None = None, acq_options: Mapping[str, Any] | None = None, maximum_allowed_categorical_combinations: int = 30, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -260,6 +260,15 @@ def optimize_acq( if isinstance(t, CategoricalToIntegerTransformer) } if not any(cat_transformers): + # Small heuristic to increase the number of candidates as our dimensionality + # increases... we apply a cap. + if n_intial_start_points is None: + # TODO: Need to investigate how num_restarts is used in botorch to inform + # this proxy. + + # Cap out at 4096 when len(bounds) >= 8 + n_intial_start_points = min(64 * len(bounds) ** 2, 4096) + return optimize_acqf( acq_function=acq_fn, bounds=bounds, @@ -304,7 +313,7 @@ def optimize_acq( return optimize_acqf_mixed( acq_function=acq_fn, bounds=bounds, - num_restarts=num_restarts, + num_restarts=min(num_restarts // n_combos, 2), raw_samples=n_intial_start_points, q=n_candidates_required, fixed_features_list=fixed_cats, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index d2b58c19..5a6593ba 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -313,7 +313,13 @@ def ask( y = _missing_y_strategy(y) # Now fit our model - y_model, y_likelihood = default_single_obj_gp(x, y) + y_model, y_likelihood = default_single_obj_gp( + x, + y, + # TODO: We should consider applying some heurisitc to see if this should + # also include a log transform, similar as we do to cost if using `use_cost`. 
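The raw-sample heuristic introduced in the gp.py hunk above is a capped quadratic in the dimensionality. A small sketch, treating len(bounds) as the number of tuned dimensions d as the surrounding comment suggests; the helper name is made up for the illustration only.

    def n_initial_start_points(d: int, cap: int = 4096) -> int:
        # 64 * d**2 raw samples for seeding the acquisition optimisation, capped at 4096.
        return min(64 * d**2, cap)

    print([n_initial_start_points(d) for d in (1, 2, 4, 8, 16)])
    # [64, 256, 1024, 4096, 4096] -> the cap takes over from d = 8 onwards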
+ outcome_transform=Standardize(m=1), + ) fit_gpytorch_mll( ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) ) diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index eb7e095b..6ea897f8 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -48,4 +48,5 @@ def run_pipeline(float1, float2, float3, integer1, integer2): root_directory="results/hyperparameters_example", post_run_summary=True, max_evaluations_total=50, + use_prior=True, ) From 5a710303711e90c52deaadf6e3c78b889bbe4da1 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 12:57:43 +0200 Subject: [PATCH 34/63] fix: Remove old model --- neps/optimizers/bayesian_optimization/models/__init__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 789b2b05..35ae2120 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,9 +1,5 @@ -from neps.optimizers.bayesian_optimization.models.gp import ComprehensiveGP -from neps.utils.common import MissingDependencyError - from .ftpfn import FTPFNSurrogate SurrogateModelMapping = { - "gp": ComprehensiveGP, "ftpfn": FTPFNSurrogate, } From afc904d89482afa21de8084b1cd19851fc10c3e4 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 12:59:20 +0200 Subject: [PATCH 35/63] ci: Update ruff version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 775823dc..9b670189 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ torchvision = ">=0.8.0" ifbo = ">=0.3.10" [tool.poetry.group.dev.dependencies] -ruff = "^0.4" +ruff = "*" pre-commit = "^3" mypy = "^1" pytest = "^7" From f0ec81ed21bf2543ff8ef296c18fde6c8ad8679a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 13:00:46 +0200 Subject: [PATCH 36/63] fix: Update pre-commit --- .pre-commit-config.yaml | 6 +- neps/optimizers/__init__.py | 10 +- neps/optimizers/base_optimizer.py | 31 +-- .../acquisition_functions/__init__.py | 5 +- .../acquisition_functions/_ehvi.py | 2 + .../acquisition_functions/base_acquisition.py | 2 + .../acquisition_functions/ei.py | 3 +- .../acquisition_functions/mf_pi.py | 105 +++++----- .../acquisition_functions/prior_weighted.py | 3 +- .../acquisition_functions/ucb.py | 20 +- .../weighted_acquisition.py | 3 +- .../acquisition_samplers/base_acq_sampler.py | 4 +- .../acquisition_samplers/evolution_sampler.py | 21 +- .../freeze_thaw_sampler.py | 16 +- .../acquisition_samplers/mutation_sampler.py | 11 +- .../acquisition_samplers/random_sampler.py | 8 +- .../bayesian_optimization/kernels/__init__.py | 4 +- .../kernels/get_kernels.py | 14 +- .../kernels/grakel_replace/edge_histogram.py | 154 +++++++------- .../kernels/grakel_replace/utils.py | 18 +- .../grakel_replace/vertex_histogram.py | 198 +++++++++--------- .../grakel_replace/weisfeiler_lehman.py | 193 ++++++++--------- .../bayesian_optimization/kernels/utils.py | 18 +- .../bayesian_optimization/models/ftpfn.py | 10 +- .../bayesian_optimization/models/gp.py | 8 +- .../bayesian_optimization/optimizer.py | 3 +- neps/optimizers/grid_search/optimizer.py | 12 +- neps/optimizers/info.py | 20 +- neps/optimizers/multi_fidelity/hyperband.py | 163 +++++++------- neps/optimizers/multi_fidelity/ifbo.py | 69 +++--- 
neps/optimizers/multi_fidelity/mf_bo.py | 62 +++--- .../multi_fidelity/promotion_policy.py | 16 +- .../multi_fidelity/sampling_policy.py | 74 ++++--- .../multi_fidelity/successive_halving.py | 63 +++--- neps/optimizers/multi_fidelity/utils.py | 77 +++---- .../multi_fidelity_prior/async_priorband.py | 122 +++++------ .../multi_fidelity_prior/priorband.py | 85 ++++---- neps/optimizers/multi_fidelity_prior/utils.py | 23 +- .../prototype_optimizer.py | 14 +- neps/optimizers/random_search/optimizer.py | 12 +- .../regularized_evolution/optimizer.py | 36 ++-- neps/optimizers/utils.py | 22 +- neps/plot/tensorboard_eval.py | 5 +- neps/sampling/distributions.py | 3 +- neps/sampling/priors.py | 11 +- neps/sampling/samplers.py | 3 +- neps/search_spaces/domain.py | 5 +- neps/search_spaces/encoding.py | 10 +- neps/state/__init__.py | 2 +- 49 files changed, 907 insertions(+), 872 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92ff2356..aa81c5bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: files: '^src/.*\.py$' - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.1 + rev: v1.11.2 hooks: - id: mypy files: | @@ -42,7 +42,7 @@ repos: - "--show-traceback" - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.29.1 + rev: 0.29.2 hooks: - id: check-github-workflows files: '^github/workflows/.*\.ya?ml$' @@ -51,7 +51,7 @@ repos: files: '^\.github/dependabot\.ya?ml$' - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.5.5 + rev: v0.6.5 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix, --no-cache] diff --git a/neps/optimizers/__init__.py b/neps/optimizers/__init__.py index a6c0d5f9..c96022f6 100644 --- a/neps/optimizers/__init__.py +++ b/neps/optimizers/__init__.py @@ -1,18 +1,17 @@ - - +from collections.abc import Callable, Mapping from functools import partial -from typing import TYPE_CHECKING, Callable, Mapping +from typing import TYPE_CHECKING from .base_optimizer import BaseOptimizer from .bayesian_optimization.optimizer import BayesianOptimization from .grid_search.optimizer import GridSearch -from .multi_fidelity.ifbo import IFBO from .multi_fidelity.hyperband import ( MOBSTER, AsynchronousHyperband, Hyperband, HyperbandCustomDefault, ) +from .multi_fidelity.ifbo import IFBO from .multi_fidelity.successive_halving import ( AsynchronousSuccessiveHalving, AsynchronousSuccessiveHalvingWithPriors, @@ -24,9 +23,6 @@ from .random_search.optimizer import RandomSearch from .regularized_evolution.optimizer import RegularizedEvolution -if TYPE_CHECKING: - from .base_optimizer import BaseOptimizer - # TODO: Rename Searcher to Optimizer... 
SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = { "bayesian_optimization": partial(BayesianOptimization, use_priors=False), diff --git a/neps/optimizers/base_optimizer.py b/neps/optimizers/base_optimizer.py index c5b5f83f..a80f9f75 100644 --- a/neps/optimizers/base_optimizer.py +++ b/neps/optimizers/base_optimizer.py @@ -1,15 +1,18 @@ - +from __future__ import annotations import logging from abc import abstractmethod -from typing import Any, Mapping - +from collections.abc import Mapping from dataclasses import asdict, dataclass -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig, ERROR, ResultDict -from neps.search_spaces.search_space import SearchSpace -from neps.utils.data_loading import _get_cost, _get_learning_curve, _get_loss +from typing import TYPE_CHECKING, Any + from neps.state.trial import Trial +from neps.utils.data_loading import _get_cost, _get_learning_curve, _get_loss +from neps.utils.types import ERROR, ConfigResult, RawConfig, ResultDict + +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo @dataclass @@ -58,7 +61,7 @@ def load_optimization_state( @abstractmethod def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: - """Sample a new configuration + """Sample a new configuration. Returns: config: serializable object representing the configuration @@ -74,7 +77,7 @@ def ask( budget_info: BudgetInfo | None, optimizer_state: dict[str, Any], ) -> tuple[SampledConfig, dict[str, Any]]: - """Sample a new configuration + """Sample a new configuration. !!! note @@ -145,8 +148,8 @@ def get_loss( self, result: ERROR | ResultDict | float | Trial.Report ) -> float | ERROR: """Calls result.utils.get_loss() and passes the error handling through. - Please use self.get_loss() instead of get_loss() in all optimizer classes.""" - + Please use self.get_loss() instead of get_loss() in all optimizer classes. + """ # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call. We can also then just use `None` instead of the string `"error"` @@ -163,7 +166,8 @@ def get_cost( self, result: ERROR | ResultDict | float | Trial.Report ) -> float | ERROR: """Calls result.utils.get_cost() and passes the error handling through. - Please use self.get_cost() instead of get_cost() in all optimizer classes.""" + Please use self.get_cost() instead of get_cost() in all optimizer classes. + """ # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call @@ -180,7 +184,8 @@ def get_learning_curve( self, result: str | dict | float | Trial.Report ) -> list[float] | Any: """Calls result.utils.get_loss() and passes the error handling through. - Please use self.get_loss() instead of get_loss() in all optimizer classes.""" + Please use self.get_loss() instead of get_loss() in all optimizer classes. 
+ """ # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py index 542664c4..a125997d 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py @@ -1,13 +1,10 @@ +from collections.abc import Callable from functools import partial -from typing import Callable from neps.optimizers.bayesian_optimization.acquisition_functions.ei import ( ComprehensiveExpectedImprovement, ) from neps.optimizers.bayesian_optimization.acquisition_functions.mf_pi import MFPI_Random -from neps.optimizers.bayesian_optimization.acquisition_functions.ucb import ( - UpperConfidenceBound, -) from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py index 8722c545..236a17e5 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/_ehvi.py @@ -1,4 +1,6 @@ # from abc import ABC, abstractmethod +from __future__ import annotations + from itertools import product import torch diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py index 7249c0fd..b2f8783a 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/base_acquisition.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from abc import ABC, abstractmethod diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py index 1a4e24d0..aadb76a0 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ei.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence +from collections.abc import Sequence +from typing import TYPE_CHECKING import torch from torch.distributions import Normal diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py index 71955820..ba2e886b 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py @@ -1,26 +1,33 @@ # type: ignore -from typing import Any, Iterable, Tuple, Union +from __future__ import annotations + +from collections.abc import Iterable +from typing import TYPE_CHECKING, Any import numpy as np -import pandas as pd import torch -from copy import deepcopy - -from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids -from neps.search_spaces.search_space import SearchSpace +from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, +) from neps.optimizers.multi_fidelity.utils import ( - get_freeze_thaw_normalized_step, get_tokenized_data, 
MFObservedData + MFObservedData, + get_freeze_thaw_normalized_step, + get_tokenized_data, ) -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import BaseAcquisition +from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids +if TYPE_CHECKING: + import pandas as pd + + from neps.search_spaces.search_space import SearchSpace -class MFPI(BaseAcquisition): +class MFPI(BaseAcquisition): def __init__( self, pipeline_space: SearchSpace, - surrogate_model_name: str = None, + surrogate_model_name: str | None = None, ): super().__init__() self.pipeline_space = pipeline_space @@ -34,7 +41,7 @@ def set_state( pipeline_space: SearchSpace, surrogate_model: Any, observations: MFObservedData, - b_step: Union[int, float], + b_step: int | float, **kwargs, ): # overload to select incumbent differently through observations @@ -42,9 +49,8 @@ def set_state( self.surrogate_model = surrogate_model self.observations = observations self.b_step = b_step - return - def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: + def preprocess(self, x: pd.Series) -> tuple[pd.Series, torch.Tensor]: """Prepares the configurations for appropriate EI calculation. Takes a set of points and computes the budget and incumbent for each point, as @@ -52,7 +58,7 @@ def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: """ raise NotImplementedError - def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Series]: + def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Series]: # deepcopy # _x = pd.Series([deepcopy(x.loc[idx]) for idx in x.index.values], index=x.index) if self.surrogate_model_name == "ftpfn": @@ -64,18 +70,20 @@ def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Ser idx_mask = np.where(_idx > max(self.observations.seen_config_ids))[0] _idx[idx_mask] = 0 # normalizing steps - _steps = torch.Tensor([ - get_freeze_thaw_normalized_step( - _conf.fidelity.value, - self.pipeline_space.fidelity.lower, - self.pipeline_space.fidelity.upper, - self.b_step - ) - for _conf in _x - ]) - _x_tok = torch.hstack(( - (_idx).reshape(-1, 1), _steps.reshape(-1, 1), torch.Tensor(_x_tok) - )) + _steps = torch.Tensor( + [ + get_freeze_thaw_normalized_step( + _conf.fidelity.value, + self.pipeline_space.fidelity.lower, + self.pipeline_space.fidelity.upper, + self.b_step, + ) + for _conf in _x + ] + ) + _x_tok = torch.hstack( + ((_idx).reshape(-1, 1), _steps.reshape(-1, 1), torch.Tensor(_x_tok)) + ) pi = self.eval_pfn_pi(_x_tok, inc_list) else: raise ValueError( @@ -85,12 +93,11 @@ def eval(self, x: pd.Series, asscalar: bool = False) -> Tuple[np.ndarray, pd.Ser pi = pi.cpu() if len(_x) > 1 and asscalar: return pi.detach().numpy(), _x - else: - return pi.detach().numpy().item(), _x + return pi.detach().numpy().item(), _x def eval_pfn_pi( self, x: Iterable, inc_list: Iterable - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: """PFN-PI modified to preprocess samples and accept list of incumbents.""" pi = self.surrogate_model.get_pi(x.to(self.surrogate_model.device), inc_list) if len(pi.shape) == 2: @@ -99,7 +106,6 @@ def eval_pfn_pi( class MFPI_Random(MFPI): - BUDGET = 1000 def __init__( @@ -107,7 +113,7 @@ def __init__( pipeline_space: SearchSpace, horizon: str = "random", threshold: str = "random", - surrogate_model_name: str = None, + surrogate_model_name: str | None = None, ): super().__init__(pipeline_space, surrogate_model_name) self.horizon = horizon @@ 
-118,35 +124,34 @@ def set_state( pipeline_space: SearchSpace, surrogate_model: Any, observations: MFObservedData, - b_step: Union[int, float], + b_step: int | float, **kwargs, ): # set RNG self.rng = np.random.RandomState(seed=42) - for i in range(len(observations.completed_runs)): - self.rng.uniform(-4,-1) - self.rng.randint(1,51) + for _i in range(len(observations.completed_runs)): + self.rng.uniform(-4, -1) + self.rng.randint(1, 51) return super().set_state(pipeline_space, surrogate_model, observations, b_step) def sample_horizon(self, steps_passed): - if self.horizon == 'random': + if self.horizon == "random": shortest = self.pipeline_space.fidelity.lower longest = min(self.pipeline_space.fidelity.upper, self.BUDGET - steps_passed) - return self.rng.randint(shortest, longest+1) - elif self.horizon == 'max': + return self.rng.randint(shortest, longest + 1) + if self.horizon == "max": return min(self.pipeline_space.fidelity.upper, self.BUDGET - steps_passed) - else: - return int(self.horizon) + return int(self.horizon) def sample_performance_threshold(self, f_inc): - if self.threshold == 'random': - lu = 10**self.rng.uniform(-4,-1) # % of gap closed + if self.threshold == "random": + lu = 10 ** self.rng.uniform(-4, -1) # % of gap closed else: lu = float(self.threshold) return f_inc * (1 - lu) - def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: + def preprocess(self, x: pd.Series) -> tuple[pd.Series, torch.Tensor]: """Prepares the configurations for appropriate EI calculation. Takes a set of points and computes the budget and incumbent for each point, as @@ -180,11 +185,13 @@ def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: indices_to_drop.append(i) else: # a candidate partial training run to continue - config.update_hp_values({ - config.fidelity_name: min( - config.fidelity.value + horizon, config.fidelity.upper - ) # if horizon exceeds max, query at max - }) + config.update_hp_values( + { + config.fidelity_name: min( + config.fidelity.value + horizon, config.fidelity.upper + ) # if horizon exceeds max, query at max + } + ) inc_list.append(inc_value) else: # a candidate new training run that we would need to start @@ -192,7 +199,7 @@ def preprocess(self, x: pd.Series) -> Tuple[pd.Series, torch.Tensor]: inc_list.append(inc_value) # Drop unused configs - x.drop(labels=indices_to_drop, inplace=True) + x = x.drop(labels=indices_to_drop) assert len(inc_list) == len(x) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py index 8a735d58..2728d67a 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/prior_weighted.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Iterable +from collections.abc import Iterable +from typing import TYPE_CHECKING from typing_extensions import override import numpy as np diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py index 11b592eb..3733b693 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/ucb.py @@ -1,4 +1,6 @@ -from typing import Iterable, Union +from __future__ import annotations + +from collections.abc import Iterable import numpy as np import torch @@ -7,7 +9,7 @@ class 
UpperConfidenceBound(BaseAcquisition): - def __init__(self, beta: float=1.0, maximize: bool=False): + def __init__(self, beta: float = 1.0, maximize: bool = False): """Upper Confidence Bound (UCB) acquisition function. Args: @@ -18,7 +20,7 @@ def __init__(self, beta: float=1.0, maximize: bool=False): super().__init__() self.beta = beta # can be updated as part of the state for dynamism or a schedule self.maximize = maximize - + # to be initialized as part of the state self.surrogate_model = None @@ -26,14 +28,14 @@ def set_state(self, surrogate_model, **kwargs): super().set_state(surrogate_model) self.surrogate_model = surrogate_model if "beta" in kwargs: - if not isinstance(kwargs["beta"], (list, np.array)): + if not isinstance(kwargs["beta"], list | np.array): self.beta = kwargs["beta"] else: self.logger.warning("Beta is a list, not updating beta value!") - + def eval( self, x: Iterable, asscalar: bool = False - ) -> Union[np.ndarray, torch.Tensor, float]: + ) -> np.ndarray | torch.Tensor | float: try: mu, cov = self.surrogate_model.predict(x) std = torch.sqrt(torch.diag(cov)) @@ -41,7 +43,5 @@ def eval( raise e sign = 1 if self.maximize else -1 # LCB is performed if minimize=True ucb_scores = mu + sign * np.sqrt(self.beta) * std - # if LCB, minimize acquisition, or maximize -acquisition - ucb_scores = ucb_scores.detach().numpy() * sign - - return ucb_scores + # if LCB, minimize acquisition, or maximize -acquisition + return ucb_scores.detach().numpy() * sign diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py index fa7ca176..f589298b 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -74,7 +74,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Callable, TypeVar +from collections.abc import Callable +from typing import TYPE_CHECKING, TypeVar from botorch.acquisition import SampleReducingMCAcquisitionFunction from botorch.acquisition.analytic import AcquisitionFunction, t_batch_mode_transform diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py index adf47b82..c0049a3f 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/base_acq_sampler.py @@ -1,11 +1,13 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Sequence, Callable +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING if TYPE_CHECKING: import numpy as np import torch + from neps.search_spaces.search_space import SearchSpace diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/evolution_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/evolution_sampler.py index 6a76dcfc..4aec84eb 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/evolution_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/evolution_sampler.py @@ -1,13 +1,17 @@ +from __future__ import annotations + import random from heapq import nlargest -from typing import List, Tuple +from typing import TYPE_CHECKING import numpy as np -from ....search_spaces.search_space import SearchSpace from 
.base_acq_sampler import AcquisitionSampler from .random_sampler import RandomSampler +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + class EvolutionSampler(AcquisitionSampler): def __init__( @@ -113,7 +117,7 @@ def evolution( acquisition_function, previous_samples: list, population_size: int, - batch_size: int = None, + batch_size: int | None = None, ): def inner_loop(population, fitness, X_max, acq_max): try: @@ -142,7 +146,7 @@ def inner_loop(population, fitness, X_max, acq_max): if not self.allow_isomorphism and self.check_isomorphism_history else [] ) - population: List[SearchSpace] = [] + population: list[SearchSpace] = [] remaining_patience = self.patience while ( population_size - len(previous_samples) > len(population) @@ -186,7 +190,10 @@ def inner_loop(population, fitness, X_max, acq_max): population, fitness, X_max, acq_max ) if all( - all(np.isclose(x, l) for l in list(zip(*iterations_best[-5:]))[j]) + all( + np.isclose(x, l) + for l in list(zip(*iterations_best[-5:], strict=False))[j] + ) for j, x in enumerate(acq_max) ): break @@ -195,8 +202,8 @@ def inner_loop(population, fitness, X_max, acq_max): return X_max, population, acq_max - def sample(self, acquisition_function) -> Tuple[list, list, list]: - population: List[SearchSpace] = [] + def sample(self, acquisition_function) -> tuple[list, list, list]: + population: list[SearchSpace] = [] if self.initial_history_last > 0 and len(self.x) >= self.initial_history_last: population = self.x[-self.initial_history_last :] if self.initial_history_best > 0 and len(self.x) >= self.initial_history_best: diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py index 93c7370f..3021bfe0 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py @@ -1,18 +1,20 @@ from __future__ import annotations -from typing import Callable import warnings +from collections.abc import Callable +from copy import deepcopy +from typing import TYPE_CHECKING import numpy as np import pandas as pd -from copy import deepcopy -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.multi_fidelity.utils import MFObservedData from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( AcquisitionSampler, ) +if TYPE_CHECKING: + from neps.optimizers.multi_fidelity.utils import MFObservedData + from neps.search_spaces.search_space import SearchSpace SAMPLES_TO_DRAW = ( 100 # number of random samples to draw for optimizing acquisition function @@ -131,7 +133,7 @@ def sample( # handles tabular data such that the entire unseen set of configs from the # table is considered to be the new set of candidates _partial_ids = {conf["id"].value for conf in partial_configs} - _all_ids = set(list(self.pipeline_space.custom_grid_table.keys())) + _all_ids = set(self.pipeline_space.custom_grid_table.keys()) # accounting for unseen configs only, samples remaining table if flag is set max_n = len(_all_ids) + 1 if self.sample_full_table else _n @@ -178,9 +180,7 @@ def sample( for config in new_configs: config.update_hp_values({config.fidelity_name: new_fid}) - configs = pd.concat([deepcopy(partial_configs), new_configs]) - - return configs # type: ignore + return pd.concat([deepcopy(partial_configs), new_configs]) def set_state( self, diff --git 
a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py index 81f79b96..51f10bfb 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/mutation_sampler.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Callable, Sequence +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING from typing_extensions import override import numpy as np @@ -119,9 +120,11 @@ def create_pool( n_best = len(self.x) if len(self.x) < self.n_best else self.n_best best_configs = [ - x for (_, x) in - sorted(zip(self.y, self.x), key=lambda pair: pair[0]) - ][:n_best] + x + for (_, x) in sorted( + zip(self.y, self.x, strict=False), key=lambda pair: pair[0] + ) + ][:n_best] seen: set[int] = set() diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/random_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/random_sampler.py index 5d783a3e..d5335731 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/random_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/random_sampler.py @@ -1,6 +1,12 @@ -from ....search_spaces.search_space import SearchSpace +from __future__ import annotations + +from typing import TYPE_CHECKING + from .base_acq_sampler import AcquisitionSampler +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + class RandomSampler(AcquisitionSampler): def __init__(self, pipeline_space: SearchSpace, patience: int = 100): diff --git a/neps/optimizers/bayesian_optimization/kernels/__init__.py b/neps/optimizers/bayesian_optimization/kernels/__init__.py index 3c922b17..7c7018d0 100644 --- a/neps/optimizers/bayesian_optimization/kernels/__init__.py +++ b/neps/optimizers/bayesian_optimization/kernels/__init__.py @@ -1,7 +1,5 @@ - - +from collections.abc import Callable from functools import partial -from typing import Callable from .vectorial_kernels import HammingKernel, Matern32Kernel, Matern52Kernel, RBFKernel from .weisfilerlehman import WeisfilerLehman diff --git a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py index 927e23c2..36add92e 100644 --- a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py +++ b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py @@ -1,9 +1,11 @@ -from neps.utils.common import instance_from_map -from ....search_spaces.architecture.core_graph_grammar import CoreGraphGrammar -from ....search_spaces.hyperparameters.categorical import CategoricalParameter -from ....search_spaces.hyperparameters.float import FloatParameter -from ....search_spaces.hyperparameters.integer import IntegerParameter -from ....utils.common import has_instance +from __future__ import annotations + +from neps.search_spaces.architecture.core_graph_grammar import CoreGraphGrammar +from neps.search_spaces.hyperparameters.categorical import CategoricalParameter +from neps.search_spaces.hyperparameters.float import FloatParameter +from neps.search_spaces.hyperparameters.integer import IntegerParameter +from neps.utils.common import has_instance, instance_from_map + from . 
import GraphKernelMapping, StationaryKernelMapping diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py index 1b0b37d6..12a83a19 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py @@ -1,5 +1,7 @@ """The Edge Histogram kernel as defined in :cite:`sugiyama2015halting`.""" +from __future__ import annotations + from collections import Counter from collections.abc import Iterable from warnings import warn @@ -22,7 +24,7 @@ class EdgeHistogram(VertexHistogram): If 'auto', uses a sparse matrix when the number of zeros is more than the half of the matrix size. In all cases if the dense matrix doesn't fit system memory, I sparse approach will be tried. - Attributes + Attributes: ---------- None. @@ -41,7 +43,7 @@ def parse_input(self, X: Iterable, **kwargs): node_labels and the third edge_labels (that fitting the given graph format). - Returns + Returns: ------- out : np.array, shape=(len(X), n_labels) A np array for frequency (cols) histograms for all Graphs (rows). @@ -49,82 +51,78 @@ def parse_input(self, X: Iterable, **kwargs): """ if not isinstance(X, Iterable): raise TypeError("input must be an iterable\n") - else: - rows, cols, data = list(), list(), list() - if self._method_calling in [1, 2]: - labels = dict() - self._labels = labels - elif self._method_calling == 3: - labels = dict(self._labels) - ni = 0 - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 3]: - if len(x) == 0: - warn("Ignoring empty element on index: " + str(i)) - continue - else: - # Our element is an iterable of at least 2 elements - L = x[2] - elif isinstance(x, Graph): - # get labels in any existing format - L = x.get_labels(purpose="any", label_type="edge") - else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) - - if L is None: - raise ValueError("Invalid graph entry at location " + str(i) + "!") - # construct the data input for the numpy array - for label, frequency in Counter(L.values()).items(): - # for the row that corresponds to that graph - rows.append(ni) - - # and to the value that this label is indexed - col_idx = labels.get(label, None) - if col_idx is None: - # if not indexed, add the new index (the next) - col_idx = len(labels) - labels[label] = col_idx - - # designate the certain column information - cols.append(col_idx) - - # as well as the frequency value to data - data.append(frequency) - ni += 1 + rows, cols, data = [], [], [] + if self._method_calling in [1, 2]: + labels = {} + self._labels = labels + elif self._method_calling == 3: + labels = dict(self._labels) + ni = 0 + for i, x in enumerate(iter(X)): + is_iter = isinstance(x, Iterable) + if is_iter: + x = list(x) + if is_iter and len(x) in [0, 3]: + if len(x) == 0: + warn("Ignoring empty element on index: " + str(i)) + continue + # Our element is an iterable of at least 2 elements + L = x[2] + elif isinstance(x, Graph): + # get labels in any existing format + L = x.get_labels(purpose="any", label_type="edge") + else: + raise TypeError( + "each element of X must be either a " + + "graph object or a list with at least " + + "a graph like object and node labels " + + "dict \n" + ) + if L is None: + raise 
ValueError("Invalid graph entry at location " + str(i) + "!") + # construct the data input for the numpy array + for label, frequency in Counter(L.values()).items(): + # for the row that corresponds to that graph + rows.append(ni) + + # and to the value that this label is indexed + col_idx = labels.get(label, None) + if col_idx is None: + # if not indexed, add the new index (the next) + col_idx = len(labels) + labels[label] = col_idx + + # designate the certain column information + cols.append(col_idx) + + # as well as the frequency value to data + data.append(frequency) + ni += 1 + + # Initialise the feature matrix + if self._method_calling in [1, 2]: + if self.sparse == "auto": + self.sparse_ = len(cols) / float(ni * len(labels)) <= 0.5 + else: + self.sparse_ = bool(self.sparse) + + if self.sparse_: + features = csr_matrix( + (data, (rows, cols)), shape=(ni, len(labels)), copy=False + ) + else: # Initialise the feature matrix - if self._method_calling in [1, 2]: - if self.sparse == "auto": - self.sparse_ = len(cols) / float(ni * len(labels)) <= 0.5 - else: - self.sparse_ = bool(self.sparse) - - if self.sparse_: - features = csr_matrix( - (data, (rows, cols)), shape=(ni, len(labels)), copy=False + try: + features = zeros(shape=(ni, len(labels))) + features[rows, cols] = data + except MemoryError: + warn("memory-error: switching to sparse") + self.sparse_, features = ( + True, + csr_matrix((data, (rows, cols)), shape=(ni, len(labels)), copy=False), ) - else: - # Initialise the feature matrix - try: - features = zeros(shape=(ni, len(labels))) - features[rows, cols] = data - except MemoryError: - warn("memory-error: switching to sparse") - self.sparse_, features = ( - True, - csr_matrix( - (data, (rows, cols)), shape=(ni, len(labels)), copy=False - ), - ) - - if ni == 0: - raise ValueError("parsed input is empty") - return features + + if ni == 0: + raise ValueError("parsed input is empty") + return features diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py index b34b2c79..e0ad94f3 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py @@ -1,11 +1,12 @@ +from __future__ import annotations + import torch def calculate_kernel_matrix_as_tensor( X, Y=None, oa=False, se_kernel=None, normalize=True ) -> torch.Tensor: - """ - Same as calculate kernel matrix, but in pytorch framework and uses autodiff to compute the gradient of + """Same as calculate kernel matrix, but in pytorch framework and uses autodiff to compute the gradient of the kernel function with respect to the feature vector. This function is taken out of the class to facilitate derivative computation. @@ -26,18 +27,14 @@ def calculate_kernel_matrix_as_tensor( normalize: bool: Whether to normalize the GP covariance matrix to the range of [0, 1]. Default is True. - Returns + Returns: ------- K: pytorch tensor, shape = [n_targets, n_inputs] dK_dY: pytorch tensor, of the same shape of K. The derivative of the value of the kernel function with respect to each of the X. If Y is None, the derivative is instead taken at the *training point* (i.e. X). 
""" - if Y is None: - if se_kernel is not None: - K = se_kernel.forward(X, X) - else: - K = X @ X.t() + K = se_kernel.forward(X, X) if se_kernel is not None else X @ X.t() if normalize: K_diag = torch.sqrt(torch.diag(K)) K_diag_outer = torch.ger(K_diag, K_diag) @@ -46,10 +43,7 @@ def calculate_kernel_matrix_as_tensor( assert Y.shape[1] == X.shape[1], ( "got Y shape " + str(Y.shape[1]) + " but X shape " + str(X.shape[1]) ) - if se_kernel is not None: - K = se_kernel.forward(X, Y) - else: - K = Y @ X.t() + K = se_kernel.forward(X, Y) if se_kernel is not None else Y @ X.t() if normalize: Kxx = calculate_kernel_matrix_as_tensor( X, X, oa=oa, se_kernel=se_kernel, normalize=False diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py index 4a4dfc79..a3a31bdf 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py @@ -108,7 +108,7 @@ def initialize(self): self._initialized["n_jobs"] = True if not self._initialized["sparse"]: if self.sparse not in ["auto", False, True]: - TypeError("sparse could be False, True or auto") + raise TypeError("sparse could be False, True or auto") self._initialized["sparse"] = True def parse_input(self, X, label_start_idx=0, label_end_idx=None): @@ -144,100 +144,98 @@ def parse_input(self, X, label_start_idx=0, label_end_idx=None): if not isinstance(X, Iterable): raise TypeError("input must be an iterable\n") - else: - rows, cols, data = [], [], [] - if self._method_calling in [0, 1, 2]: - labels = {} - self._labels = labels - elif self._method_calling == 3: - labels = dict(self._labels) - ni = 0 - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warn("Ignoring empty element on index: " + str(i)) - continue - else: - # Our element is an iterable of at least 2 elements - L = x[1] - elif isinstance(x, Graph): - # get labels in any existing format - L = x.get_labels(purpose="any") - else: - raise TypeError( - "each element of X must be either a " - "graph object or a list with at least " - "a graph like object and node labels " - "dict \n" - ) - - # construct the data input for the numpy array - for label, frequency in Counter(L.values()).items(): - # for the row that corresponds to that graph - rows.append(ni) - - # and to the value that this label is indexed - if self.require_ordered_features: - try: - col_idx = int(label) - label_start_idx # Offset - except ValueError: - logging.error( - "Failed to convert label to a valid integer. Check whether all labels are" - "numeric, and whether you called this kernel directly instead of from the" - "Weisfiler-Lehman kernel. Falling back to the default unordered feature" - "matrix." 
- ) - self.require_ordered_features = False - if not self.require_ordered_features: - col_idx = labels.get(label, None) - if col_idx is None: - # if not indexed, add the new index (the next) - col_idx = len(labels) - labels[label] = col_idx - - # designate the certain column information - cols.append(col_idx) - - # as well as the frequency value to data - data.append(frequency) - ni += 1 - - if self.require_ordered_features: - label_length = max(label_end_idx - label_start_idx, *cols) + 1 + rows, cols, data = [], [], [] + if self._method_calling in [0, 1, 2]: + labels = {} + self._labels = labels + elif self._method_calling == 3: + labels = dict(self._labels) + ni = 0 + for i, x in enumerate(iter(X)): + is_iter = isinstance(x, Iterable) + if is_iter: + x = list(x) + if is_iter and len(x) in [0, 2, 3]: + if len(x) == 0: + warn("Ignoring empty element on index: " + str(i)) + continue + # Our element is an iterable of at least 2 elements + L = x[1] + elif isinstance(x, Graph): + # get labels in any existing format + L = x.get_labels(purpose="any") else: - label_length = len(labels) + raise TypeError( + "each element of X must be either a " + "graph object or a list with at least " + "a graph like object and node labels " + "dict \n" + ) - if self._method_calling in [0, 1, 2]: - if self.sparse == "auto": - self.sparse_ = len(cols) / float(ni * label_length) <= 0.5 - else: - self.sparse_ = bool(self.sparse) + # construct the data input for the numpy array + for label, frequency in Counter(L.values()).items(): + # for the row that corresponds to that graph + rows.append(ni) + + # and to the value that this label is indexed + if self.require_ordered_features: + try: + col_idx = int(label) - label_start_idx # Offset + except ValueError: + logging.error( + "Failed to convert label to a valid integer. Check whether all labels are" + "numeric, and whether you called this kernel directly instead of from the" + "Weisfiler-Lehman kernel. Falling back to the default unordered feature" + "matrix." 
+ ) + self.require_ordered_features = False + if not self.require_ordered_features: + col_idx = labels.get(label, None) + if col_idx is None: + # if not indexed, add the new index (the next) + col_idx = len(labels) + labels[label] = col_idx + + # designate the certain column information + cols.append(col_idx) + + # as well as the frequency value to data + data.append(frequency) + ni += 1 - if self.sparse_: - features = csr_matrix( - (data, (rows, cols)), shape=(ni, label_length), copy=False - ) + if self.require_ordered_features: + label_length = max(label_end_idx - label_start_idx, *cols) + 1 + else: + label_length = len(labels) + + if self._method_calling in [0, 1, 2]: + if self.sparse == "auto": + self.sparse_ = len(cols) / float(ni * label_length) <= 0.5 else: - # Initialise the feature matrix - try: - features = zeros(shape=(ni, label_length)) - features[rows, cols] = data - - except MemoryError: - warn("memory-error: switching to sparse") - self.sparse_, features = ( - True, - csr_matrix( - (data, (rows, cols)), shape=(ni, label_length), copy=False - ), - ) - - if ni == 0: - raise ValueError("parsed input is empty") - return features + self.sparse_ = bool(self.sparse) + + if self.sparse_: + features = csr_matrix( + (data, (rows, cols)), shape=(ni, label_length), copy=False + ) + else: + # Initialise the feature matrix + try: + features = zeros(shape=(ni, label_length)) + features[rows, cols] = data + + except MemoryError: + warn("memory-error: switching to sparse") + self.sparse_, features = ( + True, + csr_matrix( + (data, (rows, cols)), shape=(ni, label_length), copy=False + ), + ) + + if ni == 0: + raise ValueError("parsed input is empty") + return features def _calculate_kernel_matrix(self, Y=None): """Calculate the kernel matrix given a target_graph and a kernel. @@ -282,8 +280,7 @@ def _calculate_kernel_matrix(self, Y=None): if self.sparse_: return K.toarray() - else: - return K + return K def diagonal(self, use_tensor=False): """Calculate the kernel matrix diagonal of the fitted data. 
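The feature matrices in these histogram kernels are assembled from the (data, (rows, cols)) triplet form accepted by scipy's csr_matrix, as in the hunks above. A tiny sketch with two hypothetical graphs and two distinct labels shows what the three lists encode.

    from scipy.sparse import csr_matrix

    rows = [0, 0, 1]  # which graph (row) each count belongs to
    cols = [0, 1, 0]  # which label column the count falls into
    data = [3, 1, 2]  # how often that label occurs in that graph
    features = csr_matrix((data, (rows, cols)), shape=(2, 2))
    print(features.toarray())
    # [[3 1]
    #  [2 0]]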
@@ -319,12 +316,11 @@ def diagonal(self, use_tensor=False): if use_tensor: Y_diag = torch.einsum("ij, ij->i", [self.Y_tensor, self.Y_tensor]) return self._X_diag, Y_diag + if self.sparse_: + Y_diag = squeeze(array(self._Y.multiply(self._Y).sum(axis=1))) else: - if self.sparse_: - Y_diag = squeeze(array(self._Y.multiply(self._Y).sum(axis=1))) - else: - Y_diag = einsum("ij,ij->i", self._Y, self._Y) - return self._X_diag, Y_diag + Y_diag = einsum("ij,ij->i", self._Y, self._Y) + return self._X_diag, Y_diag except NotFittedError: return self._X_diag @@ -360,8 +356,7 @@ def transform(self, X, return_embedding_only=False, **kwargs): # Input validation and parsing if X is None: raise ValueError("`transform` input cannot be None") - else: - Y = self.parse_input(X, **kwargs) + Y = self.parse_input(X, **kwargs) if return_embedding_only: return Y @@ -446,8 +441,7 @@ def fit(self, X, y=None, **kwargs): # Input validation and parsing if X is None: raise ValueError("`fit` input cannot be None") - else: - self.X = self.parse_input(X, **kwargs) + self.X = self.parse_input(X, **kwargs) # Return the transformer return self diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py index 8c4baf64..f62d0ca0 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py @@ -8,8 +8,8 @@ import warnings from ast import literal_eval from collections import OrderedDict +from collections.abc import Iterable from copy import deepcopy -from typing import Iterable import numpy as np import torch @@ -209,7 +209,7 @@ def parse_input( raise ValueError( "method call must be called either from fit " + "or fit-transform" ) - elif hasattr(self, "_X_diag"): + if hasattr(self, "_X_diag"): # Clean _X_diag value delattr(self, "_X_diag") @@ -221,57 +221,56 @@ def parse_input( # Input validation and parsing if not isinstance(X, collections.abc.Iterable): raise TypeError("input must be an iterable\n") - else: - nx = 0 - Gs_ed, L, distinct_values, extras = {}, {}, set(), {} - for idx, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and (len(x) == 0 or len(x) >= 2): - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(idx)) - continue - elif len(x) > 2: - extra = () - if len(x) > 3: - extra = tuple(x[3:]) - x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) - extra = ( - x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, - ), - *extra, - ) - else: - x = Graph(x[0], x[1], {}, graph_format=self._graph_format) - extra = () - - elif isinstance(x, Graph): - x.desired_format(self._graph_format) - el = x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, + nx = 0 + Gs_ed, L, distinct_values, extras = {}, {}, set(), {} + for idx, x in enumerate(iter(X)): + is_iter = isinstance(x, collections.abc.Iterable) + if is_iter: + x = list(x) + if is_iter and (len(x) == 0 or len(x) >= 2): + if len(x) == 0: + warnings.warn("Ignoring empty element on index: " + str(idx)) + continue + if len(x) > 2: + extra = () + if len(x) > 3: + extra = tuple(x[3:]) + x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) + extra = ( + x.get_labels( + purpose=self._graph_format, + label_type="edge", + return_none=True, + ), + *extra, ) - extra = () if el is None 
else (el,) - else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") - extras[nx] = extra - distinct_values |= set(L[nx].values()) - nx += 1 - if nx == 0: - raise ValueError("parsed input is empty") + x = Graph(x[0], x[1], {}, graph_format=self._graph_format) + extra = () + + elif isinstance(x, Graph): + x.desired_format(self._graph_format) + el = x.get_labels( + purpose=self._graph_format, + label_type="edge", + return_none=True, + ) + extra = () if el is None else (el,) + + else: + raise TypeError( + "each element of X must be either a " + + "graph object or a list with at least " + + "a graph like object and node labels " + + "dict \n" + ) + Gs_ed[nx] = x.get_edge_dictionary() + L[nx] = x.get_labels(purpose="dictionary") + extras[nx] = extra + distinct_values |= set(L[nx].values()) + nx += 1 + if nx == 0: + raise ValueError("parsed input is empty") # Save the number of "fitted" graphs. self._nx = nx @@ -411,9 +410,9 @@ def generate_graphs(label_count: int, WL_labels_inverse): if return_embedding_only: return K - elif self._method_calling == 1: + if self._method_calling == 1: return base_graph_kernel - elif self._method_calling == 2: + if self._method_calling == 2: if self.as_tensor: K = torch.stack(K, dim=0).sum(dim=0) return K, base_graph_kernel @@ -451,8 +450,7 @@ def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: di ] # Flush the feature dimensions if X is None: raise ValueError("transform input cannot be None") - else: - km, self.X = self.parse_input(X, gp_fit=gp_fit) + km, self.X = self.parse_input(X, gp_fit=gp_fit) return km @@ -487,40 +485,37 @@ def transform(self, X: Iterable, return_embedding_only: bool = True): # Input validation and parsing if X is None: raise ValueError("transform input cannot be None") - elif not isinstance(X, collections.abc.Iterable): + if not isinstance(X, collections.abc.Iterable): raise ValueError("input must be an iterable\n") - else: - nx = 0 - distinct_values = set() - Gs_ed, L = {}, {} - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(i)) - continue - - elif len(x) in [2, 3]: - x = Graph(x[0], x[1], {}, self._graph_format) - elif isinstance(x, Graph): - x.desired_format("dictionary") - else: - raise ValueError( - "each element of X must have at " - + "least one and at most 3 elements\n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") + nx = 0 + distinct_values = set() + Gs_ed, L = {}, {} + for i, x in enumerate(iter(X)): + is_iter = isinstance(x, collections.abc.Iterable) + if is_iter: + x = list(x) + if is_iter and len(x) in [0, 2, 3]: + if len(x) == 0: + warnings.warn("Ignoring empty element on index: " + str(i)) + continue + + if len(x) in [2, 3]: + x = Graph(x[0], x[1], {}, self._graph_format) + elif isinstance(x, Graph): + x.desired_format("dictionary") + else: + raise ValueError( + "each element of X must have at " + + "least one and at most 3 elements\n" + ) + Gs_ed[nx] = x.get_edge_dictionary() + L[nx] = x.get_labels(purpose="dictionary") - # Hold all the distinct values - distinct_values |= { - v for v in L[nx].values() if v not in self._inv_labels[0] - } - nx += 1 - if nx == 0: - raise ValueError("parsed 
input is empty") + # Hold all the distinct values + distinct_values |= {v for v in L[nx].values() if v not in self._inv_labels[0]} + nx += 1 + if nx == 0: + raise ValueError("parsed input is empty") nl = len(self._inv_labels[0]) WL_labels_inverse = { @@ -690,8 +685,7 @@ def diagonal(self): Y_diag = torch.tensor(Y_diag) if self._is_transformed: return self._X_diag, Y_diag - else: - return self._X_diag + return self._X_diag @staticmethod def translate_label(curr_layer: dict, h: int, prev_layer: dict | None = None): @@ -706,17 +700,16 @@ def translate_label(curr_layer: dict, h: int, prev_layer: dict | None = None): """ if h == 0: return {v: str(k) for k, v in curr_layer.items()}, curr_layer - else: - assert prev_layer is not None - label_in_node_attr, inv_label_in_node_attr = OrderedDict(), OrderedDict() - for pattern, encoding in curr_layer.items(): - # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer - root, leaf = literal_eval(pattern) - root_ = prev_layer[root] - leaf_ = [prev_layer[i] for i in leaf] - label_in_node_attr.update({encoding: "~".join([root_, *leaf_])}) - inv_label_in_node_attr.update({"~".join([root_, *leaf_]): encoding}) - return label_in_node_attr, inv_label_in_node_attr + assert prev_layer is not None + label_in_node_attr, inv_label_in_node_attr = OrderedDict(), OrderedDict() + for pattern, encoding in curr_layer.items(): + # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer + root, leaf = literal_eval(pattern) + root_ = prev_layer[root] + leaf_ = [prev_layer[i] for i in leaf] + label_in_node_attr.update({encoding: "~".join([root_, *leaf_])}) + inv_label_in_node_attr.update({"~".join([root_, *leaf_]): encoding}) + return label_in_node_attr, inv_label_in_node_attr @staticmethod def _compute_feature_weight( diff --git a/neps/optimizers/bayesian_optimization/kernels/utils.py b/neps/optimizers/bayesian_optimization/kernels/utils.py index e134bfd0..6d94a25d 100644 --- a/neps/optimizers/bayesian_optimization/kernels/utils.py +++ b/neps/optimizers/bayesian_optimization/kernels/utils.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING import networkx as nx import numpy as np @@ -22,8 +22,8 @@ def transform_to_undirected(gr: list): return undirected_gr -def extract_configs(configs: list[SearchSpace]) -> Tuple[list, list]: - """Extracts graph & HPs from configs objects +def extract_configs(configs: list[SearchSpace]) -> tuple[list, list]: + """Extracts graph & HPs from configs objects. 
Args: configs (list): Object holding graph and/or HPs @@ -53,10 +53,7 @@ def extract_configs(configs: list[SearchSpace]) -> Tuple[list, list]: def graph_metrics(graph, metric=None, directed=True): - if directed: - G = graph - else: - G = graph.to_undirected() + G = graph if directed else graph.to_undirected() # global metrics if metric == "avg_path_length": @@ -75,14 +72,14 @@ def graph_metrics(graph, metric=None, directed=True): def extract_configs_hierarchy( configs: list, d_graph_features: int, hierarchy_consider=None -) -> Tuple[list, list]: +) -> tuple[list, list]: """Extracts graph & graph features from configs objects Args: configs (list): Object holding graph and/or graph features d_graph_features (int): Number of global graph features used; if d_graph_features=0, indicate not using global graph features hierarchy_consider (list or None): Specify graphs at which earlier hierarchical levels to be considered Returns: - Tuple[list, list]: list of graphs, list of HPs + Tuple[list, list]: list of graphs, list of HPs. """ N = len(configs) @@ -114,7 +111,8 @@ def extract_configs_hierarchy( for hierarchy_id in hierarchy_consider ] for g in combined_graphs - ] + ], + strict=False, ), ) ) diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 3831ec61..95b02ba0 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -1,9 +1,9 @@ from __future__ import annotations -from typing import Any from pathlib import Path -import torch +from typing import Any +import torch from ifbo import FTPFN @@ -136,23 +136,21 @@ def get_lcb( self, test_x: torch.Tensor, beta: float = (1 - 0.682) / 2 ) -> torch.Tensor: logits = self._get_logits(test_x) - lcb = self.ftpfn.model.criterion.ucb( + return self.ftpfn.model.criterion.ucb( logits=logits, best_f=None, rest_prob=beta, maximize=False, # IMPORTANT to be False, should calculate the LCB using the lower-bound ICDF as per beta ) - return lcb @torch.no_grad() def get_ucb( self, test_x: torch.Tensor, beta: float = (1 - 0.682) / 2 ) -> torch.Tensor: logits = self._get_logits(test_x) - lcb = self.ftpfn.model.criterion.ucb( + return self.ftpfn.model.criterion.ucb( logits=logits, best_f=None, rest_prob=beta, maximize=True, # IMPORTANT to be True, should calculate the UCB using the upper-bound ICDF as per beta ) - return lcb diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 8436310c..2ab0b897 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -4,8 +4,9 @@ import logging import math +from collections.abc import Mapping from functools import reduce -from typing import TYPE_CHECKING, Any, Mapping, TypeVar +from typing import TYPE_CHECKING, Any, TypeVar import gpytorch import gpytorch.constraints @@ -306,7 +307,10 @@ def optimize_acq( col, choice_indices = next(iter(cats.items())) fixed_cats = [{col: i} for i in choice_indices] else: - fixed_cats = [dict(zip(cats.keys(), combo)) for combo in product(*cats.values())] + fixed_cats = [ + dict(zip(cats.keys(), combo, strict=False)) + for combo in product(*cats.values()) + ] # TODO: we should deterministically shuffle the fixed_categoricals # as the underlying function does not. 
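The fixed_cats expansion above enumerates every joint assignment of the categorical columns so that optimize_acqf_mixed can optimise the continuous dimensions for each one. A sketch with two hypothetical categorical columns (encoded at indices 3 and 5) shows the shape of that list.

    from itertools import product

    cats = {3: [0, 1], 5: [0, 1, 2]}  # encoded column index -> allowed integer codes
    fixed_cats = [
        dict(zip(cats.keys(), combo, strict=False)) for combo in product(*cats.values())
    ]
    print(fixed_cats[:3])   # [{3: 0, 5: 0}, {3: 0, 5: 1}, {3: 0, 5: 2}]
    print(len(fixed_cats))  # 6 = 2 * 3 dictionaries passed as fixed_features_list

The multiplicative growth of these combinations is why optimize_acq guards the mixed path with maximum_allowed_categorical_combinations.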
diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index b5d518bc..6fe20655 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -1,7 +1,8 @@ from __future__ import annotations import math -from typing import TYPE_CHECKING, Any, Literal, Mapping +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, Literal import torch from botorch.acquisition import LinearMCObjective diff --git a/neps/optimizers/grid_search/optimizer.py b/neps/optimizers/grid_search/optimizer.py index e9f1d9a3..f548b197 100644 --- a/neps/optimizers/grid_search/optimizer.py +++ b/neps/optimizers/grid_search/optimizer.py @@ -1,12 +1,16 @@ +from __future__ import annotations + import random -from typing import Any +from typing import TYPE_CHECKING, Any from typing_extensions import override -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class GridSearch(BaseOptimizer): def __init__( diff --git a/neps/optimizers/info.py b/neps/optimizers/info.py index 40b08174..d3dfffd8 100644 --- a/neps/optimizers/info.py +++ b/neps/optimizers/info.py @@ -1,18 +1,18 @@ +from __future__ import annotations + import os import yaml class SearcherConfigs: - """ - This class provides methods to access default configuration details + """This class provides methods to access default configuration details for NePS optimizers. """ @staticmethod def _get_searchers_folder_path() -> str: - """ - Helper method to get the folder path for default searchers. + """Helper method to get the folder path for default searchers. Returns: str: The absolute path to the default searchers folder. @@ -22,8 +22,7 @@ def _get_searchers_folder_path() -> str: @staticmethod def get_searchers() -> list[str]: - """ - List all the searcher names that can be used in neps run. + """List all the searcher names that can be used in neps run. Returns: list[str]: A list of searcher names. @@ -40,8 +39,7 @@ def get_searchers() -> list[str]: @staticmethod def get_available_algorithms() -> list[str]: - """ - List all available algorithms used by NePS searchers. + """List all available algorithms used by NePS searchers. Returns: list[str]: A list of algorithm names. @@ -62,8 +60,7 @@ def get_available_algorithms() -> list[str]: @staticmethod def get_searcher_from_algorithm(algorithm: str) -> list[str]: - """ - Get all NePS searchers that use a specific searching algorithm. + """Get all NePS searchers that use a specific searching algorithm. Args: algorithm (str): The name of the algorithm needed for the search. @@ -86,8 +83,7 @@ def get_searcher_from_algorithm(algorithm: str) -> list[str]: @staticmethod def get_searcher_kwargs(searcher: str) -> str: - """ - Get the kwargs and algorithm setup for a specific searcher. + """Get the kwargs and algorithm setup for a specific searcher. Args: searcher (str): The name of the searcher to check the details of. 
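The import reshuffling repeated across these files follows one pattern: `from __future__ import annotations`, explicit `X | None = None` defaults, and type-only imports guarded by `TYPE_CHECKING`. A small illustrative sketch (the `describe` helper is made up) of what the guard buys at runtime:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers; importing this module no longer pulls in
    # the search-space machinery at runtime.
    from neps.search_spaces.search_space import SearchSpace


def describe(space: SearchSpace, budget: int | None = None) -> str:
    """Hypothetical helper: with postponed annotations the guarded import above
    is enough for static checking and costs nothing at import time."""
    return f"{type(space).__name__} with budget={budget}"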
diff --git a/neps/optimizers/multi_fidelity/hyperband.py b/neps/optimizers/multi_fidelity/hyperband.py index 510fb582..f6c445ac 100644 --- a/neps/optimizers/multi_fidelity/hyperband.py +++ b/neps/optimizers/multi_fidelity/hyperband.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import typing from copy import deepcopy from typing import Any, Literal @@ -5,15 +7,6 @@ import numpy as np -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.multi_fidelity.mf_bo import MFBOBase from neps.optimizers.multi_fidelity.promotion_policy import ( AsyncPromotionPolicy, @@ -31,6 +24,17 @@ SuccessiveHalvingBase, ) +if typing.TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class HyperbandBase(SuccessiveHalvingBase): """Implements a Hyperband procedure with a sampling and promotion policy.""" @@ -50,29 +54,29 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, ): - args = dict( - pipeline_space=pipeline_space, - budget=budget, - eta=eta, - early_stopping_rate=self.early_stopping_rate, # HB subsumes this param of SH - initial_design_type=initial_design_type, - use_priors=use_priors, - sampling_policy=sampling_policy, - promotion_policy=promotion_policy, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - prior_confidence=prior_confidence, - random_interleave_prob=random_interleave_prob, - sample_default_first=sample_default_first, - sample_default_at_target=sample_default_at_target, - ) + args = { + "pipeline_space": pipeline_space, + "budget": budget, + "eta": eta, + "early_stopping_rate": self.early_stopping_rate, # HB subsumes this param of SH + "initial_design_type": initial_design_type, + "use_priors": use_priors, + "sampling_policy": sampling_policy, + "promotion_policy": promotion_policy, + "loss_value_on_error": loss_value_on_error, + "cost_value_on_error": cost_value_on_error, + "ignore_errors": ignore_errors, + "logger": logger, + "prior_confidence": prior_confidence, + "random_interleave_prob": random_interleave_prob, + "sample_default_first": sample_default_first, + "sample_default_at_target": sample_default_at_target, + } super().__init__(**args) # stores the flattened sequence of SH brackets to loop over - the HB heuristic # for (n,r) pairing, i.e., (num. 
configs, fidelity) @@ -120,7 +124,6 @@ def _handle_promotions(self): # promotions are handled by the individual SH brackets which are explicitly # called in the _update_sh_bracket_state() function # overloaded function disables the need for retrieving promotions for HB overall - return @override def load_optimization_state( @@ -134,7 +137,7 @@ def load_optimization_state( previous_results=previous_results, pending_evaluations=pending_evaluations, budget_info=budget_info, - optimizer_state=optimizer_state + optimizer_state=optimizer_state, ) # important for the global HB to run the right SH self._update_sh_bracket_state() @@ -340,28 +343,28 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, ): - args = dict( - pipeline_space=pipeline_space, - budget=budget, - eta=eta, - initial_design_type=initial_design_type, - use_priors=use_priors, - sampling_policy=sampling_policy, - promotion_policy=promotion_policy, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - prior_confidence=prior_confidence, - random_interleave_prob=random_interleave_prob, - sample_default_first=sample_default_first, - sample_default_at_target=sample_default_at_target, - ) + args = { + "pipeline_space": pipeline_space, + "budget": budget, + "eta": eta, + "initial_design_type": initial_design_type, + "use_priors": use_priors, + "sampling_policy": sampling_policy, + "promotion_policy": promotion_policy, + "loss_value_on_error": loss_value_on_error, + "cost_value_on_error": cost_value_on_error, + "ignore_errors": ignore_errors, + "logger": logger, + "prior_confidence": prior_confidence, + "random_interleave_prob": random_interleave_prob, + "sample_default_first": sample_default_first, + "sample_default_at_target": sample_default_at_target, + } super().__init__(**args) # overwrite parent class SH brackets with Async SH brackets self.sh_brackets = {} @@ -402,8 +405,7 @@ def _get_bracket_to_run(self): self.eta ** (K - s) * (K + 1) / (K - s + 1) for s in range(self.max_rung + 1) ] bracket_probs = np.array(bracket_probs) / sum(bracket_probs) - bracket_next = np.random.choice(range(self.max_rung + 1), p=bracket_probs) - return bracket_next + return np.random.choice(range(self.max_rung + 1), p=bracket_probs) def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: """...and this is the method that decides which point to query. 
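The `_get_bracket_to_run` hunk above only changes the return to hand back the sampled bracket directly; the weighting itself is untouched. A quick worked example of those probabilities, with illustrative eta = 3 and max_rung = 3:

import numpy as np

eta, max_rung = 3, 3  # illustrative values
K = max_rung
bracket_probs = [eta ** (K - s) * (K + 1) / (K - s + 1) for s in range(max_rung + 1)]
bracket_probs = np.array(bracket_probs) / sum(bracket_probs)
print(bracket_probs.round(3))  # [0.551 0.245 0.122 0.082] -> lower brackets are favoured
bracket = np.random.choice(range(max_rung + 1), p=bracket_probs)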
@@ -477,50 +479,50 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, # new arguments for model model_policy: typing.Any = ModelPolicy, surrogate_model: str | Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", ): - hb_args = dict( - pipeline_space=pipeline_space, - budget=budget, - eta=eta, - initial_design_type=initial_design_type, - use_priors=use_priors, - sampling_policy=sampling_policy, - promotion_policy=promotion_policy, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - prior_confidence=prior_confidence, - random_interleave_prob=random_interleave_prob, - sample_default_first=sample_default_first, - sample_default_at_target=sample_default_at_target, - ) + hb_args = { + "pipeline_space": pipeline_space, + "budget": budget, + "eta": eta, + "initial_design_type": initial_design_type, + "use_priors": use_priors, + "sampling_policy": sampling_policy, + "promotion_policy": promotion_policy, + "loss_value_on_error": loss_value_on_error, + "cost_value_on_error": cost_value_on_error, + "ignore_errors": ignore_errors, + "logger": logger, + "prior_confidence": prior_confidence, + "random_interleave_prob": random_interleave_prob, + "sample_default_first": sample_default_first, + "sample_default_at_target": sample_default_at_target, + } super().__init__(**hb_args) self.pipeline_space.has_prior = self.use_priors - bo_args = dict( - surrogate_model=surrogate_model, - domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - surrogate_model_args=surrogate_model_args, - acquisition=acquisition, - log_prior_weighted=log_prior_weighted, - acquisition_sampler=acquisition_sampler, - ) + bo_args = { + "surrogate_model": surrogate_model, + "domain_se_kernel": domain_se_kernel, + "hp_kernels": hp_kernels, + "surrogate_model_args": surrogate_model_args, + "acquisition": acquisition, + "log_prior_weighted": log_prior_weighted, + "acquisition_sampler": acquisition_sampler, + } # counting non-fidelity dimensions in search space ndims = sum( 1 @@ -535,4 +537,5 @@ def __init__( sh.model_policy = self.model_policy sh.sample_new_config = self.sample_new_config + # TODO: TrulyAsyncHyperband diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index bf2bbc83..c1a87862 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,27 +1,31 @@ -from typing import Any +from __future__ import annotations + +import warnings +from typing import TYPE_CHECKING, Any from typing_extensions import override import numpy as np import pandas as pd -import warnings -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult -from neps.utils.common import instance_from_map -from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer from 
neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.multi_fidelity.mf_bo import FreezeThawModel, PFNSurrogate from neps.optimizers.multi_fidelity.utils import MFObservedData +from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace +from neps.utils.common import instance_from_map + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult class IFBO(BaseOptimizer): @@ -32,7 +36,7 @@ class IFBO(BaseOptimizer): def __init__( self, pipeline_space: SearchSpace, - budget: int = None, + budget: int | None = None, step_size: int | float = 1, optimal_assignment: bool = False, # pylint: disable=unused-argument use_priors: bool = False, @@ -45,18 +49,18 @@ def __init__( logger=None, # arguments for model surrogate_model: str | Any = "ftpfn", - surrogate_model_args: dict = None, - domain_se_kernel: str = None, - graph_kernels: list = None, - hp_kernels: list = None, + surrogate_model_args: dict | None = None, + domain_se_kernel: str | None = None, + graph_kernels: list | None = None, + hp_kernels: list | None = None, acquisition: str | BaseAcquisition = acquisition, - acquisition_args: dict = None, + acquisition_args: dict | None = None, acquisition_sampler: str | AcquisitionSampler = "freeze-thaw", - acquisition_sampler_args: dict = None, + acquisition_sampler_args: dict | None = None, model_policy: Any = PFNSurrogate, initial_design_size: int = 1, ): - """Initialise + """Initialise. Args: pipeline_space: Space in which to search @@ -187,18 +191,17 @@ def _adjust_fidelity_for_freeze_thaw_steps( f"Adjusted fidelity lower bound to {pipeline_space.fidelity.lower} " f"for equal-sized steps of {step_size}." 
) - print("New fidelity: ", pipeline_space.fidelity) return pipeline_space def _prep_model_args(self, hp_kernels, graph_kernels, pipeline_space): if self.surrogate_model_name in ["gp", "gp_hierarchy"]: # setup for GP implemented in NePS self.surrogate_model_args.update( - dict( + { # domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - graph_kernels=graph_kernels, - ) + "hp_kernels": hp_kernels, + "graph_kernels": graph_kernels, + } ) if not self.surrogate_model_args["hp_kernels"]: raise ValueError("No kernels are provided!") @@ -246,16 +249,10 @@ def total_budget_spent(self) -> int | float: n_configs = len(self.observed_configs.seen_config_ids) total_budget_level = sum(self.observed_configs.seen_budget_levels) total_initial_budget_spent = n_configs * self.pipeline_space.fidelity.lower - total_budget_spent = ( - total_initial_budget_spent + total_budget_level * self.step_size - ) - - return total_budget_spent + return total_initial_budget_spent + total_budget_level * self.step_size def is_init_phase(self) -> bool: - if self.num_train_configs < self._initial_design_size: - return True - return False + return self.num_train_configs < self._initial_design_size @property def num_train_configs(self): @@ -287,8 +284,8 @@ def load_optimization_state( self._handle_pending_evaluations(pending_evaluations) # an aesthetic choice more than a functional choice - self.observed_configs.df.sort_index( - level=self.observed_configs.df.index.names, inplace=True + self.observed_configs.df = self.observed_configs.df.sort_index( + level=self.observed_configs.df.index.names ) # TODO: can we do better than keeping a copy of the observed configs? # TODO: can we not hide this in load_results and have something that pops out @@ -322,7 +319,7 @@ def index_data_split(config_id: str, config_val): tuple(index_data_split(config_id, config_val)) for config_id, config_val in previous_results.items() ] - indices, rows = zip(*index_row) + indices, rows = zip(*index_row, strict=False) self.observed_configs.add_data(data=list(rows), index=list(indices)) def _handle_pending_evaluations(self, pending_evaluations): diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index ef31f9cc..790c833a 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -1,15 +1,20 @@ # type: ignore - +from __future__ import annotations from copy import deepcopy + import torch -from neps.utils.common import instance_from_map from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.multi_fidelity.utils import ( - get_tokenized_data, get_training_data_for_freeze_thaw + get_tokenized_data, + get_training_data_for_freeze_thaw, ) -from neps.optimizers.multi_fidelity_prior.utils import calc_total_resources_spent, update_fidelity +from neps.optimizers.multi_fidelity_prior.utils import ( + calc_total_resources_spent, + update_fidelity, +) +from neps.utils.common import instance_from_map class MFBOBase: @@ -20,7 +25,6 @@ class MFBOBase: def _fit_models(self): """Performs necessary procedures to build and use models.""" - if not self.model_based: # do nothing here if the algorithm has model-based search disabled return @@ -139,7 +143,7 @@ def is_init_phase(self) -> bool: def sample_new_config( self, - rung: int = None, + rung: int | None = None, **kwargs, # pylint: disable=unused-argument ): """Samples configuration from policies or random.""" @@ -186,7 +190,7 @@ def __init__( self, pipeline_space, surrogate_model: str = 
"ftpfn", - surrogate_model_args: dict = None, + surrogate_model_args: dict | None = None, step_size: int = 1, ): self.observed_configs = None @@ -228,7 +232,7 @@ def set_state( name="surrogate model", kwargs=self.surrogate_model_args, ) - + class PFNSurrogate(FreezeThawModel): """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" @@ -246,18 +250,20 @@ def update_model(self): self.observed_configs.perf_col, self.pipeline_space, step_size=self.step_size, - maximize=True # inverts performance since NePS minimizes + maximize=True, # inverts performance since NePS minimizes ) df_idxs = torch.Tensor(idxs) df_x = torch.Tensor(get_tokenized_data(configs)) df_steps = torch.Tensor(steps) - train_x = torch.hstack([ - df_idxs.reshape(df_steps.shape[0], 1), - df_steps.reshape(df_steps.shape[0], 1), - df_x - ]) + train_x = torch.hstack( + [ + df_idxs.reshape(df_steps.shape[0], 1), + df_steps.reshape(df_steps.shape[0], 1), + df_x, + ] + ) train_y = torch.Tensor(performance) - + # fit the model, on only completed runs self._fit(train_x, train_y) @@ -270,16 +276,18 @@ def update_model(self): self.observed_configs.perf_col, self.pipeline_space, step_size=self.step_size, - maximize=True # inverts performance since NePS minimizes + maximize=True, # inverts performance since NePS minimizes ) _df_x = torch.Tensor(get_tokenized_data(_configs)) _df_idxs = torch.Tensor(_idxs) _df_steps = torch.Tensor(_steps) - _test_x = torch.hstack([ - _df_idxs.reshape(_df_idxs.shape[0], 1), - _df_steps.reshape(_df_steps.shape[0], 1), - _df_x - ]) + _test_x = torch.hstack( + [ + _df_idxs.reshape(_df_idxs.shape[0], 1), + _df_steps.reshape(_df_steps.shape[0], 1), + _df_x, + ] + ) _performances = self._predict(_test_x) # returns maximizing metric # update the training data train_x = torch.vstack([train_x, _test_x]) @@ -294,14 +302,14 @@ def _fit(self, train_x: torch.Tensor, train_y: torch.Tensor): # pylint: disable self.surrogate_model.train_y = train_y def _predict(self, test_x: torch.Tensor) -> torch.Tensor: - assert self.surrogate_model.train_x is not None and self.surrogate_model.train_y is not None, "Model not trained yet!" + assert ( + self.surrogate_model.train_x is not None + and self.surrogate_model.train_y is not None + ), "Model not trained yet!" if self.surrogate_model_name == "ftpfn": mean = self.surrogate_model.get_mean_performance(test_x) if mean.is_cuda: mean = mean.cpu() return mean - else: - # check neps/optimizers/bayesian_optimization/models/__init__.py for options - raise ValueError( - f"Surrogate model {self.surrogate_model_name} not supported!" 
- ) + # check neps/optimizers/bayesian_optimization/models/__init__.py for options + raise ValueError(f"Surrogate model {self.surrogate_model_name} not supported!") diff --git a/neps/optimizers/multi_fidelity/promotion_policy.py b/neps/optimizers/multi_fidelity/promotion_policy.py index 102b7f82..8f6847ff 100644 --- a/neps/optimizers/multi_fidelity/promotion_policy.py +++ b/neps/optimizers/multi_fidelity/promotion_policy.py @@ -1,10 +1,12 @@ +from __future__ import annotations + from abc import ABC, abstractmethod import numpy as np class PromotionPolicy(ABC): - """Base class for implementing a sampling straregy for SH and its subclasses""" + """Base class for implementing a sampling straregy for SH and its subclasses.""" def __init__(self, eta: int): self.rung_members: dict = {} @@ -54,12 +56,12 @@ def set_state( self.config_map = config_map def retrieve_promotions(self) -> dict: - """Returns the top 1/eta configurations per rung if enough configurations seen""" + """Returns the top 1/eta configurations per rung if enough configurations seen.""" assert self.config_map is not None - self.rung_promotions = {rung: [] for rung in self.config_map.keys()} + self.rung_promotions = {rung: [] for rung in self.config_map} total_rung_evals = 0 - for rung in reversed(sorted(self.config_map.keys())): + for rung in sorted(self.config_map.keys(), reverse=True): total_rung_evals += len(self.rung_members[rung]) if ( total_rung_evals >= self.config_map[rung] @@ -93,7 +95,7 @@ def __init__(self, eta, **kwargs): super().__init__(eta, **kwargs) def retrieve_promotions(self) -> dict: - """Returns the top 1/eta configurations per rung if enough configurations seen""" + """Returns the top 1/eta configurations per rung if enough configurations seen.""" for rung in range(self.max_rung + 1): if rung == self.max_rung: # cease promotions for the highest rung (configs at max budget) @@ -102,6 +104,6 @@ def retrieve_promotions(self) -> dict: top_k = len(self.rung_members_performance[rung]) // self.eta _ordered_idx = np.argsort(self.rung_members_performance[rung]) self.rung_promotions[rung] = np.array(self.rung_members[rung])[_ordered_idx][ - :top_k - ].tolist() + :top_k + ].tolist() return self.rung_promotions diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index da564c6b..9208e4c3 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -1,32 +1,38 @@ # mypy: disable-error-code = assignment +from __future__ import annotations + import logging from abc import ABC, abstractmethod -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np import pandas as pd import torch -from neps.utils.common import instance_from_map -from ...search_spaces.search_space import SearchSpace -from ..bayesian_optimization.acquisition_functions import AcquisitionMapping -from ..bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from ..bayesian_optimization.acquisition_functions.prior_weighted import ( +from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping +from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) -from ..bayesian_optimization.acquisition_samplers import AcquisitionSamplerMapping -from ..bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, +from 
neps.optimizers.bayesian_optimization.acquisition_samplers import ( + AcquisitionSamplerMapping, ) -from ..bayesian_optimization.models import SurrogateModelMapping -from ..multi_fidelity_prior.utils import ( +from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping +from neps.optimizers.multi_fidelity_prior.utils import ( compute_config_dist, custom_crossover, local_mutation, update_fidelity, ) +from neps.utils.common import instance_from_map + +if TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace TOLERANCE = 1e-2 # 1% SAMPLE_THRESHOLD = 1000 # num samples to be rejected for increasing hypersphere radius @@ -35,7 +41,7 @@ class SamplingPolicy(ABC): - """Base class for implementing a sampling strategy for SH and its subclasses""" + """Base class for implementing a sampling strategy for SH and its subclasses.""" def __init__(self, pipeline_space: SearchSpace, patience: int = 100, logger=None): self.pipeline_space = pipeline_space @@ -48,7 +54,7 @@ def sample(self, *args, **kwargs) -> SearchSpace: class RandomUniformPolicy(SamplingPolicy): - """A random policy for sampling configuration, i.e. the default for SH / hyperband + """A random policy for sampling configuration, i.e. the default for SH / hyperband. Args: SamplingPolicy ([type]): [description] @@ -80,7 +86,7 @@ def __init__( self.fraction_from_prior = fraction_from_prior def sample(self, *args, **kwargs) -> SearchSpace: - """Samples from the prior with a certain probabiliyu + """Samples from the prior with a certain probabiliyu. Returns: SearchSpace: [description] @@ -88,10 +94,9 @@ def sample(self, *args, **kwargs) -> SearchSpace: user_priors = False if np.random.uniform() < self.fraction_from_prior: user_priors = True - config = self.pipeline_space.sample( + return self.pipeline_space.sample( patience=self.patience, user_priors=user_priors, ignore_fidelity=True ) - return config class EnsemblePolicy(SamplingPolicy): @@ -151,9 +156,13 @@ def sample_neighbour(self, incumbent, distance, tolerance=TOLERANCE): return config def sample( - self, inc: SearchSpace = None, weights: dict[str, float] = None, *args, **kwargs + self, + inc: SearchSpace = None, + weights: dict[str, float] | None = None, + *args, + **kwargs, ) -> SearchSpace: - """Samples from the prior with a certain probability + """Samples from the prior with a certain probability. Returns: SearchSpace: [description] @@ -256,7 +265,7 @@ def sample( class ModelPolicy(SamplingPolicy): - """A policy for sampling configuration, i.e. the default for SH / hyperband + """A policy for sampling configuration, i.e. the default for SH / hyperband. 
Args: SamplingPolicy ([type]): [description] @@ -266,7 +275,7 @@ def __init__( self, pipeline_space: SearchSpace, surrogate_model: str | Any = "gp", - surrogate_model_args: dict = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", @@ -328,7 +337,10 @@ def update_model(self, train_x, train_y, pending_x, decay_t=None): # self.acquisition_sampler.set_state(x=train_x, y=train_y) def sample( - self, active_max_fidelity: int = None, fidelity: int = None, **kwargs + self, + active_max_fidelity: int | None = None, + fidelity: int | None = None, + **kwargs, ) -> SearchSpace: """Performs the equivalent of optimizing the acquisition function. @@ -381,11 +393,10 @@ def sample( # computes the EI for all `samples` eis = self.acquisition.eval(x=samples, asscalar=True) # extracting the highest scored sample - config = samples[np.argmax(eis)] + return samples[np.argmax(eis)] # TODO: can generalize s.t. sampler works for all types, currently, # random sampler in NePS does not do what is required here # return self.acquisition_sampler.sample(self.acquisition) - return config class BaseDynamicModelPolicy(SamplingPolicy): @@ -394,10 +405,10 @@ def __init__( pipeline_space: SearchSpace, observed_configs: Any = None, surrogate_model: str | Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - graph_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + graph_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", use_priors: bool = False, log_prior_weighted: bool = False, @@ -541,10 +552,9 @@ def sample(self, rand_promotion_prob=0.5, seed=777, is_promotion=False, **kwargs if is_promotion and promoted: return config_id - elif is_promotion: + if is_promotion: return None - else: - return config + return config # def sample(self, **kwargs): # return self._sample(is_promotion=False, **kwargs) diff --git a/neps/optimizers/multi_fidelity/successive_halving.py b/neps/optimizers/multi_fidelity/successive_halving.py index 6df62333..b87f6b77 100644 --- a/neps/optimizers/multi_fidelity/successive_halving.py +++ b/neps/optimizers/multi_fidelity/successive_halving.py @@ -4,20 +4,12 @@ import random import typing from copy import deepcopy +from typing import Literal +from typing_extensions import override import numpy as np import pandas as pd -from typing import Literal -from typing_extensions import override -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces import ( - CategoricalParameter, - ConstantParameter, - FloatParameter, - IntegerParameter, - SearchSpace, -) from neps.optimizers.base_optimizer import BaseOptimizer from neps.optimizers.multi_fidelity.promotion_policy import ( AsyncPromotionPolicy, @@ -27,6 +19,16 @@ FixedPriorPolicy, RandomUniformPolicy, ) +from neps.search_spaces import ( + CategoricalParameter, + ConstantParameter, + FloatParameter, + IntegerParameter, + SearchSpace, +) + +if typing.TYPE_CHECKING: + from neps.utils.types import ConfigResult, RawConfig CUSTOM_FLOAT_CONFIDENCE_SCORES = dict(FloatParameter.DEFAULT_CONFIDENCE_SCORES) CUSTOM_FLOAT_CONFIDENCE_SCORES.update({"ultra": 0.05}) @@ -43,7 +45,7 @@ class SuccessiveHalvingBase(BaseOptimizer): def __init__( self, pipeline_space: SearchSpace, - budget: int = None, + budget: int | None = None, eta: int = 3, early_stopping_rate: int = 
0, initial_design_type: Literal["max_budget", "unique_configs"] = "max_budget", @@ -54,7 +56,7 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, @@ -142,9 +144,9 @@ def __init__( # crucial data structure used for determining promotion candidates self.observed_configs = pd.DataFrame([], columns=("config", "rung", "perf")) # stores which configs occupy each rung at any time - self.rung_members: dict = dict() # stores config IDs per rung - self.rung_members_performance: dict = dict() # performances recorded per rung - self.rung_promotions: dict = dict() # records a promotable config per rung + self.rung_members: dict = {} # stores config IDs per rung + self.rung_members_performance: dict = {} # performances recorded per rung + self.rung_promotions: dict = {} # records a promotable config per rung self.total_fevals = 0 # setup SH state counter @@ -178,7 +180,7 @@ def get_incumbent_score(self): def _get_rung_map(self, s: int = 0) -> dict: """Maps rungs (0,1,...,k) to a fidelity value based on fidelity bounds, eta, s.""" assert s <= self.stopping_rate_limit - new_min_budget = self.min_budget * (self.eta ** s) + new_min_budget = self.min_budget * (self.eta**s) nrungs = ( np.floor(np.log(self.max_budget / new_min_budget) / np.log(self.eta)).astype( int @@ -186,7 +188,7 @@ def _get_rung_map(self, s: int = 0) -> dict: + 1 ) _max_budget = self.max_budget - rung_map = dict() + rung_map = {} for i in reversed(range(nrungs)): rung_map[i + s] = ( int(_max_budget) @@ -197,9 +199,9 @@ def _get_rung_map(self, s: int = 0) -> dict: return rung_map def _get_config_map(self, s: int = 0) -> dict: - """Maps rungs (0,1,...,k) to the number of configs for each fidelity""" + """Maps rungs (0,1,...,k) to the number of configs for each fidelity.""" assert s <= self.stopping_rate_limit - new_min_budget = self.min_budget * (self.eta ** s) + new_min_budget = self.min_budget * (self.eta**s) nrungs = ( np.floor(np.log(self.max_budget / new_min_budget) / np.log(self.eta)).astype( int @@ -209,8 +211,8 @@ def _get_config_map(self, s: int = 0) -> dict: s_max = self.stopping_rate_limit + 1 _s = self.stopping_rate_limit - s # L2 from Alg 1 in https://arxiv.org/pdf/1603.06560.pdf - _n_config = np.floor(s_max / (_s + 1)) * self.eta ** _s - config_map = dict() + _n_config = np.floor(s_max / (_s + 1)) * self.eta**_s + config_map = {} for i in range(nrungs): config_map[i + s] = int(_n_config) _n_config //= self.eta @@ -253,7 +255,6 @@ def _load_previous_observations( # rung histories are collected only for `previous` and not `pending` configs self.rung_histories[int(_rung)]["config"].append(int(_config)) self.rung_histories[int(_rung)]["perf"].append(perf) - return def _handle_pending_evaluations( self, pending_evaluations: dict[str, ConfigResult] @@ -274,12 +275,11 @@ def _handle_pending_evaluations( else: self.observed_configs.at[int(_config), "rung"] = int(_rung) self.observed_configs.at[int(_config), "perf"] = np.nan - return def clean_rung_information(self): - self.rung_members = {k: [] for k in self.rung_map.keys()} - self.rung_members_performance = {k: [] for k in self.rung_map.keys()} - self.rung_promotions = {k: [] for k in self.rung_map.keys()} + self.rung_members = {k: [] for k in self.rung_map} + self.rung_members_performance = {k: [] 
for k in self.rung_map} + self.rung_promotions = {k: [] for k in self.rung_map} def _get_rungs_state(self, observed_configs=None): """Collects info on configs at a rung and their performance there.""" @@ -303,7 +303,6 @@ def _get_rungs_state(self, observed_configs=None): idxs = observed_configs.rung == _rung self.rung_members[_rung] = observed_configs.index[idxs].values self.rung_members_performance[_rung] = observed_configs.perf[idxs].values - return def _handle_promotions(self): self.promotion_policy.set_state( @@ -357,14 +356,12 @@ def load_optimization_state( # fit any model/surrogates self._fit_models() - return - def is_init_phase(self) -> bool: return True def sample_new_config( self, - rung: int = None, + rung: int | None = None, **kwargs, ): # Samples configuration from policy or random @@ -459,7 +456,7 @@ def _enhance_priors(self, confidence_score=None): for k, v in self.pipeline_space.items(): if v.is_fidelity or isinstance(v, ConstantParameter): continue - elif isinstance(v, (FloatParameter, IntegerParameter)): + if isinstance(v, FloatParameter | IntegerParameter): if confidence_score is None: confidence = CUSTOM_FLOAT_CONFIDENCE_SCORES[self.prior_confidence] else: @@ -478,7 +475,7 @@ def _enhance_priors(self, confidence_score=None): class SuccessiveHalving(SuccessiveHalvingBase): def _calc_budget_used_in_bracket(self, config_history: list[int]): budget = 0 - for rung in self.config_map.keys(): + for rung in self.config_map: count = sum(config_history == rung) # `range(min_rung, rung+1)` counts the black-box cost of promotions since # SH budgets assume each promotion involves evaluation from scratch @@ -598,7 +595,7 @@ def __init__( cost_value_on_error: None | float = None, ignore_errors: bool = False, logger=None, - prior_confidence: Literal["low", "medium", "high"] = None, + prior_confidence: Literal["low", "medium", "high"] | None = None, random_interleave_prob: float = 0.0, sample_default_first: bool = False, sample_default_at_target: bool = False, diff --git a/neps/optimizers/multi_fidelity/utils.py b/neps/optimizers/multi_fidelity/utils.py index f551e73f..0158fbdf 100644 --- a/neps/optimizers/multi_fidelity/utils.py +++ b/neps/optimizers/multi_fidelity/utils.py @@ -1,25 +1,28 @@ # type: ignore -from typing import Any, Sequence +from __future__ import annotations +from collections.abc import Sequence from copy import deepcopy +from typing import TYPE_CHECKING, Any + import numpy as np import pandas as pd -import torch -from neps.search_spaces.search_space import SearchSpace from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + def continuous_to_tabular( config: SearchSpace, categorical_space: SearchSpace ) -> SearchSpace: - """ - Convert the continuous parameters in the config into categorical ones based on - the categorical_space provided + """Convert the continuous parameters in the config into categorical ones based on + the categorical_space provided. 
""" result = config.clone() for hp_name, _ in config.items(): - if hp_name in categorical_space.keys(): + if hp_name in categorical_space: choices = np.array(categorical_space[hp_name].choices) diffs = choices - config[hp_name].value # NOTE: in case of a tie the first value in the choices array will be returned @@ -44,14 +47,13 @@ def get_tokenized_data( configs: list[SearchSpace], ignore_fidelity: bool = True, ) -> np.ndarray: # pd.Series: # tuple[np.ndarray, np.ndarray, np.ndarray]: - """Extracts configurations, indices and performances given a DataFrame + """Extracts configurations, indices and performances given a DataFrame. Tokenizes the given set of observations as required by a PFN surrogate model. """ - configs = np.array( + return np.array( [normalize_vectorize_config(c, ignore_fidelity=ignore_fidelity) for c in configs] ) - return configs def get_freeze_thaw_normalized_step( @@ -98,8 +100,7 @@ def get_training_data_for_freeze_thaw( class MFObservedData: - """ - (Under development) + """(Under development). This module is used to unify the data access across different Multi-Fidelity optimizers. It stores column names and index names. Possible optimizations @@ -176,8 +177,7 @@ def completed_runs_index(self) -> pd.Index | pd.MultiIndex: def next_config_id(self) -> int: if len(self.seen_config_ids): return max(self.seen_config_ids) + 1 - else: - return 0 + return 0 def add_data( self, @@ -185,9 +185,7 @@ def add_data( index: tuple[int, ...] | Sequence[tuple[int, ...]] | Sequence[int] | int, error: bool = False, ): - """ - Add data only if none of the indices are already existing in the DataFrame - """ + """Add data only if none of the indices are already existing in the DataFrame.""" # TODO: If index is only config_id extend it if not isinstance(index, list): index_list = [index] @@ -213,16 +211,11 @@ def update_data( index: tuple[int, ...] | Sequence[tuple[int, ...]] | Sequence[int] | int, error: bool = False, ): - """ - Update data if all the indices already exist in the DataFrame - """ - if not isinstance(index, list): - index_list = [index] - else: - index_list = index + """Update data if all the indices already exist in the DataFrame.""" + index_list = [index] if not isinstance(index, list) else index if self.df.index.isin(index_list).sum() == len(index_list): - column_names, data = zip(*data_dict.items()) - data = list(zip(*data)) + column_names, data = zip(*data_dict.items(), strict=False) + data = list(zip(*data, strict=False)) self.df.loc[index_list, list(column_names)] = data elif error: @@ -243,8 +236,7 @@ def all_configs_list(self) -> list[Any]: return self.df.loc[:, self.config_col].sort_index().values.tolist() def get_incumbents_for_budgets(self, maximize: bool = False): - """ - Returns a series object with the best partial configuration for each budget id + """Returns a series object with the best partial configuration for each budget id. 
Note: this will always map the best lowest ID if two configurations have the same performance at the same fidelity @@ -255,13 +247,14 @@ def get_incumbents_for_budgets(self, maximize: bool = False): else: config_ids = learning_curves.idxmin(axis=0) - indices = list(zip(config_ids.values.tolist(), config_ids.index.to_list())) + indices = list( + zip(config_ids.values.tolist(), config_ids.index.to_list(), strict=False) + ) partial_configs = self.df.loc[indices, self.config_col].to_list() return pd.Series(partial_configs, index=config_ids.index, name=self.config_col) def get_best_performance_for_each_budget(self, maximize: bool = False): - """ - Returns a series object with the best partial configuration for each budget id + """Returns a series object with the best partial configuration for each budget id. Note: this will always map the best lowest ID if two configurations has the same performance at the same fidelity @@ -280,12 +273,10 @@ def get_budget_level_for_best_performance(self, maximize: bool = False) -> int: y_star = self.get_best_seen_performance(maximize=maximize) # uses the minimum of the budget that see the maximum obseved score op = max if maximize else min - z_inc = int(op([_z for _z, _y in perf_per_z.items() if _y == y_star])) - return z_inc + return int(op([_z for _z, _y in perf_per_z.items() if _y == y_star])) def get_best_learning_curve_id(self, maximize: bool = False): - """ - Returns a single configuration id of the best observed performance + """Returns a single configuration id of the best observed performance. Note: this will always return the single best lowest ID if two configurations has the same performance @@ -293,25 +284,20 @@ def get_best_learning_curve_id(self, maximize: bool = False): learning_curves = self.get_learning_curves() if maximize: return learning_curves.max(axis=1).idxmax() - else: - return learning_curves.min(axis=1).idxmin() + return learning_curves.min(axis=1).idxmin() def get_best_seen_performance(self, maximize: bool = False): learning_curves = self.get_learning_curves() if maximize: return learning_curves.max(axis=1).max() - else: - return learning_curves.min(axis=1).min() + return learning_curves.min(axis=1).min() def add_budget_column(self): combined_df = self.df.reset_index(level=1) - combined_df.set_index( - keys=[self.budget_idx], drop=False, append=True, inplace=True - ) - return combined_df + return combined_df.set_index(keys=[self.budget_idx], drop=False, append=True) def reduce_to_max_seen_budgets(self): - self.df.sort_index(inplace=True) + self.df = self.df.sort_index() combined_df = self.add_budget_column() return combined_df.groupby(level=0).last() @@ -344,7 +330,7 @@ def extract_learning_curve( def get_best_performance_per_config(self, maximize: bool = False) -> pd.Series: """Returns the best score recorded per config across fidelities seen.""" op = np.max if maximize else np.min - perf = ( + return ( self.df.sort_values( "budget_id", ascending=False ) # sorts with largest budget first @@ -354,7 +340,6 @@ def get_best_performance_per_config(self, maximize: bool = False) -> pd.Series: op ) # finds the minimum over per-config learning curve ) - return perf def get_max_observed_fidelity_level_per_config(self) -> pd.Series: """Returns the highest fidelity level recorded per config seen.""" diff --git a/neps/optimizers/multi_fidelity_prior/async_priorband.py b/neps/optimizers/multi_fidelity_prior/async_priorband.py index ce2352cf..0a859dec 100644 --- a/neps/optimizers/multi_fidelity_prior/async_priorband.py +++ 
b/neps/optimizers/multi_fidelity_prior/async_priorband.py @@ -1,18 +1,11 @@ -import typing +from __future__ import annotations -import numpy as np +import typing from typing import Literal from typing_extensions import override -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) +import numpy as np + from neps.optimizers.multi_fidelity.mf_bo import MFBOBase from neps.optimizers.multi_fidelity.promotion_policy import AsyncPromotionPolicy from neps.optimizers.multi_fidelity.sampling_policy import EnsemblePolicy, ModelPolicy @@ -21,6 +14,17 @@ ) from neps.optimizers.multi_fidelity_prior.priorband import PriorBandBase +if typing.TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class PriorBandAsha(MFBOBase, PriorBandBase, AsynchronousSuccessiveHalvingWithPriors): """Implements a PriorBand on top of ASHA.""" @@ -50,12 +54,12 @@ def __init__( # arguments for model model_based: bool = False, # crucial argument to set to allow model-search modelling_type: str = "joint", # could also be {"rung"} - initial_design_size: int = None, + initial_design_size: int | None = None, model_policy: typing.Any = ModelPolicy, surrogate_model: str | typing.Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", @@ -95,15 +99,15 @@ def __init__( }, } - bo_args = dict( - surrogate_model=surrogate_model, - domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - surrogate_model_args=surrogate_model_args, - acquisition=acquisition, - log_prior_weighted=log_prior_weighted, - acquisition_sampler=acquisition_sampler, - ) + bo_args = { + "surrogate_model": surrogate_model, + "domain_se_kernel": domain_se_kernel, + "hp_kernels": hp_kernels, + "surrogate_model_args": surrogate_model_args, + "acquisition": acquisition, + "log_prior_weighted": log_prior_weighted, + "acquisition_sampler": acquisition_sampler, + } self.model_based = model_based self.modelling_type = modelling_type self.initial_design_size = initial_design_size @@ -128,10 +132,7 @@ def get_config_and_ids( [type]: [description] """ rung_to_promote = self.is_promotable() - if rung_to_promote is not None: - rung = rung_to_promote + 1 - else: - rung = self.min_rung + rung = rung_to_promote + 1 if rung_to_promote is not None else self.min_rung self.set_sampling_weights_and_inc(rung=rung) # performs standard ASHA but sampling happens as per the EnsemblePolicy return super().get_config_and_ids() @@ -166,43 +167,43 @@ def __init__( # arguments for model model_based: bool = False, # crucial argument to set to allow model-search modelling_type: str = "joint", # could also be {"rung"} - 
initial_design_size: int = None, + initial_design_size: int | None = None, model_policy: typing.Any = ModelPolicy, surrogate_model: str | typing.Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", ): # collecting arguments required by ASHA - args = dict( - pipeline_space=pipeline_space, - budget=budget, - eta=eta, - early_stopping_rate=self.early_stopping_rate, - initial_design_type=initial_design_type, - sampling_policy=sampling_policy, - promotion_policy=promotion_policy, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - prior_confidence=prior_confidence, - random_interleave_prob=random_interleave_prob, - sample_default_first=sample_default_first, - sample_default_at_target=sample_default_at_target, - ) - bo_args = dict( - surrogate_model=surrogate_model, - domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - surrogate_model_args=surrogate_model_args, - acquisition=acquisition, - log_prior_weighted=log_prior_weighted, - acquisition_sampler=acquisition_sampler, - ) + args = { + "pipeline_space": pipeline_space, + "budget": budget, + "eta": eta, + "early_stopping_rate": self.early_stopping_rate, + "initial_design_type": initial_design_type, + "sampling_policy": sampling_policy, + "promotion_policy": promotion_policy, + "loss_value_on_error": loss_value_on_error, + "cost_value_on_error": cost_value_on_error, + "ignore_errors": ignore_errors, + "logger": logger, + "prior_confidence": prior_confidence, + "random_interleave_prob": random_interleave_prob, + "sample_default_first": sample_default_first, + "sample_default_at_target": sample_default_at_target, + } + bo_args = { + "surrogate_model": surrogate_model, + "domain_se_kernel": domain_se_kernel, + "hp_kernels": hp_kernels, + "surrogate_model_args": surrogate_model_args, + "acquisition": acquisition, + "log_prior_weighted": log_prior_weighted, + "acquisition_sampler": acquisition_sampler, + } super().__init__( **args, prior_weight_type=prior_weight_type, @@ -257,7 +258,7 @@ def load_optimization_state( previous_results=previous_results, pending_evaluations=pending_evaluations, budget_info=budget_info, - optimizer_state=optimizer_state + optimizer_state=optimizer_state, ) # important for the global HB to run the right SH self._update_sh_bracket_state() @@ -278,8 +279,7 @@ def _get_bracket_to_run(self): self.eta ** (K - s) * (K + 1) / (K - s + 1) for s in range(self.max_rung + 1) ] bracket_probs = np.array(bracket_probs) / sum(bracket_probs) - bracket_next = np.random.choice(range(self.max_rung + 1), p=bracket_probs) - return bracket_next + return np.random.choice(range(self.max_rung + 1), p=bracket_probs) def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: """...and this is the method that decides which point to query. 
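The constructor hunks in this file convert the keyword collections handed to `super().__init__` from `dict(...)` calls to dict literals; behaviour is unchanged. A tiny self-contained sketch of the pattern (class names and parameters are illustrative):

class Base:
    def __init__(self, eta: int = 3, logger=None):
        self.eta = eta
        self.logger = logger


class Child(Base):
    def __init__(self, eta: int = 3, logger=None):
        # dict literal instead of dict(eta=eta, logger=logger)
        args = {"eta": eta, "logger": logger}
        super().__init__(**args)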
diff --git a/neps/optimizers/multi_fidelity_prior/priorband.py b/neps/optimizers/multi_fidelity_prior/priorband.py index be7b3151..bdf3a567 100644 --- a/neps/optimizers/multi_fidelity_prior/priorband.py +++ b/neps/optimizers/multi_fidelity_prior/priorband.py @@ -1,16 +1,10 @@ +from __future__ import annotations + import typing from typing import Literal import numpy as np -from neps.utils.types import RawConfig -from neps.search_spaces.search_space import SearchSpace -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) from neps.optimizers.multi_fidelity.hyperband import HyperbandCustomDefault from neps.optimizers.multi_fidelity.mf_bo import MFBOBase from neps.optimizers.multi_fidelity.promotion_policy import SyncPromotionPolicy @@ -22,6 +16,16 @@ get_prior_weight_for_decay, ) +if typing.TYPE_CHECKING: + from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( + BaseAcquisition, + ) + from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( + AcquisitionSampler, + ) + from neps.search_spaces.search_space import SearchSpace + from neps.utils.types import RawConfig + class PriorBandBase: """Class that defines essential properties needed by PriorBand. @@ -35,16 +39,14 @@ def find_all_distances_from_incumbent(self, incumbent): # computing distance of incumbent from all seen points in history distances = [dist(config) for config in self.observed_configs.config] # ensuring the distances exclude 0 or the distance from itself - distances = [d for d in distances if d > 0] - return distances + return [d for d in distances if d > 0] def find_1nn_distance_from_incumbent(self, incumbent): """Finds the distance to the nearest neighbour.""" distances = self.find_all_distances_from_incumbent(incumbent) - distance = min(distances) - return distance + return min(distances) - def find_incumbent(self, rung: int = None) -> SearchSpace: + def find_incumbent(self, rung: int | None = None) -> SearchSpace: """Find the best performing configuration seen so far.""" rungs = self.observed_configs.rung.values idxs = self.observed_configs.index.values @@ -120,7 +122,7 @@ def is_activate_inc(self) -> bool: continuation_resources = bracket.rung_map[bracket.min_rung] resources = bracket.config_map[bracket.min_rung] * continuation_resources for r in range(1, len(bracket.rung_map)): - rung = sorted(list(bracket.rung_map.keys()), reverse=False)[r] + rung = sorted(bracket.rung_map.keys(), reverse=False)[r] continuation_resources = bracket.rung_map[rung] - bracket.rung_map[rung - 1] resources += bracket.config_map[rung] * continuation_resources @@ -144,7 +146,7 @@ def calc_sampling_args(self, rung) -> dict: # scales weight of prior by eta raised to the current rung level # at the base rung thus w_prior = w_random # at the max rung r, w_prior = eta^r * w_random - _w_prior = (self.eta ** rung) * _w_random + _w_prior = (self.eta**rung) * _w_random elif self.prior_weight_type == "linear": _w_random = 1 w_prior_min_rung = 1 * _w_random @@ -174,12 +176,11 @@ def calc_sampling_args(self, rung) -> dict: w_inc = _w_inc * w_prior w_prior = _w_prior * w_prior - sampling_args = { + return { "prior": w_prior, "inc": w_inc, "random": w_random, } - return sampling_args def prior_to_incumbent_ratio(self) -> float | float: """Calculates the normalized weight distribution between prior and incumbent. 
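For the eta**rung prior weighting in `calc_sampling_args` above, the prior weight equals the random weight at the base rung and grows by a factor of eta per rung. Ignoring the prior/incumbent split the full method applies afterwards, a quick numeric check with an illustrative eta = 3:

eta = 3  # illustrative
for rung in range(4):
    w_random = 1
    w_prior = (eta**rung) * w_random
    total = w_prior + w_random
    print(rung, round(w_prior / total, 3), round(w_random / total, 3))
# 0 0.5 0.5
# 1 0.75 0.25
# 2 0.9 0.1
# 3 0.964 0.036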
@@ -188,15 +189,14 @@ def prior_to_incumbent_ratio(self) -> float | float: """ if self.inc_style == "constant": return self._prior_to_incumbent_ratio_constant() - elif self.inc_style == "decay": + if self.inc_style == "decay": resources = calc_total_resources_spent(self.observed_configs, self.rung_map) return self._prior_to_incumbent_ratio_decay( resources, self.eta, self.min_budget, self.max_budget ) - elif self.inc_style == "dynamic": + if self.inc_style == "dynamic": return self._prior_to_incumbent_ratio_dynamic(self.max_rung) - else: - raise ValueError(f"Invalid option {self.inc_style}") + raise ValueError(f"Invalid option {self.inc_style}") def _prior_to_incumbent_ratio_decay( self, resources: float, eta: int, min_budget, max_budget @@ -256,17 +256,14 @@ def _prior_to_incumbent_ratio_dynamic(self, rung: int) -> float | float: # normalizing scores to be weighted ratios w_prior = prior_score / sum(weighted_top_config_scores) w_inc = inc_score / sum(weighted_top_config_scores) + elif rung == self.min_rung: + # setting `w_inc = eta * w_prior` as default till score calculation begins + w_prior = self.eta / (1 + self.eta) + w_inc = 1 / (1 + self.eta) else: - # if eta-configurations NOT recorded yet - # check if it is the base rung - if rung == self.min_rung: - # setting `w_inc = eta * w_prior` as default till score calculation begins - w_prior = self.eta / (1 + self.eta) - w_inc = 1 / (1 + self.eta) - else: - # if rung > min.rung then the lower rung could already have enough - # configurations and thus can be recursively queried till the base rung - return self._prior_to_incumbent_ratio_dynamic(rung - 1) + # if rung > min.rung then the lower rung could already have enough + # configurations and thus can be recursively queried till the base rung + return self._prior_to_incumbent_ratio_dynamic(rung - 1) return w_prior, w_inc @@ -296,12 +293,12 @@ def __init__( # arguments for model model_based: bool = False, # crucial argument to set to allow model-search modelling_type: str = "joint", # could also be {"rung"} - initial_design_size: int = None, + initial_design_size: int | None = None, model_policy: typing.Any = ModelPolicy, surrogate_model: str | typing.Any = "gp", - domain_se_kernel: str = None, - hp_kernels: list = None, - surrogate_model_args: dict = None, + domain_se_kernel: str | None = None, + hp_kernels: list | None = None, + surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, acquisition_sampler: str | AcquisitionSampler = "random", @@ -340,15 +337,15 @@ def __init__( }, } - bo_args = dict( - surrogate_model=surrogate_model, - domain_se_kernel=domain_se_kernel, - hp_kernels=hp_kernels, - surrogate_model_args=surrogate_model_args, - acquisition=acquisition, - log_prior_weighted=log_prior_weighted, - acquisition_sampler=acquisition_sampler, - ) + bo_args = { + "surrogate_model": surrogate_model, + "domain_se_kernel": domain_se_kernel, + "hp_kernels": hp_kernels, + "surrogate_model_args": surrogate_model_args, + "acquisition": acquisition, + "log_prior_weighted": log_prior_weighted, + "acquisition_sampler": acquisition_sampler, + } self.model_based = model_based self.modelling_type = modelling_type self.initial_design_size = initial_design_size diff --git a/neps/optimizers/multi_fidelity_prior/utils.py b/neps/optimizers/multi_fidelity_prior/utils.py index 9f4c1a47..a1c5c6dd 100644 --- a/neps/optimizers/multi_fidelity_prior/utils.py +++ b/neps/optimizers/multi_fidelity_prior/utils.py @@ -1,16 +1,22 @@ +from __future__ 
import annotations + +from typing import TYPE_CHECKING + import numpy as np -import pandas as pd import scipy from neps.search_spaces import ( CategoricalParameter, ConstantParameter, + GraphParameter, NumericalParameter, Parameter, - GraphParameter, SearchSpace, ) +if TYPE_CHECKING: + import pandas as pd + def update_fidelity(config, fidelity): config.fidelity.set_value(fidelity) @@ -32,7 +38,6 @@ def local_mutation( new_config: dict[str, Parameter] = {} for hp_name, hp in config.items(): - if hp.is_fidelity or np.random.uniform() > mutation_rate: new_config[hp_name] = hp.clone() @@ -79,7 +84,6 @@ def custom_crossover( getting config2's value of the corresponding HP. By default, crossover rate is 50%. """ for _ in range(patience): - child_config = config1.clone() for key, hyperparameter in config1.items(): if not hyperparameter.is_fidelity and np.random.random() < crossover_prob: @@ -121,8 +125,7 @@ def compute_config_dist(config1: SearchSpace, config2: SearchSpace) -> float: config1["categorical"] + [0], config2["categorical"] + [0] ) - distance = d_cont + d_cat - return distance + return d_cont + d_cat def compute_scores( @@ -153,8 +156,7 @@ def calc_total_resources_spent(observed_configs: pd.DataFrame, rung_map: dict) - for i in range(len(observed_configs)) if not np.isnan(observed_configs.at[i, "perf"]) ] - total_resources = sum(rung_map[r] for r in rungs_used) - return total_resources + return sum(rung_map[r] for r in rungs_used) # def get_prior_weight_for_decay( @@ -176,7 +178,7 @@ def calc_total_resources_spent(observed_configs: pd.DataFrame, rung_map: dict) - def get_prior_weight_for_decay( resources_used: float, eta: int, min_budget, max_budget ) -> float: - """Creates a step function schedule for the prior weight decay. + r"""Creates a step function schedule for the prior weight decay. The prior weight ratio is decayed every time the total resources used is equivalent to the cost of one successive halving bracket within the HB schedule. 
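(A quick numeric check of the step-decay schedule described in the docstring above; a standalone sketch with assumed values, mirroring the function body shown in the next hunk.)

    eta, max_budget = 3, 9                   # assumed example values
    decay = 2
    unit_resources = eta * max_budget        # 27: per the docstring, roughly one SH bracket of resources
    resources_used = 60
    idx = resources_used // unit_resources   # 2 -> two full brackets' worth of resources spent
    weight = 1 / decay**idx                  # 0.25: the prior weight halves per bracket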
@@ -186,5 +188,4 @@ def get_prior_weight_for_decay( decay = 2 unit_resources = eta * max_budget idx = resources_used // unit_resources - weight = 1 / decay**idx - return weight + return 1 / decay**idx diff --git a/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py b/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py index d14657bf..2263e16d 100644 --- a/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py +++ b/neps/optimizers/multiple_knowledge_sources/prototype_optimizer.py @@ -1,11 +1,15 @@ +from __future__ import annotations + import logging -from typing import Any, override +from typing import TYPE_CHECKING, Any, override -from neps.state.optimizer import BudgetInfo, OptimizationState -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace -from neps.utils.data_loading import read_tasks_and_dev_stages_from_disk from neps.optimizers.base_optimizer import BaseOptimizer +from neps.utils.data_loading import read_tasks_and_dev_stages_from_disk + +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig # TODO: Test if anything breaks after the recent changes diff --git a/neps/optimizers/random_search/optimizer.py b/neps/optimizers/random_search/optimizer.py index abe16866..e2da6f40 100644 --- a/neps/optimizers/random_search/optimizer.py +++ b/neps/optimizers/random_search/optimizer.py @@ -1,11 +1,15 @@ -from typing import Any +from __future__ import annotations + +from typing import TYPE_CHECKING, Any from typing_extensions import override -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig -from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class RandomSearch(BaseOptimizer): def __init__(self, use_priors=False, ignore_fidelity=True, **optimizer_kwargs): diff --git a/neps/optimizers/regularized_evolution/optimizer.py b/neps/optimizers/regularized_evolution/optimizer.py index d112be31..215e95ce 100644 --- a/neps/optimizers/regularized_evolution/optimizer.py +++ b/neps/optimizers/regularized_evolution/optimizer.py @@ -1,19 +1,23 @@ +from __future__ import annotations + import math import os import random +from collections.abc import Callable from pathlib import Path -from typing import Any, Callable +from typing import TYPE_CHECKING, Any from typing_extensions import override import numpy as np import yaml -from neps.state.optimizer import BudgetInfo -from neps.utils.types import ConfigResult, RawConfig - -from neps.search_spaces.search_space import SearchSpace from neps.optimizers.base_optimizer import BaseOptimizer +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + from neps.state.optimizer import BudgetInfo + from neps.utils.types import ConfigResult, RawConfig + class RegularizedEvolution(BaseOptimizer): def __init__( @@ -63,18 +67,19 @@ def load_optimization_state( train_x = [el.config for el in previous_results.values()] train_y = [self.get_loss(el.result) for el in previous_results.values()] self.num_train_x = len(train_x) - self.population = [ - (x, y) - for x, y in zip( - train_x[-self.population_size:], train_y[-self.population_size:] + self.population = 
list( + zip( + train_x[-self.population_size :], + train_y[-self.population_size :], + strict=False, ) - ] - self.pending_evaluations = [el for el in pending_evaluations.values()] + ) + self.pending_evaluations = list(pending_evaluations.values()) def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: if len(self.population) < self.population_size: if self.assisted: - if 0 == len(os.listdir(self.assisted_init_population_dir)): + if len(os.listdir(self.assisted_init_population_dir)) == 0: cur_population_size = self.population_size - len(self.population) configs = [ self.pipeline_space.sample( @@ -83,13 +88,12 @@ def get_config_and_ids(self) -> tuple[RawConfig, str, str | None]: for _ in range(cur_population_size * 2) ] if self.assisted_zero_cost_proxy is not None: - zero_cost_proxy_values = self.assisted_zero_cost_proxy( - x=configs) # type: ignore[misc] + zero_cost_proxy_values = self.assisted_zero_cost_proxy(x=configs) # type: ignore[misc] else: raise Exception("Zero cost proxy function is not defined!") indices = np.argsort(zero_cost_proxy_values)[-cur_population_size:][ - ::-1 - ] + ::-1 + ] for idx, config_idx in enumerate(indices): filename = str(idx).zfill( int(math.log10(cur_population_size)) + 1 diff --git a/neps/optimizers/utils.py b/neps/optimizers/utils.py index e9d29222..8a9a030f 100644 --- a/neps/optimizers/utils.py +++ b/neps/optimizers/utils.py @@ -1,19 +1,24 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + import pandas as pd -from neps.search_spaces.search_space import SearchSpace - +if TYPE_CHECKING: + from neps.search_spaces.search_space import SearchSpace + def map_real_hyperparameters_from_tabular_ids( x: pd.Series, pipeline_space: SearchSpace ) -> pd.Series: - """ Maps the tabular IDs to the actual HPs from the pipeline space. - + """Maps the tabular IDs to the actual HPs from the pipeline space. + Args: x (pd.Series): A pandas series with the tabular IDs. TODO: Mention expected format of the series. pipeline_space (SearchSpace): The pipeline space. - Returns: + Returns: pd.Series: A pandas series with the actual HPs. TODO: Mention expected format of the series. """ @@ -21,8 +26,11 @@ def map_real_hyperparameters_from_tabular_ids( return x # copying hyperparameter configs based on IDs _x = pd.Series( - [pipeline_space.custom_grid_table[x.loc[idx]["id"].value] for idx in x.index.values], - index=x.index + [ + pipeline_space.custom_grid_table[x.loc[idx]["id"].value] + for idx in x.index.values + ], + index=x.index, ) # setting the passed fidelities for the corresponding IDs for idx in _x.index.values: diff --git a/neps/plot/tensorboard_eval.py b/neps/plot/tensorboard_eval.py index 2211537d..66512ee9 100644 --- a/neps/plot/tensorboard_eval.py +++ b/neps/plot/tensorboard_eval.py @@ -335,10 +335,7 @@ def _write_image_config( if tblogger.current_epoch >= 0 and tblogger.current_epoch % counter == 0: # Log every multiple of "counter" - if num_images > len(image): - # If the number of images requested by the user - # is more than the ones available. 
- num_images = len(image) + num_images = min(num_images, len(image)) if random_images is False: subset_images = image[:num_images] diff --git a/neps/sampling/distributions.py b/neps/sampling/distributions.py index 6b557e5a..f865d173 100644 --- a/neps/sampling/distributions.py +++ b/neps/sampling/distributions.py @@ -3,9 +3,10 @@ from __future__ import annotations import math +from collections.abc import Mapping from dataclasses import dataclass from numbers import Number -from typing import TYPE_CHECKING, ClassVar, Mapping +from typing import TYPE_CHECKING, ClassVar from typing_extensions import override import torch diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index f2373a68..83c40e68 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -9,8 +9,9 @@ from __future__ import annotations +from collections.abc import Container, Iterable, Mapping, Sequence from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Container, Iterable, Mapping, Protocol, Sequence +from typing import TYPE_CHECKING, Any, Protocol from typing_extensions import override import torch @@ -278,7 +279,9 @@ def __post_init__(self): self._meaningful_dists = [] return - self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip(*rest) + self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip( + *rest, strict=False + ) @property @override @@ -311,7 +314,7 @@ def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> torch.Tens # Calculate the log probabilities of the sample domain tensors under their # respective distributions. - itr = iter(zip(self._meaningful_ixs, self._meaningful_dists)) + itr = iter(zip(self._meaningful_ixs, self._meaningful_dists, strict=False)) first_i, first_dist = next(itr) log_probs = first_dist.log_prob(translated_x[..., first_i]) @@ -416,7 +419,7 @@ def ncols(self) -> int: def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: # OPTIM: Avoid an initial allocation by using the output of the first # distribution to store the weighted probabilities - itr = zip(self.probabilities, self.priors) + itr = zip(self.probabilities, self.priors, strict=False) first_prob, first_prior = next(itr) weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index c7456155..43758094 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -6,9 +6,10 @@ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass, field from functools import reduce -from typing import Protocol, Sequence +from typing import Protocol from typing_extensions import override import torch diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 7342a136..c1a10196 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -44,8 +44,9 @@ from __future__ import annotations import math +from collections.abc import Iterable from dataclasses import dataclass, field -from typing import Generic, Iterable, TypeVar +from typing import Generic, TypeVar import torch from torch import Tensor @@ -351,7 +352,7 @@ def translate( ) out = torch.empty_like(x) - for i, (f, t) in enumerate(zip(frm, to)): + for i, (f, t) in enumerate(zip(frm, to, strict=False)): out[..., i] = t.cast(x[..., i], frm=f) return out diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 71555fef..5f68aff9 100644 --- a/neps/search_spaces/encoding.py +++ 
b/neps/search_spaces/encoding.py @@ -1,12 +1,11 @@ from __future__ import annotations +from collections.abc import Mapping, Sequence from dataclasses import dataclass, field from typing import ( TYPE_CHECKING, Any, Generic, - Mapping, - Sequence, TypeAlias, TypeVar, ) @@ -264,14 +263,17 @@ def unpack(self, x: torch.Tensor) -> list[dict[str, Any]]: values[hp_name] = transformer.decode(tensor) keys = list(values.keys()) - return [dict(zip(keys, vals)) for vals in zip(*values.values())] + return [ + dict(zip(keys, vals, strict=False)) + for vals in zip(*values.values(), strict=False) + ] @classmethod def default(cls, parameters: Mapping[str, Parameter]) -> TensorEncoder: sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} for name, hp in sorted_params: - if isinstance(hp, (FloatParameter, IntegerParameter)): + if isinstance(hp, FloatParameter | IntegerParameter): transformers[name] = MinMaxNormalizer(hp.domain) else: assert isinstance(hp, CategoricalParameter) diff --git a/neps/state/__init__.py b/neps/state/__init__.py index 7a85c7d4..e870d656 100644 --- a/neps/state/__init__.py +++ b/neps/state/__init__.py @@ -1,3 +1,4 @@ +from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo from neps.state.protocols import ( Locker, ReaderWriter, @@ -5,7 +6,6 @@ VersionedResource, Versioner, ) -from neps.state.optimizer import BudgetInfo, OptimizationState, OptimizerInfo from neps.state.seed_snapshot import SeedSnapshot from neps.state.trial import Trial From 54b605929d48ff059626c4fe3bfc9f3f03c4a1fd Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 13:09:45 +0200 Subject: [PATCH 37/63] ci: Update deps on botorch/gpytorch --- pyproject.toml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b670189..3e304c55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ pandas = "^2" networkx = "^2.6.3" nltk = "^3.6.4" scipy = ">=1.13.1" -torch = ">1.7.0,!=2.0.1, !=2.1.0" +torch = ">=2.0.1" matplotlib = "^3" more-itertools = "*" portalocker = "^2" @@ -64,6 +64,8 @@ tensorboard = "^2" typing-extensions = "*" torchvision = ">=0.8.0" ifbo = ">=0.3.10" +botorch = ">=0.12" +gpytorch = "1.13.0" [tool.poetry.group.dev.dependencies] ruff = "*" @@ -80,13 +82,6 @@ mkdocs-literate-nav = "*" mike = "*" black = "*" # This allows mkdocstrings to format signatures in the docs - -[tool.poetry.group.experimental] -optional = true - -[tool.poetry.group.experimental.dependencies] -gpytorch = "1.8.0" - [build-system] requires = ["poetry-core>=1.1.0"] build-backend = "poetry.core.masonry.api" From 6782a8ec50ba3ac1b3479146f9ffd28d618e4142 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 14:46:22 +0200 Subject: [PATCH 38/63] fix: rely on botorch's new defaults https://github.com/pytorch/botorch/discussions/2451 --- .../kernels/get_kernels.py | 40 ----- .../bayesian_optimization/models/gp.py | 163 +++--------------- .../bayesian_optimization/optimizer.py | 11 +- 3 files changed, 27 insertions(+), 187 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/get_kernels.py diff --git a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py b/neps/optimizers/bayesian_optimization/kernels/get_kernels.py deleted file mode 100644 index 36add92e..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/get_kernels.py +++ /dev/null @@ -1,40 +0,0 @@ -from __future__ import annotations - -from neps.search_spaces.architecture.core_graph_grammar 
import CoreGraphGrammar -from neps.search_spaces.hyperparameters.categorical import CategoricalParameter -from neps.search_spaces.hyperparameters.float import FloatParameter -from neps.search_spaces.hyperparameters.integer import IntegerParameter -from neps.utils.common import has_instance, instance_from_map - -from . import GraphKernelMapping, StationaryKernelMapping - - -def get_kernels( - pipeline_space, domain_se_kernel, graph_kernels, hp_kernels, optimal_assignment -): - if not graph_kernels: - graph_kernels = [] - if has_instance(pipeline_space.values(), CoreGraphGrammar): - graph_kernels.append("wl") - if not hp_kernels: - hp_kernels = [] - if has_instance(pipeline_space.values(), FloatParameter, IntegerParameter): - hp_kernels.append("m52") - if has_instance(pipeline_space.values(), CategoricalParameter): - hp_kernels.append("hm") - graph_kernels = [ - instance_from_map(GraphKernelMapping, kernel, "kernel", as_class=True)( - oa=optimal_assignment, - se_kernel=instance_from_map( - StationaryKernelMapping, domain_se_kernel, "se kernel" - ), - ) - for kernel in graph_kernels - ] - hp_kernels = [ - instance_from_map(StationaryKernelMapping, kernel, "kernel") - for kernel in hp_kernels - ] - if not graph_kernels and not hp_kernels: - raise ValueError("No kernels are provided!") - return graph_kernels, hp_kernels diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 2ab0b897..0281cd1a 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import math from collections.abc import Mapping from functools import reduce from typing import TYPE_CHECKING, Any, TypeVar @@ -12,14 +11,13 @@ import gpytorch.constraints import torch from botorch.acquisition.analytic import SingleTaskGP -from botorch.models.gp_regression_mixed import ( - CategoricalKernel, - Likelihood, - OutcomeTransform, +from botorch.models.gp_regression import ( + get_covar_module_with_dim_scaled_prior, ) +from botorch.models.gp_regression_mixed import CategoricalKernel, OutcomeTransform from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed -from gpytorch.kernels import MaternKernel, ScaleKernel +from gpytorch.kernels import ScaleKernel from torch._dynamo.utils import product from neps.search_spaces.encoding import ( @@ -37,111 +35,6 @@ T = TypeVar("T") -def likelihood_with_prior_on_log_scale( - mean: float = 1e-2, - std: float = math.sqrt(3), - bounds: tuple[float, float] = (1e-6, 1), -) -> gpytorch.likelihoods.GaussianLikelihood: - """Default Gaussian likelihood with priors for the noise.""" - # The effect of the likelihood of noise is pretty crucial w.r.t. - # whether we are going to overfit every point by overfitting with - # the lengthscale, or whether we smooth through and assume variation - # is due to noise. Setting it's prior is hard. For a non-noisy - # function, we'd want it looooowww, like 1e-8 kind of low. For - # even a 0.01% noise, we need that all the way up to 1e-2. Hence - # - # If we had 10% noise and we allow the noise to easily optimize towards - # 1e-8, then the lengthscales are forced to become very small, essentially - # overfitting. If we have 0% noise and we don't allow it to easily get low - # then we will drastically underfit. 
- # A guiding principle here is that we should allow the noise to be just - # as if not slightly easier to tune than the lengthscales. I.e. we prefer - # smoother functions as it is easier to acquisition over. However once we - # over smooth and underfit, any new observations that inform us otherwise - # could just be attributed to noise. - # - # TOOD: We may want to move the likelihood inside the GP and decay the - # amount the GP can attribute to noise (reduce std and mean) relative - # to samples seen, effectively reducing the smoothness of the GP overtime - _noise_prior = gpytorch.priors.LogNormalPrior(math.log(mean) + std**2, std) - return gpytorch.likelihoods.GaussianLikelihood( - noise_prior=_noise_prior, - # Going below 1e-6 could introduuce a lot of numerical instability in the - # kernels, even if it's a noiseless function - noise_constraint=gpytorch.constraints.Interval( - lower_bound=bounds[0], - upper_bound=bounds[1], - initial_value=mean, - ), - ) - - -def default_signal_variance_prior() -> gpytorch.priors.NormalPrior: - """Default prior for the signal variance.""" - # The outputscale prior is a bit more tricky. Essentially - # it describes how much we expect the function to move - # around the mean (0 as we normalize the `ys`) - # Based on `Vanilla GP work great in High Dimensions` by Carl Hvafner - # where it's fixed to `1.0`, we follow suit but allow some minor deviation - # with a prior. - return gpytorch.priors.NormalPrior(loc=1.0, scale=0.1) - - -def default_lengthscale_prior( - N: int, -) -> tuple[gpytorch.priors.LogNormalPrior, gpytorch.constraints.Interval]: - """Default prior for the lengthscale.""" - # Based on `Vanilla GP work great in High Dimensions` by Carl Hvafner - # TODO: I'm not convinced entirely that the `std` is independant - # of the dimension and number of samples - lengthscale_prior = gpytorch.priors.LogNormalPrior( - loc=math.sqrt(2.0) + math.log(N) / 2, - scale=math.sqrt(3.0) * math.log(N), - ) - # NOTE: It's possible to just specify `GreaterThan`, however - # digging through the code, if this ends up at botorch's optimize, - # it will read this and take the bounds and give it to Scipy's - # L-BFGS-B optimizer. Without an upper bound, it defaults to `inf`, - # which can impact gradient estimates. - # tldr; set a bound if you have one, it always helps - lengthscale_constraint = gpytorch.constraints.Interval( - lower_bound=1e-4, - upper_bound=1e3, - initial_value=math.sqrt(2.0) + math.log(N) / 2, - ) - return lengthscale_prior, lengthscale_constraint - - -def default_mean() -> gpytorch.means.ConstantMean: - """Default mean for the GP.""" - return gpytorch.means.ConstantMean( - constant_prior=gpytorch.priors.NormalPrior(0, 0.2), - constant_constraint=gpytorch.constraints.Interval( - lower_bound=-1e6, - upper_bound=1e6, - initial_value=0.0, - ), - ) - - -def default_matern_kernel( - N: int, - active_dims: tuple[int, ...] | None = None, -) -> ScaleKernel: - """Default Matern kernel for the GP.""" - lengthscale_prior, lengthscale_constraint = default_lengthscale_prior(N) - - return ScaleKernel( - MaternKernel( - nu=2.5, - ard_num_dims=N, - active_dims=active_dims, - lengthscale_prior=lengthscale_prior, - lengthscale_constraint=lengthscale_constraint, - ), - ) - - def default_categorical_kernel( N: int, active_dims: tuple[int, ...] 
| None = None, @@ -161,14 +54,14 @@ def default_single_obj_gp( x: TensorPack, y: torch.Tensor, *, - outcome_transform: OutcomeTransform | None = None, -) -> tuple[SingleTaskGP, Likelihood]: + y_transform: OutcomeTransform | None = None, +) -> SingleTaskGP: """Default GP for single objective optimization.""" if y.ndim == 1: y = y.unsqueeze(-1) - if outcome_transform is None: - outcome_transform = Standardize(m=1) + if y_transform is None: + y_transform = Standardize(m=1) encoder = x.encoder numerics: list[int] = [] @@ -179,38 +72,24 @@ def default_single_obj_gp( else: numerics.append(encoder.index_of[hp_name]) - # TODO: If we have a low cardinality integer, we should consider - # just treating it as a categorical... - likelihood = likelihood_with_prior_on_log_scale() - # Purely vectorial if len(categoricals) == 0: - gp = SingleTaskGP( - train_X=x.tensor, - train_Y=y, - mean_module=default_mean(), - likelihood=likelihood, - # Only matern kernel - covar_module=default_matern_kernel(len(numerics)), - outcome_transform=outcome_transform, - ) - return gp, likelihood + return SingleTaskGP(train_X=x.tensor, train_Y=y, outcome_transform=y_transform) # Purely categorical if len(numerics) == 0: - gp = SingleTaskGP( + return SingleTaskGP( train_X=x.tensor, train_Y=y, - mean_module=default_mean(), - likelihood=likelihood, - # Only categorical kernel covar_module=default_categorical_kernel(len(categoricals)), - outcome_transform=outcome_transform, + outcome_transform=y_transform, ) - return gp, likelihood # Mixed - numeric_kernel = default_matern_kernel(len(numerics), active_dims=tuple(numerics)) + numeric_kernel = get_covar_module_with_dim_scaled_prior( + ard_num_dims=len(numerics), + active_dims=tuple(numerics), + ) cat_kernel = default_categorical_kernel( len(categoricals), active_dims=tuple(categoricals) ) @@ -223,19 +102,17 @@ def default_single_obj_gp( # # In a toy example with a single binary categorical which acted like F * {0, 1}, # the model collapsed to always predicting `0`. Causing all parameters defining F - # to essentially be guess at random. This is a lot more stable while testing... - # TODO: Figure out why... + # to essentially be guess at random. This is a lot more stable but likely not as + # good... + # TODO: Figure out how to improve stability of this. kernel = numeric_kernel + cat_kernel - gp = SingleTaskGP( + return SingleTaskGP( train_X=x.tensor, train_Y=y, - mean_module=default_mean(), - likelihood=likelihood, covar_module=kernel, - outcome_transform=outcome_transform, + outcome_transform=y_transform, ) - return gp, likelihood def optimize_acq( diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 6fe20655..d9bd10e3 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -312,13 +312,15 @@ def ask( y = _missing_y_strategy(y) # Now fit our model - y_model, y_likelihood = default_single_obj_gp( + y_model = default_single_obj_gp( x, y, # TODO: We should consider applying some heurisitc to see if this should # also include a log transform, similar as we do to cost if using `use_cost`. 
- outcome_transform=Standardize(m=1), + y_transform=Standardize(m=1), ) + y_likelihood = y_model.likelihood + fit_gpytorch_mll( ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) ) @@ -368,16 +370,17 @@ def ask( cost = torch.tensor(costs, dtype=torch.float64, device=self.device) cost_z_score = _missing_cost_strategy(cost) - cost_model, cost_likelihood = default_single_obj_gp( + cost_model = default_single_obj_gp( x, cost_z_score, - outcome_transform=ChainedOutcomeTransform( + y_transform=ChainedOutcomeTransform( # TODO: Maybe some way for a user to specify their cost # is on a log scale? log=Log(), standardize=Standardize(m=1), ), ) + cost_likelihood = cost_model.likelihood # Optimize the cost model fit_gpytorch_mll( From 4e8e30824b059a930380c740ca64752c37b6cacb Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 14:58:58 +0200 Subject: [PATCH 39/63] merge --- neps/__init__.py | 2 - neps/search_spaces/__init__.py | 2 - neps/search_spaces/architecture/api.py | 33 +- neps/search_spaces/architecture/cfg.py | 115 ++- .../cfg_variants/cfg_resolution.py | 385 --------- .../cfg_variants/constrained_cfg.py | 78 +- .../architecture/core_graph_grammar.py | 705 +++------------- neps/search_spaces/architecture/crossover.py | 135 +-- neps/search_spaces/architecture/graph.py | 470 +++-------- .../architecture/graph_grammar.py | 793 ++---------------- neps/search_spaces/architecture/mutations.py | 47 +- neps/search_spaces/architecture/primitives.py | 332 ++++---- neps/search_spaces/architecture/topologies.py | 81 +- ...rs_for_architecture_and_hyperparameters.py | 6 +- .../experimental/hierarchical_architecture.py | 8 +- ...erarchical_architecture_hierarchical_GP.py | 6 +- pyproject.toml | 245 +++--- 17 files changed, 788 insertions(+), 2655 deletions(-) delete mode 100644 neps/search_spaces/architecture/cfg_variants/cfg_resolution.py diff --git a/neps/__init__.py b/neps/__init__.py index b2276ca3..c27257ef 100644 --- a/neps/__init__.py +++ b/neps/__init__.py @@ -8,8 +8,6 @@ FloatParameter, FunctionParameter, GraphGrammar, - GraphGrammarCell, - GraphGrammarRepetitive, IntegerParameter, ) from neps.status.status import get_summary_dict, status diff --git a/neps/search_spaces/__init__.py b/neps/search_spaces/__init__.py index 8a289100..0cfbd9bc 100644 --- a/neps/search_spaces/__init__.py +++ b/neps/search_spaces/__init__.py @@ -2,8 +2,6 @@ from neps.search_spaces.architecture.graph_grammar import ( CoreGraphGrammar, GraphGrammar, - GraphGrammarCell, - GraphGrammarRepetitive, GraphParameter, ) from neps.search_spaces.hyperparameters import ( diff --git a/neps/search_spaces/architecture/api.py b/neps/search_spaces/architecture/api.py index a3af1510..de19a9ef 100644 --- a/neps/search_spaces/architecture/api.py +++ b/neps/search_spaces/architecture/api.py @@ -1,18 +1,20 @@ import inspect -from typing import Callable +from typing import TYPE_CHECKING, Callable import networkx as nx -from torch import nn from .cfg import Grammar from .cfg_variants.constrained_cfg import ConstrainedGrammar -from .graph_grammar import GraphGrammar, GraphGrammarMultipleRepetitive +from .graph_grammar import GraphGrammar + +if TYPE_CHECKING: + from torch import nn def _dict_structure_to_str( - structure: dict, primitives: dict, repetitive_mapping: dict = None + structure: dict, primitives: dict, repetitive_mapping: dict | None = None ) -> str: def _save_replace(string: str, __old: str, __new: str): while string.count(__old) > 0: @@ -25,18 +27,18 @@ def _save_replace(string: str, __old: str, __new: str): 
grammar = grammar.replace("(", " ") grammar = grammar.replace(")", "") grammar = grammar.replace(",", "") - for primitive in primitives.keys(): + for primitive in primitives: grammar = _save_replace(grammar, f" {primitive} ", f' "{primitive}" ') grammar = _save_replace(grammar, f" {primitive}\n", f' "{primitive}"\n') if repetitive_mapping is not None: - for placeholder in repetitive_mapping.keys(): + for placeholder in repetitive_mapping: grammar = _save_replace(grammar, f" {placeholder} ", f' "{placeholder}" ') grammar = _save_replace(grammar, f" {placeholder}\n", f' "{placeholder}"\n') return grammar def _build(graph, set_recursive_attribute): - in_node = [n for n in graph.nodes if graph.in_degree(n) == 0][0] + in_node = next(n for n in graph.nodes if graph.in_degree(n) == 0) for n in nx.topological_sort(graph): for pred in graph.predecessors(n): e = (pred, n) @@ -44,20 +46,17 @@ def _build(graph, set_recursive_attribute): if pred == in_node: predecessor_values = None else: - pred_pred = list(graph.predecessors(pred))[0] + pred_pred = next(iter(graph.predecessors(pred))) predecessor_values = graph.edges[(pred_pred, pred)] graph.edges[e].update(set_recursive_attribute(op_name, predecessor_values)) def ArchitectureParameter(**kwargs): - """Factory function""" - + """Factory function.""" if "structure" not in kwargs: raise ValueError("Factory function requires structure") if not isinstance(kwargs["structure"], list) or len(kwargs["structure"]) == 1: base = GraphGrammar - else: - base = GraphGrammarMultipleRepetitive class _FunctionParameter(base): def __init__( @@ -89,9 +88,9 @@ def __init__( _dict_structure_to_str( st, primitives, - repetitive_mapping=kwargs["terminal_to_sublanguage_map"] - if "terminal_to_sublanguage_map" in kwargs - else None, + repetitive_mapping=kwargs.get( + "terminal_to_sublanguage_map", None + ), ) if isinstance(st, dict) else st @@ -144,9 +143,7 @@ def to_pytorch(self) -> nn.Module: self.prune_graph() if self._set_recursive_attribute: - m = _build( - self, self._set_recursive_attribute - ) + m = _build(self, self._set_recursive_attribute) if m is not None: return m diff --git a/neps/search_spaces/architecture/cfg.py b/neps/search_spaces/architecture/cfg.py index 7e4aa453..958d09f3 100644 --- a/neps/search_spaces/architecture/cfg.py +++ b/neps/search_spaces/architecture/cfg.py @@ -1,25 +1,21 @@ +from __future__ import annotations import itertools import math import sys from collections import defaultdict, deque -from functools import partial -from queue import LifoQueue -from typing import Deque, Tuple, Hashable +from typing import Hashable import numpy as np from nltk import CFG, Production from nltk.grammar import Nonterminal -from scipy.integrate._ivp.radau import P -from torch import Value class Grammar(CFG): - """ - Extended context free grammar (CFG) class from the NLTK python package + """Extended context free grammar (CFG) class from the NLTK python package We have provided functionality to sample from the CFG. We have included generation capability within the class (before it was an external function) - Also allow sampling to return whole trees (not just the string of terminals) + Also allow sampling to return whole trees (not just the string of terminals). """ def __init__(self, *args, **kwargs): @@ -96,7 +92,9 @@ def compute_space_size(self) -> int: int: size of space described by grammar. 
""" - def recursive_worker(nonterminal: Nonterminal, memory_bank: dict = None) -> int: + def recursive_worker( + nonterminal: Nonterminal, memory_bank: dict | None = None + ) -> int: if memory_bank is None: memory_bank = {} @@ -110,7 +108,7 @@ def recursive_worker(nonterminal: Nonterminal, memory_bank: dict = None) -> int: ] possibilities_per_edge = [ memory_bank[str(e_nonterminal)] - if str(e_nonterminal) in memory_bank.keys() + if str(e_nonterminal) in memory_bank else recursive_worker(e_nonterminal, memory_bank) for e_nonterminal in edges_nonterminals ] @@ -165,7 +163,7 @@ def sampler_restricted(self, n, max_length=5, cfactor=0.1, min_length=0): def sampler( self, n=1, - start_symbol: str = None, + start_symbol: str | None = None, user_priors: bool = False, ): # sample n sequences from the CFG @@ -178,24 +176,27 @@ def sampler( # less likely it is to terminate. Therefore, we set the default sampler (setting convergent=True) to # downweight frequent productions when traversing the grammar. # see https://eli.thegreenplace.net/2010/01/28/generating-random-sentences-from-a-context-free-236grammar - if start_symbol is None: - start_symbol = self.start() - else: - start_symbol = Nonterminal(start_symbol) + start_symbol = self.start() if start_symbol is None else Nonterminal(start_symbol) if self.convergent: cfactor = 0.1 return [ f"{self._convergent_sampler(symbol=start_symbol, cfactor=cfactor)[0]})" - for i in range(0, n) + for _ in range(n) ] else: return [ f"{self._sampler(symbol=start_symbol, user_priors=user_priors)})" - for i in range(0, n) + for _ in range(n) ] - def _sampler(self, symbol=None, user_priors: bool = False, *, _cache: dict[Hashable, str] | None = None): + def _sampler( + self, + symbol=None, + user_priors: bool = False, + *, + _cache: dict[Hashable, str] | None = None, + ): # simple sampler where each production is sampled uniformly from all possible productions # Tree choses if return tree or list of terminals # recursive implementation @@ -207,7 +208,7 @@ def _sampler(self, symbol=None, user_priors: bool = False, *, _cache: dict[Hasha # collect possible productions from the starting symbol productions = self.productions(lhs=symbol) # sample - if 0 == len(productions): + if len(productions) == 0: raise Exception(f"Nonterminal {symbol} has no productions!") if user_priors and self._prior is not None: production = choice(productions, probs=self._prior[str(symbol)]) @@ -228,7 +229,7 @@ def _sampler(self, symbol=None, user_priors: bool = False, *, _cache: dict[Hasha return tree - def sampler_maxMin_func(self, symbol: str = None, largest: bool = True): + def sampler_maxMin_func(self, symbol: str | None = None, largest: bool = True): tree = "(" + str(symbol) # collect possible productions from the starting symbol productions = self.productions(lhs=symbol) @@ -242,9 +243,7 @@ def sampler_maxMin_func(self, symbol: str = None, largest: bool = True): tree = tree + " " + self.sampler_maxMin_func(sym, largest=largest) + ")" return tree - def _convergent_sampler( - self, cfactor, symbol=None, pcount=defaultdict(int) - ): + def _convergent_sampler(self, cfactor, symbol=None, pcount=None): # sampler that down-weights the probability of selcting the same production many times # ensuring that the sampled trees are not 'too' long (size to be controlled by cfactor) # @@ -252,6 +251,8 @@ def _convergent_sampler( #:pcount: storage for the productions used in the current branch # init the sequence + if pcount is None: + pcount = defaultdict(int) tree = "(" + str(symbol) # init counter of tree 
depth and number of production rules depth, num_prod = 1, 1 @@ -301,8 +302,7 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: symbols = self.nonterminals + self.terminals q_production_rules: list[tuple[list, int]] = [] non_terminal_productions: dict[str, list[Production]] = { - sym: self.productions(lhs=Nonterminal(sym)) - for sym in self.nonterminals + sym: self.productions(lhs=Nonterminal(sym)) for sym in self.nonterminals } _symbols_by_size = sorted(symbols, key=len, reverse=True) @@ -322,11 +322,11 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: continue # special case: "(" is (part of) a terminal - if string_tree[i - 1: i + 2] != " ( ": + if string_tree[i - 1 : i + 2] != " ( ": i += 1 continue - if char == ")" and not string_tree[i - 1] == " ": + if char == ")" and string_tree[i - 1] != " ": # closing symbol of production production = q_production_rules.pop()[0][0] lhs_production = production.lhs() @@ -336,7 +336,7 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: prior_prob += np.log(self.prior[(lhs_production)][idx] + 1e-15) else: prior_prob *= self.prior[str(lhs_production)][idx] - i+=1 + i += 1 continue _s = string_tree[i : i + _longest] @@ -344,7 +344,9 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: if _s.startswith(sym): break else: - raise RuntimeError(f"Terminal or nonterminal at position {i} does not exist") + raise RuntimeError( + f"Terminal or nonterminal at position {i} does not exist" + ) i += len(sym) - 1 @@ -362,8 +364,7 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: new_productions = [ production for production in _productions - if str(production.rhs()[_count]) - == sym + if str(production.rhs()[_count]) == sym ] q_production_rules[-1] = (new_productions, _count + 1) @@ -378,8 +379,7 @@ def compute_prior(self, string_tree: str, log: bool = True) -> float: return prior_prob def _generate(self, start=None, depth=None, n=None): - """ - see https://www.nltk.org/_modules/nltk/parse/generate.html + """See https://www.nltk.org/_modules/nltk/parse/generate.html Generates an iterator of all sentences from a CFG. :param grammar: The Grammar used to generate sentences. @@ -461,9 +461,7 @@ def mutate( break _patience -= 1 - child = self._remove_empty_spaces(child) - - return child + return self._remove_empty_spaces(child) def crossover( self, @@ -506,7 +504,7 @@ def crossover( return False, False - def rand_subtree(self, tree: str) -> Tuple[str, int]: + def rand_subtree(self, tree: str) -> tuple[str, int]: """Helper function to choose a random subtree in a given parse tree. Runs a single pass through the tree (stored as string) to look for the location of swappable nonterminal symbols. 
@@ -520,7 +518,7 @@ def rand_subtree(self, tree: str) -> Tuple[str, int]: split_tree = tree.split(" ") swappable_indices = [ i - for i in range(0, len(split_tree)) + for i in range(len(split_tree)) if split_tree[i][1:] in self.swappable_nonterminals ] r = np.random.randint(1, len(swappable_indices)) @@ -530,7 +528,7 @@ def rand_subtree(self, tree: str) -> Tuple[str, int]: @staticmethod def rand_subtree_fixed_head( - tree: str, head_node: str, swappable_indices: list = None + tree: str, head_node: str, swappable_indices: list | None = None ) -> int: # helper function to choose a random subtree from a given tree with a specific head node # if no such subtree then return False, otherwise return the index of the subtree @@ -539,7 +537,7 @@ def rand_subtree_fixed_head( if swappable_indices is None: split_tree = tree.split(" ") swappable_indices = [ - i for i in range(0, len(split_tree)) if split_tree[i][1:] == head_node + i for i in range(len(split_tree)) if split_tree[i][1:] == head_node ] if not isinstance(swappable_indices, list): raise TypeError("Expected list for swappable indices!") @@ -553,15 +551,14 @@ def rand_subtree_fixed_head( if len(swappable_indices) > 1 else 0 ) - chosen_non_terminal_index = swappable_indices[r] - return chosen_non_terminal_index + return swappable_indices[r] @staticmethod - def remove_subtree(tree: str, index: int) -> Tuple[str, str, str]: + def remove_subtree(tree: str, index: int) -> tuple[str, str, str]: """Helper functioon to remove a subtree from a parse tree given its index. E.g. '(S (S (T 2)) (ADD +) (T 1))' - becomes '(S (S (T 2)) ', '(T 1))' after removing (ADD +) + becomes '(S (S (T 2)) ', '(T 1))' after removing (ADD +). Args: tree (str): parse tree @@ -613,7 +610,7 @@ def __init__(self, *args, **kwargs): def set_depth_constraints(self, depth_constraints): self.depth_constraints = depth_constraints - if not all(k in self.nonterminals for k in self.depth_constraints.keys()): + if not all(k in self.nonterminals for k in self.depth_constraints): raise Exception( f"Nonterminal {set(self.depth_constraints.keys())-set(self.nonterminals)} does not exist in grammar" ) @@ -625,27 +622,24 @@ def is_depth_constrained(): def sampler( # type: ignore[override] self, n: int = 1, - start_symbol: str = None, - depth_information: dict = None, + start_symbol: str | None = None, + depth_information: dict | None = None, ): if self.depth_constraints is None: raise ValueError("Depth constraints are not set!") - if start_symbol is None: - start_symbol = self.start() - else: - start_symbol = Nonterminal(start_symbol) + start_symbol = self.start() if start_symbol is None else Nonterminal(start_symbol) if depth_information is None: depth_information = {} return [ f"{self._depth_constrained_sampler(symbol=start_symbol, depth_information=depth_information)})" - for i in range(0, n) + for i in range(n) ] def _compute_depth_information_for_pre(self, tree: str) -> dict: depth_information = {nt: 0 for nt in self.nonterminals} - q_nonterminals: Deque = deque() + q_nonterminals: deque = deque() for split in tree.split(" "): if split == "": continue @@ -666,7 +660,7 @@ def _compute_depth_information(self, tree: str) -> tuple: helper_subtree_depth = [0] * len(split_tree) helper_dict_depth_information = {nt: 0 for nt in self.nonterminals} helper_dict_subtree_depth: dict = {nt: deque() for nt in self.nonterminals} - q_nonterminals: Deque = deque() + q_nonterminals: deque = deque() for i, split in enumerate(split_tree): if split == "": continue @@ -692,7 +686,7 @@ def 
_compute_depth_information(self, tree: str) -> tuple: def _compute_max_depth(self, tree: str, subtree_node: str) -> int: max_depth = 0 depth_information = {nt: 0 for nt in self.nonterminals} - q_nonterminals: Deque = deque() + q_nonterminals: deque = deque() for split in tree.split(" "): if split == "": continue @@ -708,19 +702,21 @@ def _compute_max_depth(self, tree: str, subtree_node: str) -> int: split = split[:-1] return max_depth - def _depth_constrained_sampler(self, symbol=None, depth_information: dict = None): + def _depth_constrained_sampler( + self, symbol=None, depth_information: dict | None = None + ): if depth_information is None: depth_information = {} # init the sequence tree = "(" + str(symbol) # collect possible productions from the starting symbol & filter if constraints are violated lhs = str(symbol) - if lhs in depth_information.keys(): + if lhs in depth_information: depth_information[lhs] += 1 else: depth_information[lhs] = 1 if ( - lhs in self.depth_constraints.keys() + lhs in self.depth_constraints and depth_information[lhs] >= self.depth_constraints[lhs] ): productions = [ @@ -769,8 +765,7 @@ def mutate( if parent != child: # ensure that parent is really mutated break _patience -= 1 - child = self._remove_empty_spaces(child) - return child + return self._remove_empty_spaces(child) def crossover( self, diff --git a/neps/search_spaces/architecture/cfg_variants/cfg_resolution.py b/neps/search_spaces/architecture/cfg_variants/cfg_resolution.py deleted file mode 100644 index 5bc8fb5e..00000000 --- a/neps/search_spaces/architecture/cfg_variants/cfg_resolution.py +++ /dev/null @@ -1,385 +0,0 @@ -from collections import deque -from typing import Deque - -import networkx as nx -import numpy as np -from nltk.grammar import Nonterminal - -from ..cfg import Grammar, choice - - -class ResolutionGrammar(Grammar): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.n_downsamples = None - self.terminal_to_graph_map = None - self.downsampling_lhs = None - self.downsample_terminal = None - self.depth_constraints = None - - def set_resolution_constraints( - self, - n_downsamples: int, - terminal_to_graph: dict, - downsampling_lhs: str, - downsample_terminal: str = "downsample", - depth_constraints: dict = None, - ): - self.n_downsamples = n_downsamples - - terminal_to_graph_map: dict = {} - for k, v in terminal_to_graph.items(): - terminal_to_graph_map[k] = {} - terminal_to_graph_map[k]["edge_list"] = v - - G = nx.DiGraph() - G.add_edges_from(v) - src = [n for n, d in G.in_degree() if d == 0][0] - tgt = [n for n, d in G.out_degree() if d == 0][0] - terminal_to_graph_map[k]["paths"] = { - k: [] for k in range(1, nx.dag_longest_path_length(G) + 1) - } - for path in nx.all_simple_edge_paths(G, source=src, target=tgt): - terminal_to_graph_map[k]["paths"][len(path)].append(path[::-1]) - - self.terminal_to_graph_map = terminal_to_graph_map - - self.downsampling_lhs = downsampling_lhs - self.swappable_nonterminals.remove(self.downsampling_lhs) - - self.downsample_terminal = downsample_terminal - - if depth_constraints is not None: - self.depth_constraints = depth_constraints - if not all(k in self.nonterminals for k in self.depth_constraints.keys()): - raise Exception( - f"Nonterminal {set(self.depth_constraints.keys())-set(self.nonterminals)} does not exist in grammar" - ) - else: - self.depth_constraints = {} - - @staticmethod - def is_resolution_constrained(): - return True - - def sampler( - self, - n=1, - start_symbol: str = None, - n_downsamples: int = 
None, - depth_information: dict = None, - ): - if start_symbol is None: - start_symbol = self.start() - else: - start_symbol = Nonterminal(start_symbol) - - if depth_information is None: - depth_information = {} - if n_downsamples is None: - n_downsamples = self.n_downsamples - return [ - f"{self._resolution_constrained_sampler(symbol=start_symbol, n_downsamples=n_downsamples, depth_information=depth_information)})" - for _ in range(n) - ] - - def _compute_depth_information_for_pre(self, tree: str) -> dict: - depth_information = {nt: 0 for nt in self.nonterminals} - q_nonterminals: Deque = deque() - for split in tree.split(" "): - if split == "": - continue - elif split[0] == "(": - q_nonterminals.append(split[1:]) - depth_information[split[1:]] += 1 - continue - while split[-1] == ")": - nt = q_nonterminals.pop() - depth_information[nt] -= 1 - split = split[:-1] - return depth_information - - def _compute_depth_information(self, tree: str) -> tuple: - split_tree = tree.split(" ") - depth_information = [0] * len(split_tree) - subtree_depth = [0] * len(split_tree) - helper_subtree_depth = [0] * len(split_tree) - helper_dict_depth_information = {nt: 0 for nt in self.nonterminals} - helper_dict_subtree_depth: dict = {nt: deque() for nt in self.nonterminals} - q_nonterminals: Deque = deque() - for i, split in enumerate(split_tree): - if split == "": - continue - elif split[0] == "(": - nt = split[1:] - q_nonterminals.append(nt) - depth_information[i] = helper_dict_depth_information[nt] + 1 - helper_dict_depth_information[nt] += 1 - helper_dict_subtree_depth[nt].append(i) - for j in helper_dict_subtree_depth[nt]: - subtree_depth[j] = max(subtree_depth[j], helper_subtree_depth[j] + 1) - helper_subtree_depth[j] += 1 - continue - while split[-1] == ")": - nt = q_nonterminals.pop() - helper_dict_depth_information[nt] -= 1 - for j in helper_dict_subtree_depth[nt]: - helper_subtree_depth[j] -= 1 - _ = helper_dict_subtree_depth[nt].pop() - split = split[:-1] - return depth_information, subtree_depth - - def _compute_max_depth(self, tree: str, subtree_node: str) -> int: - max_depth = 0 - depth_information = {nt: 0 for nt in self.nonterminals} - q_nonterminals: Deque = deque() - for split in tree.split(" "): - if split == "": - continue - elif split[0] == "(": - q_nonterminals.append(split[1:]) - depth_information[split[1:]] += 1 - if split[1:] == subtree_node and depth_information[split[1:]] > max_depth: - max_depth = depth_information[split[1:]] - continue - while split[-1] == ")": - nt = q_nonterminals.pop() - depth_information[nt] -= 1 - split = split[:-1] - return max_depth - - @staticmethod - def assign_downsamples(edge_list, paths, n_downsamples): - if n_downsamples == 0: - return [0] * len(edge_list) - edge_list_to_downsamples = {e: 0 for e in edge_list} - - if max(paths.keys()) >= n_downsamples: - for path in paths[n_downsamples]: - for e in path: - edge_list_to_downsamples[e] = 1 - - for k in reversed(sorted(paths.keys())): - k_paths = paths[k] - if len(k_paths) == 0 or k == n_downsamples: - continue - tmp_indices = list(range(len(k_paths))) - np.random.shuffle(tmp_indices) - for idx in tmp_indices: - path = k_paths[idx] - already_set_n_downsamples = sum(edge_list_to_downsamples[e] for e in path) - if already_set_n_downsamples == n_downsamples: - continue - _path = [e for e in path if edge_list_to_downsamples[e] == 0] - - _n_downsamples = n_downsamples - already_set_n_downsamples - if len(_path) == 1: - edge_list_to_downsamples[path[0]] = _n_downsamples - elif len(_path) < _n_downsamples: - 
indices = np.random.choice( - list(range(len(_path))), - size=n_downsamples // len(_path), - replace=False, - ) - for i, e in enumerate(_path): - edge_list_to_downsamples[e] = ( - n_downsamples // len(_path) + 1 - if i in indices - else n_downsamples // len(_path) - ) - else: - indices = np.random.choice( - list(range(len(_path))), - size=_n_downsamples, - replace=False, - ) - for i in indices: - edge_list_to_downsamples[_path[i]] = 1 - - return [edge_list_to_downsamples[e] for e in edge_list] - - def _resolution_constrained_sampler( - self, symbol=None, n_downsamples: int = 0, depth_information: dict = None - ): - if depth_information is None: - depth_information = {} - - # init the sequence - tree = "(" + str(symbol) - - lhs = str(symbol) - if lhs in depth_information.keys(): - depth_information[lhs] += 1 - else: - depth_information[lhs] = 1 - - # collect possible productions from the starting symbol & filter if constraints are violated - if lhs == self.downsampling_lhs: - productions = [ - production - for production in self.productions(lhs=symbol) - if sum(str(x) == self.downsample_terminal for x in production.rhs()) - == n_downsamples - ] - elif ( - lhs in self.depth_constraints.keys() - and depth_information[lhs] < self.depth_constraints[lhs]["min"]["number"] - ): - productions = [ - production - for production in self.productions(lhs=symbol) - if not ( - len(production.rhs()) == 1 - and str(production.rhs()[0]) - in self.depth_constraints[lhs]["min"]["exclude_rhs"] - ) - ] - elif ( - lhs in self.depth_constraints.keys() - and depth_information[lhs] >= self.depth_constraints[lhs]["max"]["number"] - ): - productions = [ - production - for production in self.productions(lhs=symbol) - if lhs - not in [str(sym) for sym in production.rhs() if not isinstance(sym, str)] - ] - else: - productions = self.productions(lhs=symbol) - - if len(productions) == 0: - raise Exception( - "There can be no word sampled! This is due to the grammar and/or constraints." 
- ) - - # sample - production = choice(productions) - n_downsamples_per_edge = [] - counter = 0 - for sym in production.rhs(): - if isinstance(sym, str): - tree = tree + " " + sym - if sym in self.terminal_to_graph_map.keys(): - n_downsamples_per_edge = self.assign_downsamples( - self.terminal_to_graph_map[sym]["edge_list"], - self.terminal_to_graph_map[sym]["paths"], - n_downsamples, - ) - else: - if counter < len(n_downsamples_per_edge): - _n_downsamples = n_downsamples_per_edge[counter] - elif ( - len(production.rhs()) == 1 - and str(production.rhs()[0]) == self.downsampling_lhs - ): - _n_downsamples = n_downsamples - else: - _n_downsamples = 0 - tree = ( - tree - + " " - + self._resolution_constrained_sampler( - sym, - n_downsamples=_n_downsamples, - depth_information=depth_information, - ) - + ")" - ) - counter += 1 - - depth_information[lhs] -= 1 - return tree - - def mutate( - self, parent: str, subtree_index: int, subtree_node: str, patience: int = 50 - ) -> str: - # chop out subtree - pre, _, post = self.remove_subtree(parent, subtree_index) - _patience = patience - while _patience > 0: - # only sample subtree -> avoids full sampling of large parse trees - depth_information = self._compute_depth_information_for_pre(pre) - new_subtree = self.sampler( - 1, start_symbol=subtree_node, depth_information=depth_information - )[0] - child = pre + new_subtree + post - if parent != child: # ensure that parent is really mutated - break - _patience -= 1 - child = self._remove_empty_spaces(child) - return child - - def crossover( - self, - parent1: str, - parent2: str, - patience: int = 50, - return_crossover_subtrees: bool = False, - ): - # randomly swap subtrees in two trees - # if no suitiable subtree exists then return False - subtree_node, subtree_index = self.rand_subtree(parent1) - # chop out subtree - pre, sub, post = self.remove_subtree(parent1, subtree_index) - head_node_depth = self._compute_depth_information_for_pre(pre)[subtree_node] + 1 - sub_depth = self._compute_max_depth(sub, subtree_node) - _patience = patience - while _patience > 0: - # sample subtree from donor - donor_subtree_index = self._rand_subtree_fixed_head( - parent2, subtree_node, head_node_depth, sub_depth=sub_depth - ) - # if no subtrees with right head node return False - if not donor_subtree_index: - _patience -= 1 - else: - donor_pre, donor_sub, donor_post = self.remove_subtree( - parent2, donor_subtree_index - ) - # return the two new tree - child1 = pre + donor_sub + post - child2 = donor_pre + sub + donor_post - - child1 = self._remove_empty_spaces(child1) - child2 = self._remove_empty_spaces(child2) - - if return_crossover_subtrees: - return ( - child1, - child2, - (pre, sub, post), - (donor_pre, donor_sub, donor_post), - ) - - return child1, child2 - - return False, False - - def _rand_subtree_fixed_head( - self, - tree: str, - head_node: str, - head_node_depth: int = 0, - sub_depth: int = 0, - ) -> int: - # helper function to choose a random subtree from a given tree with a specific head node - # if no such subtree then return False, otherwise return the index of the subtree - - # single pass through tree (stored as string) to look for the location of swappable_non_terminmals - if head_node in self.depth_constraints: - depth_information, subtree_depth = self._compute_depth_information(tree) - split_tree = tree.split(" ") - swappable_indices = [ - i - for i in range(len(split_tree)) - if split_tree[i][1:] == head_node - and head_node_depth - 1 + subtree_depth[i] - <= self.depth_constraints[head_node] - 
and depth_information[i] - 1 + sub_depth - <= self.depth_constraints[head_node] - ] - else: - swappable_indices = None - return super().rand_subtree_fixed_head( - tree=tree, head_node=head_node, swappable_indices=swappable_indices - ) diff --git a/neps/search_spaces/architecture/cfg_variants/constrained_cfg.py b/neps/search_spaces/architecture/cfg_variants/constrained_cfg.py index a79ce212..00564835 100644 --- a/neps/search_spaces/architecture/cfg_variants/constrained_cfg.py +++ b/neps/search_spaces/architecture/cfg_variants/constrained_cfg.py @@ -6,16 +6,15 @@ from copy import deepcopy from functools import partial from queue import LifoQueue -from typing import Deque import numpy as np from nltk.grammar import Nonterminal -from ..cfg import Grammar, choice +from neps.search_spaces.architecture.cfg import Grammar, choice class Constraint: - def __init__(self, current_derivation: str = None) -> None: + def __init__(self, current_derivation: str | None = None) -> None: self.current_derivation = current_derivation @staticmethod @@ -46,7 +45,7 @@ def __init__(self, *args, **kwargs): self._prior: dict = None - def set_constraints(self, constraints: dict, none_operation: str = None): + def set_constraints(self, constraints: dict, none_operation: str | None = None): self.constraints = constraints self.none_operation = none_operation self.constraint_is_class = isinstance(self.constraints, Constraint) @@ -85,14 +84,11 @@ def _check_prior(value: dict): def sampler( # type: ignore[override] self, n=1, - start_symbol: str = None, + start_symbol: str | None = None, not_allowed_productions=None, user_priors: bool = False, ): - if start_symbol is None: - start_symbol = self.start() - else: - start_symbol = Nonterminal(start_symbol) + start_symbol = self.start() if start_symbol is None else Nonterminal(start_symbol) return [ self._constrained_sampler( @@ -154,7 +150,7 @@ def _constrained_sampler( probs = [p for i, p in enumerate(probs) if i not in not_allowed_indices] # rescale s.t. probs sum up to one cur_prob_sum = sum(probs) - probs = list(map(lambda x: x / cur_prob_sum, probs)) + probs = [x / cur_prob_sum for x in probs] assert len(probs) == len(productions) production = choice(productions, probs=probs) @@ -216,9 +212,7 @@ def skip_char(char: str) -> bool: and string_tree[i + 1] == " " ): return False - if char == "(": - return True - return False + return char == "(" def find_longest_match( i: int, string_tree: str, symbols: list, max_match: int @@ -253,7 +247,7 @@ def find_longest_match( char = string_tree[i] if skip_char(char): pass - elif char == ")" and not string_tree[i - 1] == " ": + elif char == ")" and string_tree[i - 1] != " ": # closing symbol of production production = q_production_rules.get(block=False)[0][0] idx = self.productions(production.lhs()).index(production) @@ -264,9 +258,9 @@ def find_longest_match( ): outer_production = q_production_rules.queue[-1][0][0] if len(q_production_rules.queue) not in current_derivations: - current_derivations[ - len(q_production_rules.queue) - ] = self.constraints(outer_production.rhs()[0]) + current_derivations[len(q_production_rules.queue)] = ( + self.constraints(outer_production.rhs()[0]) + ) context_information = self.constraints( outer_production.rhs()[0], current_derivations[len(q_production_rules.queue)], @@ -303,7 +297,7 @@ def find_longest_match( ] # rescale s.t. 
prior sum up to one cur_prob_sum = sum(prior) - prior = list(map(lambda x: x / cur_prob_sum, prior)) + prior = [x / cur_prob_sum for x in prior] idx -= sum(idx > i for i in not_allowed_indices) prior = prior[idx] @@ -343,7 +337,7 @@ def find_longest_match( return prior_prob def _compute_current_context(self, pre_subtree: str, post_subtree: str): - q_nonterminals: Deque = deque() + q_nonterminals: deque = deque() for sym in pre_subtree.split(" "): if sym == "": continue @@ -371,7 +365,7 @@ def _compute_current_context(self, pre_subtree: str, post_subtree: str): if len(productions) == 0: raise Exception("Cannot find corresponding production!") - q_context: Deque = deque() + q_context: deque = deque() current_derivation = [] rhs_counter = 0 tmp_str = "" @@ -476,9 +470,9 @@ def mutate( not_allowed_productions = self._get_not_allowed_productions( self.productions(lhs=Nonterminal(subtree_node)), context_information[ - [i for i, cd in enumerate(current_derivation) if cd is None][ - 0 - ] + next( + i for i, cd in enumerate(current_derivation) if cd is None + ) ], ) elif isinstance(context_information, bool): @@ -508,8 +502,7 @@ def mutate( ): break _patience -= 1 - child = self._remove_empty_spaces(child) - return child + return self._remove_empty_spaces(child) def crossover( self, @@ -534,7 +527,7 @@ def crossover( parent1_not_allowed_productions = self._get_not_allowed_productions( self.productions(lhs=Nonterminal(subtree_node)), context_information[ - [i for i, cd in enumerate(current_derivation) if cd is None][0] + next(i for i, cd in enumerate(current_derivation) if cd is None) ], ) elif isinstance(context_information, bool): @@ -570,11 +563,11 @@ def crossover( self._get_not_allowed_productions( self.productions(lhs=Nonterminal(subtree_node)), context_information[ - [ + next( i for i, cd in enumerate(current_derivation) if cd is None - ][0] + ) ], ) ) @@ -637,7 +630,9 @@ def compute_space_size(self) -> int: int: size of space described by grammar. 
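# Illustrative sketch (not from the patch) of two rewrite patterns used in the
# hunks above: indexing a full list comprehension with [0] becomes next() over
# a generator, which stops at the first match, and map/lambda rescaling becomes
# a list comprehension.  The values below are made up for the example.
current_derivation = ["(op1)", None, "(op2)", None]

first_none_old = [i for i, cd in enumerate(current_derivation) if cd is None][0]
first_none_new = next(i for i, cd in enumerate(current_derivation) if cd is None)
assert first_none_old == first_none_new == 1

probs = [0.2, 0.3, 0.1]
cur_prob_sum = sum(probs)
probs = [x / cur_prob_sum for x in probs]
assert abs(sum(probs) - 1.0) < 1e-12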
""" - def recursive_worker(nonterminal: Nonterminal, memory_bank: dict = None) -> int: + def recursive_worker( + nonterminal: Nonterminal, memory_bank: dict | None = None + ) -> int: def _get_all_variants(production): variants = [production] nonterminals = [ @@ -715,18 +710,17 @@ def _get_all_variants(production): potential_production ) ) - else: - if any( - production.rhs()[0] == self.none_operation - for nonterminal in nonterminals - for production in self.productions(nonterminal) - ): - potential_productions += _get_all_variants(potential_production) - elif not ( - len(potential_production.rhs()) == 1 - and potential_production.rhs()[0] == self.none_operation - ): - potential_productions.append(potential_production) + elif any( + production.rhs()[0] == self.none_operation + for nonterminal in nonterminals + for production in self.productions(nonterminal) + ): + potential_productions += _get_all_variants(potential_production) + elif not ( + len(potential_production.rhs()) == 1 + and potential_production.rhs()[0] == self.none_operation + ): + potential_productions.append(potential_production) _possibilites = 0 for potential_production in potential_productions: nonterminals = [ @@ -736,7 +730,7 @@ def _get_all_variants(production): ] possibilities_per_edge = [ memory_bank[str(e_nonterminal)] - if str(e_nonterminal) in memory_bank.keys() + if str(e_nonterminal) in memory_bank else recursive_worker(e_nonterminal, memory_bank) for e_nonterminal in nonterminals ] diff --git a/neps/search_spaces/architecture/core_graph_grammar.py b/neps/search_spaces/architecture/core_graph_grammar.py index 17323b48..f0862882 100644 --- a/neps/search_spaces/architecture/core_graph_grammar.py +++ b/neps/search_spaces/architecture/core_graph_grammar.py @@ -6,7 +6,6 @@ from abc import abstractmethod from copy import deepcopy from functools import partial -from typing import Deque import networkx as nx import numpy as np @@ -29,11 +28,14 @@ def get_edge_lists_of_topologies(terminal_map: dict) -> dict: if is_topology: if isinstance(v, partial): if hasattr(v.func, "get_edge_list"): - func_args = inspect.getfullargspec(v.func.get_edge_list).args # type: ignore[attr-defined] + func_args = inspect.getfullargspec( + v.func.get_edge_list).args # type: ignore[attr-defined] kwargs = {k: v for k, v in v.keywords.items() if k in func_args} - topology_edge_lists[k] = v.func.get_edge_list(**kwargs) # type: ignore[attr-defined] + topology_edge_lists[k] = v.func.get_edge_list( + **kwargs) # type: ignore[attr-defined] elif hasattr(v.func, "edge_list"): - topology_edge_lists[k] = v.func.edge_list # type: ignore[attr-defined] + topology_edge_lists[ + k] = v.func.edge_list # type: ignore[attr-defined] else: raise Exception( f"Please implement a get_edge_list static method for {v.func.__name__} or set edge_list!" 
@@ -48,13 +50,13 @@ def __init__( self, grammars: list[Grammar] | Grammar, terminal_to_op_names: dict, - terminal_to_graph_edges: dict = None, + terminal_to_graph_edges: dict | None = None, edge_attr: bool = True, edge_label: str = "op_name", - zero_op: list = None, - identity_op: list = None, - name: str = None, - scope: str = None, + zero_op: list | None = None, + identity_op: list | None = None, + name: str | None = None, + scope: str | None = None, return_all_subgraphs: bool = False, return_graph_per_hierarchy: bool = False, ): @@ -95,7 +97,7 @@ def get_grammars(self) -> list[Grammar]: def clear_graph(self): while len(self.nodes()) != 0: - self.remove_node(list(self.nodes())[0]) + self.remove_node(next(iter(self.nodes()))) @abstractmethod def id_to_string_tree(self, identifier: str): @@ -124,7 +126,7 @@ def prune_tree( terminal_to_torch_map_keys: collections.abc.KeysView, node_label: str = "op_name", ) -> nx.DiGraph: - """Prunes unnecessary parts of parse tree, i.e., only one child + """Prunes unnecessary parts of parse tree, i.e., only one child. Args: tree (nx.DiGraph): Parse tree @@ -154,11 +156,11 @@ def dfs(visited: set, tree: nx.DiGraph, node: int) -> nx.DiGraph: if len(predecessor) > 0: tree.add_edge(predecessor[0], tree.nodes[node]["children"][0]) old_children = tree.nodes[predecessor[0]]["children"] - idx = [i for i, c in enumerate(old_children) if c == node][0] + idx = next(i for i, c in enumerate(old_children) if c == node) tree.nodes[predecessor[0]]["children"] = ( old_children[: idx + 1] + [tree.nodes[node]["children"][0]] - + old_children[idx + 1 :] + + old_children[idx + 1:] ) tree.nodes[predecessor[0]]["children"].remove(node) @@ -167,474 +169,16 @@ def dfs(visited: set, tree: nx.DiGraph, node: int) -> nx.DiGraph: tree.nodes[node]["terminal"] and tree.nodes[node][node_label] not in terminal_to_torch_map_keys ): - predecessor = list(tree.pred[node])[0] + predecessor = next(iter(tree.pred[node])) tree.nodes[predecessor]["children"].remove(node) tree.remove_node(node) return tree return dfs(set(), tree, self._find_root(tree)) - @staticmethod - def _dfs_preorder_nodes(G: nx.DiGraph, source: str = None) -> list[int]: - """Generates nodes in DFS pre-ordering starting at source. - Note that after pruning we cannot reconstruct the associated string tree! - - Args: - G (nx.DiGraph): NetworkX DAG - source (str, optional): Starting node for DFS. Defaults to None. - - Returns: - generator: List of nodes in a DFS pre-ordering. - """ - edges = nx.dfs_labeled_edges(G, source=source) - return list(v for _, v, d in edges if d == "forward") - - @staticmethod - def _find_leafnodes(G): - leafnode = [] - for i in G.nodes: - head = [] - if nx.descendants(G, i) == set(): # find all leaf nodes - for a in nx.ancestors(G, i): # get all ancestors for leaf node - if ( - nx.ancestors(G, a) == set() - ): # Determine if ancestor is a head node - head.append(a) - if len(head) == 1: # if this leaf had only one head then append to leafnode - leafnode.append(i) - return leafnode - - @staticmethod - def _get_neighbors_from_parse_tree(tree: nx.DiGraph, node: int) -> list[int]: - return tree.nodes[node]["children"] - @staticmethod def _find_root(G): - return [n for n, d in G.in_degree() if d == 0][0] - - @staticmethod - def _relabel_nodes(G: nx.DiGraph, mapping: dict) -> nx.DiGraph: - """Relabels the nodes and adjusts children list accordingly. 
- - Args: - G (nx.DiGraph): graph to relabel - mapping (dict): node mapping - - Returns: - nx.DiGraph: relabeled graph (copied) - """ - # recreation of graph is faster - tree_relabeled = nx.DiGraph() - tree_relabeled.add_nodes_from( - [ - ( - mapping[n[0]], - { - k: v if k != "children" else [mapping[_n] for _n in v] - for k, v in n[1].items() - }, - ) - for n in G.nodes(data=True) - ] - ) - tree_relabeled.add_edges_from([(mapping[e[0]], mapping[e[1]]) for e in G.edges()]) - return tree_relabeled - - def assemble_trees( - self, - base_tree: str | nx.DiGraph, - motif_trees: list[str] | list[nx.DiGraph], - terminal_to_sublanguage_map: dict = None, - node_label: str = "op_name", - ) -> str | nx.DiGraph: - """Assembles the base parse tree with the motif parse trees - - Args: - base_tree (nx.DiGraph): Base parse tree - motif_trees (List[nx.DiGraph]): List of motif parse trees - node_label (str, optional): node label key. Defaults to "op_name". - - Returns: - nx.DiGraph: Assembled parse tree - """ - if not all([isinstance(base_tree, type(tree)) for tree in motif_trees]): - raise ValueError("All trees must be of the same type!") - if isinstance(base_tree, str): - ensembled_tree_string = base_tree - if terminal_to_sublanguage_map is None: - raise NotImplementedError - - for motif, replacement in zip( - terminal_to_sublanguage_map.keys(), motif_trees - ): - if motif in ensembled_tree_string: - ensembled_tree_string = ensembled_tree_string.replace( - motif, replacement - ) - return ensembled_tree_string - elif isinstance(base_tree, nx.DiGraph): - raise NotImplementedError - leafnodes = self._find_leafnodes(base_tree) - root_nodes = [self._find_root(G) for G in motif_trees] - root_op_names = np.array( - [ - motif_tree.nodes[root_node][node_label] - for motif_tree, root_node in zip(motif_trees, root_nodes) - ] - ) - largest_node_number = max(base_tree.nodes()) - # ensembled_tree = base_tree.copy() - # recreation is slightly faster - ensembled_tree: nx.DiGraph = nx.DiGraph() - ensembled_tree.add_nodes_from(base_tree.nodes(data=True)) - ensembled_tree.add_edges_from(base_tree.edges()) - for leafnode in leafnodes: - idx = np.where(base_tree.nodes[leafnode][node_label] == root_op_names)[0] - if len(idx) == 0: - continue - if len(idx) > 1: - raise ValueError( - "More than two similar terminal/start symbols are not supported!" 
- ) - - tree = motif_trees[idx[0]] - # generate mapping - mapping = { - n: n_new - for n, n_new in zip( - tree.nodes(), - range( - largest_node_number + 1, - largest_node_number + 1 + len(tree), - ), - ) - } - largest_node_number = largest_node_number + 1 + len(tree) - tree_relabeled = self._relabel_nodes(G=tree, mapping=mapping) - - # compose trees - predecessor_in_base_tree = list(ensembled_tree.pred[leafnode])[0] - motif_tree_root_node = self._find_root(tree_relabeled) - successors_in_motif_tree = tree_relabeled.nodes[motif_tree_root_node][ - "children" - ] - - # delete unnecessary edges - ensembled_tree.remove_node(leafnode) - tree_relabeled.remove_node(motif_tree_root_node) - # add new edges - tree_relabeled.add_node(predecessor_in_base_tree) - for n in successors_in_motif_tree: - tree_relabeled.add_edge(predecessor_in_base_tree, n) - - ensembled_tree.update( - edges=tree_relabeled.edges(data=True), - nodes=tree_relabeled.nodes(data=True), - ) - - idx = np.where( - np.array(ensembled_tree.nodes[predecessor_in_base_tree]["children"]) - == leafnode - )[0][0] - old_children = ensembled_tree.nodes[predecessor_in_base_tree]["children"] - ensembled_tree.nodes[predecessor_in_base_tree]["children"] = ( - old_children[: idx + 1] - + successors_in_motif_tree - + old_children[idx + 1 :] - ) - ensembled_tree.nodes[predecessor_in_base_tree]["children"].remove( - leafnode - ) - return ensembled_tree - else: - raise NotImplementedError( - f"Assembling of trees of type {type(base_tree)} is not supported!" - ) - - def build_graph_from_tree( - self, - tree: nx.DiGraph, - terminal_to_torch_map: dict, - node_label: str = "op_name", - flatten_graph: bool = True, - return_cell: bool = False, - ) -> None | Graph: - """Builds the computational graph from a parse tree. - - Args: - tree (nx.DiGraph): parse tree. - terminal_to_torch_map (dict): Mapping from terminal symbols to primitives or topologies. - node_label (str, optional): Key to access terminal symbol. Defaults to "op_name". - return_cell (bool, optional): Whether to return a cell. Is only needed if cell is repeated multiple times. - Defaults to False. - - Returns: - Tuple[Union[None, Graph]]: computational graph (self) or cell. - """ - - def _build_graph_from_tree( - visited: set, - tree: nx.DiGraph, - node: int, - terminal_to_torch_map: dict, - node_label: str, - is_primitive: bool = False, - ): - """Recursive DFS-esque function to build computational graph from parse tree - - Args: - visited (set): set of visited nodes. - tree (nx.DiGraph): parse tree. - node (int): node index. - terminal_to_torch_map (dict): mapping from terminal symbols to primitives or topologies. - node_label (str): key to access operation name - - Raises: - Exception: primitive or topology is unknown, i.e., it is probably missing in the terminal to - torch mapping - Exception: leftmost children can only be primitive, topology or have one child - - Returns: - [type]: computational graph. 
- """ - if node not in visited: - subgraphs = [] - primitive_hps = [] - if len(tree.out_edges(node)) == 0: - if is_primitive: - return tree.nodes[node][node_label] - else: - if ( - tree.nodes[node][node_label] - not in terminal_to_torch_map.keys() - ): - raise Exception( - f"Unknown primitive or topology: {tree.nodes[node][node_label]}" - ) - return deepcopy( - terminal_to_torch_map[tree.nodes[node][node_label]] - ) - if len(tree.out_edges(node)) == 1: - return _build_graph_from_tree( - visited, - tree, - list(tree.neighbors(node))[0], - terminal_to_torch_map, - node_label, - is_primitive, - ) - # for idx, neighbor in enumerate(tree.neighbors(node)): - for idx, neighbor in enumerate( - self._get_neighbors_from_parse_tree(tree, node) - ): - if idx == 0: # topology or primitive - n = neighbor - while not tree.nodes[n]["terminal"]: - if len(tree.out_edges(n)) != 1: - raise Exception( - "Leftmost Child can only be primitive, topology or recursively have one child!" - ) - n = next(tree.neighbors(n)) - if is_primitive: - primitive_hp_key = tree.nodes[n][node_label] - primitive_hp_dict = {primitive_hp_key: None} - is_primitive_op = True - else: - if ( - tree.nodes[n][node_label] - not in terminal_to_torch_map.keys() - ): - raise Exception( - f"Unknown primitive or topology: {tree.nodes[n][node_label]}" - ) - graph_el = terminal_to_torch_map[tree.nodes[n][node_label]] - is_primitive_op = issubclass( - graph_el.func - if isinstance(graph_el, partial) - else graph_el, - AbstractPrimitive, - ) - elif not tree.nodes[neighbor][ - "terminal" - ]: # exclude '[' ']' ... symbols - if is_primitive: - primitive_hp_dict[primitive_hp_key] = _build_graph_from_tree( - visited, - tree, - neighbor, - terminal_to_torch_map, - node_label, - is_primitive_op, - ) - elif is_primitive_op: - primitive_hps.append( - _build_graph_from_tree( - visited, - tree, - neighbor, - terminal_to_torch_map, - node_label, - is_primitive_op, - ) - ) - else: - subgraphs.append( - _build_graph_from_tree( - visited, - tree, - neighbor, - terminal_to_torch_map, - node_label, - is_primitive_op, - ) - ) - elif ( - tree.nodes[neighbor][node_label] in terminal_to_torch_map.keys() - ): # exclude '[' ']' ... symbols - # TODO check if there is a potential bug here? 
- subgraphs.append( - deepcopy( - terminal_to_torch_map[tree.nodes[neighbor][node_label]] - ) - ) - - if is_primitive: - return primitive_hp_dict - elif is_primitive_op: - return dict( - collections.ChainMap(*([{"op": graph_el}] + primitive_hps)) - ) - else: - return graph_el(*subgraphs) - - def _flatten_graph( - graph, - flattened_graph, - start_node: int = None, - end_node: int = None, - ): - nodes: dict = {} - for u, v, data in graph.edges(data=True): - if u in nodes.keys(): - _u = nodes[u] - else: - _u = ( - 1 - if len(flattened_graph.nodes.keys()) == 0 - else max(flattened_graph.nodes.keys()) + 1 - ) - _u = ( - start_node - if graph.in_degree(u) == 0 and start_node is not None - else _u - ) - nodes[u] = _u - if _u not in flattened_graph.nodes.keys(): - flattened_graph.add_node(_u) - - if v in nodes.keys(): - _v = nodes[v] - else: - _v = max(flattened_graph.nodes.keys()) + 1 - _v = ( - end_node - if graph.out_degree(v) == 0 and end_node is not None - else _v - ) - nodes[v] = _v - if _v not in flattened_graph.nodes.keys(): - flattened_graph.add_node(_v) - - if isinstance(data["op"], Graph): - flattened_graph = _flatten_graph( - data["op"], flattened_graph, start_node=_u, end_node=_v - ) - else: - flattened_graph.add_edge(_u, _v) - flattened_graph.edges[_u, _v].update(data) - - return flattened_graph - - root_node = self._find_root(tree) - graph = _build_graph_from_tree( - set(), tree, root_node, terminal_to_torch_map, node_label - ) - self._check_graph(graph) - if return_cell: - cell = ( - _flatten_graph(graph, flattened_graph=Graph()) if flatten_graph else graph - ) - return cell - else: - if flatten_graph: - _flatten_graph(graph, flattened_graph=self) - else: - self.add_edge(0, 1) - self.edges[0, 1].set("op", graph) - return None - - def to_graph_repr(self, graph: Graph, edge_attr: bool) -> nx.DiGraph: - """Transforms NASLib-esque graph to NetworkX graph. - - Args: - graph (Graph): NASLib-esque graph. - edge_attr (bool): Transform to edge attribution or node attribution. - - Returns: - nx.DiGraph: edge- or node-attributed representation of computational graph. 
- """ - if edge_attr: - g = nx.DiGraph() - g.add_nodes_from(graph.nodes()) - for u, v in graph.edges(): - if isinstance(graph.edges[u, v]["op"], Graph): - g.add_edge(u, v, op_name=graph.edges[u, v]["op"].name) - else: - g.add_edge( - u, v, **{self.edge_label: graph.edges[u, v][self.edge_label]} - ) - g.graph_type = "edge_attr" - else: - g = nx.DiGraph() - src = [n for n in graph.nodes() if graph.in_degree(n) == 0][0] - tgt = [n for n in graph.nodes() if graph.out_degree(n) == 0][0] - nof_edges = graph.size() - g.add_nodes_from( - [ - (0, {self.edge_label: "input"}), - (nof_edges + 1, {self.edge_label: "output"}), - ] - ) - node_counter = 1 - open_edge: dict = {} - for node in nx.topological_sort(graph): - for edge in graph.out_edges(node): - g.add_node( - node_counter, - **{self.edge_label: graph.edges[edge][self.edge_label]}, - ) - - u, v = edge - if u == src: # special case for input node - g.add_edge(0, node_counter) - if v == tgt: # special case of output node - g.add_edge(node_counter, nof_edges + 1) - if ( - u in open_edge.keys() - ): # add edge between already seen nodes and new node - for node_count in open_edge[u]: - g.add_edge(node_count, node_counter) - - if v in open_edge.keys(): - open_edge[v].append(node_counter) - else: - open_edge[v] = [node_counter] - node_counter += 1 - g.graph_type = "node_attr" - - self._check_graph(g) - - return g + return next(n for n, d in G.in_degree() if d == 0) @staticmethod def from_stringTree_to_nxTree( @@ -662,9 +206,7 @@ def skip_char(char: str) -> bool: and string_tree[i + 1] == " " ): return False - if char == "(": - return True - return False + return char == "(" def find_longest_match( i: int, string_tree: str, symbols: list[str], max_match: int @@ -710,7 +252,7 @@ def find_longest_match( char = string_tree[i] if skip_char(char): pass - elif char == ")" and not string_tree[i - 1] == " ": + elif char == ")" and string_tree[i - 1] != " ": # closing symbol of production _node_number = q.get(block=False) _node_children = q_children.get(block=False) @@ -740,34 +282,6 @@ def find_longest_match( raise Exception("Invalid string_tree") return G - def from_nxTree_to_stringTree( - self, nxTree: nx.DiGraph, node_label: str = "op_name" - ) -> str: - """Transforms parse tree represented as NetworkX DAG to string representation. - - Args: - nxTree (nx.DiGraph): parse tree. - node_label (str, optional): key to access operation names. Defaults to "op_name". - - Returns: - str: parse tree represented as string. - """ - - def dfs(visited, graph, node): - if node not in visited: - visited.add(node) - if graph.nodes[node]["terminal"]: - return f"{graph.nodes[node][node_label]}" - tmp_str = f"{f'({graph.nodes[node][node_label]}'}" + " " - # for neighbor in graph.neighbors(node): - for neighbor in self._get_neighbors_from_parse_tree(graph, node): - tmp_str += dfs(visited, graph, neighbor) + " " - tmp_str = tmp_str[:-1] + ")" - return tmp_str - return "" - - return dfs(set(), nxTree, node=self._find_root(nxTree)) - def update_op_names(self): # update op names for u, v in self.edges(): @@ -785,8 +299,8 @@ def from_stringTree_to_graph_repr( sym_name: str = "op_name", prune: bool = True, add_subtree_map: bool = False, - return_all_subgraphs: bool = None, - return_graph_per_hierarchy: bool = None, + return_all_subgraphs: bool | None = None, + return_graph_per_hierarchy: bool | None = None, ) -> nx.DiGraph | tuple[nx.DiGraph, collections.OrderedDict]: """Generates graph from parse tree in string representation. Note that we ignore primitive HPs! 
@@ -821,17 +335,17 @@ def get_node_labels(graph: nx.DiGraph): def get_hierarchicy_dict( string_tree: str, subgraphs: dict, - hierarchy_dict: dict = None, + hierarchy_dict: dict | None = None, hierarchy_level_counter: int = 0, ): if hierarchy_dict is None: hierarchy_dict = {} - if hierarchy_level_counter not in hierarchy_dict.keys(): + if hierarchy_level_counter not in hierarchy_dict: hierarchy_dict[hierarchy_level_counter] = [] hierarchy_dict[hierarchy_level_counter].append(string_tree) node_labels = get_node_labels(subgraphs[string_tree]) for _, node_label in node_labels: - if node_label in subgraphs.keys(): + if node_label in subgraphs: hierarchy_dict = get_hierarchicy_dict( node_label, subgraphs, hierarchy_dict, hierarchy_level_counter + 1 ) @@ -916,15 +430,13 @@ def to_node_attributed_edge_list( if v == tgt: node_list.append((ni, 1)) - for e_ in filter( - lambda e: (e[1] == u), edge_list - ): + for e_ in filter(lambda e: (e[1] == u), edge_list): node_list.append((edge_to_node_map[e_], ni)) return node_list, edge_to_node_map def skip_char(char: str) -> bool: - return True if char in [" ", "\t", "\n", "[", "]"] else False + return char in [" ", "\t", "\n", "[", "]"] if prune: add_subtree_map = False @@ -937,14 +449,14 @@ def skip_char(char: str) -> bool: G = nx.DiGraph() if add_subtree_map: - q_nonterminals: Deque = collections.deque() + q_nonterminals: collections.deque = collections.deque() if compute_subgraphs: - q_subtrees: Deque = collections.deque() - q_subgraphs: Deque = collections.deque() + q_subtrees: collections.deque = collections.deque() + q_subgraphs: collections.deque = collections.deque() subgraphs_dict = collections.OrderedDict() if edge_attr: node_offset = 0 - q_el: Deque = collections.deque() # edge-attr + q_el: collections.deque = collections.deque() # edge-attr terminal_to_graph = self.terminal_to_graph_edges else: # node-attributed G.add_node(0, **{sym_name: "input"}) @@ -1137,11 +649,11 @@ def skip_char(char: str) -> bool: [ (n_in, n_out) for n_in in q_subgraphs[-1]["graph"].predecessors( - n - ) + n + ) for n_out in q_subgraphs[-1]["graph"].successors( - n - ) + n + ) ] ) q_subgraphs[-1]["graph"].remove_node(n) @@ -1261,16 +773,18 @@ def get_graph_representation( ) -> nx.DiGraph: """This functions takes an identifier and constructs the (multi-variate) composition of the functions it describes. + Args: identifier (str): identifier grammar (Grammar): grammar flatten_graph (bool, optional): Whether to flatten the graph. Defaults to True. + Returns: - nx.DiGraph: (multi-variate) composition of functions + nx.DiGraph: (multi-variate) composition of functions. 
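# Illustrative sketch (not from the patch) of the boolean-return simplification
# used above: helpers of the form "return True if <cond> else False" now return
# the condition directly.  skip_char mirrors the helper in the hunk, with no
# behaviour change.
def skip_char(char: str) -> bool:
    return char in [" ", "\t", "\n", "[", "]"]


assert skip_char(" ") and skip_char("[") and skip_char("\n")
assert not skip_char("(") and not skip_char("a")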
""" def _skip_char(char: str) -> bool: - return True if char in [" ", "\t", "\n", "[", "]"] else False + return char in [" ", "\t", "\n", "[", "]"] def _get_sym_from_split(split: str) -> str: start_idx, end_idx = 0, len(split) @@ -1298,9 +812,7 @@ def to_node_attributed_edge_list( if v in tgt: node_list.append((ni, v)) - for e_ in filter( - lambda e: (e[1] == u), edge_list - ): + for e_ in filter(lambda e: (e[1] == u), edge_list): node_list.append((edge_to_node_map[e_], ni)) return node_list, edge_to_node_map @@ -1329,12 +841,11 @@ def to_node_attributed_edge_list( if sym in grammar.terminals: is_topology = False - if inspect.isclass(self.terminal_to_op_names[sym]) and issubclass( - self.terminal_to_op_names[sym], AbstractTopology - ): - is_topology = True - elif isinstance(self.terminal_to_op_names[sym], partial) and issubclass( - self.terminal_to_op_names[sym].func, AbstractTopology + if ( + inspect.isclass(self.terminal_to_op_names[sym]) + and issubclass(self.terminal_to_op_names[sym], AbstractTopology) + or isinstance(self.terminal_to_op_names[sym], partial) + and issubclass(self.terminal_to_op_names[sym].func, AbstractTopology) ): is_topology = True @@ -1354,16 +865,13 @@ def to_node_attributed_edge_list( if q_nonterminals.qsize() == q_topologies.qsize(): topology, number_of_primitives = q_topologies.get(block=False) primitives = [ - q_primitives.get(block=False) - for _ in range(number_of_primitives) - ][::-1] + q_primitives.get(block=False) + for _ in range(number_of_primitives) + ][::-1] if ( topology in terminal_to_graph and terminal_to_graph[topology] is not None - ): - raise NotImplementedError - # edges = terminal_to_graph[topology] - elif isinstance(topology, partial): + ) or isinstance(topology, partial): raise NotImplementedError else: composed_function = topology(*primitives) @@ -1435,27 +943,27 @@ def prune_graph(self, graph: nx.DiGraph | Graph = None, edge_attr: bool = True): graph.remove_edges_from(remove_edge_list) else: for n in list(nx.topological_sort(graph)): - if n in graph.nodes(): - if ( - graph.nodes[n]["op_name"] in self.zero_op - or graph.nodes[n]["op_name"] in self.identity_op - ): - if graph.nodes[n]["op_name"] in self.identity_op: - # reconnect edges for removed nodes with 'skip_connect' - graph.add_edges_from( - [ - (e_i[0], e_o[1]) - for e_i in graph.in_edges(n) - for e_o in graph.out_edges(n) - ] - ) - # remove nodes with 'skip_connect' or 'none' label - graph.remove_node(n) + if n in graph.nodes() and ( + graph.nodes[n]["op_name"] in self.zero_op + or graph.nodes[n]["op_name"] in self.identity_op + ): + if graph.nodes[n]["op_name"] in self.identity_op: + # reconnect edges for removed nodes with 'skip_connect' + graph.add_edges_from( + [ + (e_i[0], e_o[1]) + for e_i in graph.in_edges(n) + for e_o in graph.out_edges(n) + ] + ) + # remove nodes with 'skip_connect' or 'none' label + graph.remove_node(n) graph = self.prune_unconnected_parts(graph, src_node, tgt_node) if not use_self: return graph + return None @staticmethod def prune_unconnected_parts(graph, src_node, tgt_node): @@ -1486,35 +994,18 @@ def _backtrack_remove(graph, node: int): graph = _backtrack_remove(graph, n) return graph - def _sampler_maxMin(self, largest: bool = True) -> str | list[str]: - """Samples new parse tree(s) based on grammars. - Assumes that the first rule of each production leads to - smallest DAG and last to largest DAG! - - Args: - largest (bool, optional): To find largest DAG, set to True. For smallest DAG set to False. Defaults to True. 
- - Returns: - Union[str, List[str]]: Parse tree or list of parse trees - """ - trees = [ - grammar.sampler_maxMin_func(grammar.start(), largest) + ")" - for grammar in self.grammars - ] - return trees if len(trees) > 1 else trees[0] - @staticmethod def flatten_graph( graph: nx.DiGraph, flattened_graph: Graph = None, - start_node: int = None, - end_node: int = None, + start_node: int | None = None, + end_node: int | None = None, ): if flattened_graph is None: flattened_graph = Graph() nodes: dict = {} for u, v, data in graph.edges(data=True): - if u in nodes.keys(): + if u in nodes: _u = nodes[u] else: _u = ( @@ -1528,17 +1019,18 @@ def flatten_graph( else _u ) nodes[u] = _u - if _u not in flattened_graph.nodes.keys(): # type: ignore[union-attr] + if _u not in flattened_graph.nodes: # type: ignore[union-attr] flattened_graph.add_node(_u) # type: ignore[union-attr] - flattened_graph.nodes[_u].update(graph.nodes[u]) # type: ignore[union-attr] + flattened_graph.nodes[_u].update( + graph.nodes[u]) # type: ignore[union-attr] - if v in nodes.keys(): + if v in nodes: _v = nodes[v] else: _v = max(flattened_graph.nodes.keys()) + 1 # type: ignore[union-attr] _v = end_node if graph.out_degree(v) == 0 and end_node is not None else _v nodes[v] = _v - if _v not in flattened_graph.nodes.keys(): # type: ignore[union-attr] + if _v not in flattened_graph.nodes: # type: ignore[union-attr] flattened_graph.add_node(_v) # type: ignore[union-attr] flattened_graph.nodes[_v].update( # type: ignore[union-attr] graph.nodes[v] @@ -1587,13 +1079,14 @@ def _compose_functions( char = descriptor[i] if skip_char(char, descriptor, i): pass - elif char == ")" and not descriptor[i - 1] == " ": + elif char == ")" and descriptor[i - 1] != " ": # closing symbol of production if q_nonterminals.qsize() == q_topologies.qsize(): topology, number_of_primitives = q_topologies.get(block=False) primitives = [ - q_primitives.get(block=False) for _ in range(number_of_primitives) - ][::-1] + q_primitives.get(block=False) for _ in + range(number_of_primitives) + ][::-1] composed_function = topology(*primitives) if not q_topologies.empty(): q_primitives.put(composed_function) @@ -1606,14 +1099,13 @@ def _compose_functions( if sym in grammar.terminals and descriptor[i - 1] != "(": is_topology = False - if inspect.isclass(self.terminal_to_op_names[sym]) and issubclass( - self.terminal_to_op_names[sym], AbstractTopology - ): - is_topology = True - elif isinstance( - self.terminal_to_op_names[sym], partial - ) and issubclass( + if ( + inspect.isclass(self.terminal_to_op_names[sym]) + and issubclass(self.terminal_to_op_names[sym], AbstractTopology) + or isinstance(self.terminal_to_op_names[sym], partial) + and issubclass( self.terminal_to_op_names[sym].func, AbstractTopology + ) ): is_topology = True @@ -1640,7 +1132,7 @@ def _compose_functions( return composed_function def graph_to_self(self, graph: nx.DiGraph, clear_self: bool = True) -> None: - """Copies graph to self + """Copies graph to self. 
Args: graph (nx.DiGraph): graph @@ -1654,7 +1146,10 @@ def graph_to_self(self, graph: nx.DiGraph, clear_self: bool = True) -> None: self.nodes[n].update(**data) def _unparse_tree( - self, identifier: str, grammar: Grammar, as_composition: bool = True, + self, + identifier: str, + grammar: Grammar, + as_composition: bool = True, ): descriptor = self.id_to_string_tree(identifier) @@ -1675,13 +1170,14 @@ def _unparse_tree( char = descriptor[i] if skip_char(char, descriptor, i): pass - elif char == ")" and not descriptor[i - 1] == " ": + elif char == ")" and descriptor[i - 1] != " ": # closing symbol of production if q_nonterminals.qsize() == q_topologies.qsize(): topology, number_of_primitives = q_topologies.get(block=False) primitives = [ - q_primitives.get(block=False) for _ in range(number_of_primitives) - ][::-1] + q_primitives.get(block=False) for _ in + range(number_of_primitives) + ][::-1] if as_composition: if topology == "Linear1": composed_function = primitives[0] @@ -1691,7 +1187,7 @@ def _unparse_tree( ) # composed_function = topology + "(" + ", ".join(primitives) + ")" else: - composed_function = " ".join([topology] + primitives) + composed_function = " ".join([topology, *primitives]) if not q_topologies.empty(): q_primitives.put(composed_function) q_topologies.queue[-1][1] += 1 @@ -1703,14 +1199,13 @@ def _unparse_tree( if sym in grammar.terminals: is_topology = False - if inspect.isclass(self.terminal_to_op_names[sym]) and issubclass( - self.terminal_to_op_names[sym], AbstractTopology - ): - is_topology = True - elif isinstance( - self.terminal_to_op_names[sym], partial - ) and issubclass( + if ( + inspect.isclass(self.terminal_to_op_names[sym]) + and issubclass(self.terminal_to_op_names[sym], AbstractTopology) + or isinstance(self.terminal_to_op_names[sym], partial) + and issubclass( self.terminal_to_op_names[sym].func, AbstractTopology + ) ): is_topology = True @@ -1738,9 +1233,7 @@ def skip_char(char: str, descriptor: str, i: int) -> bool: # special case: "(" is (part of) a terminal if i != 0 and char == "(" and descriptor[i - 1] == " " and descriptor[i + 1] == " ": return False - if char == "(": - return True - return False + return char == "(" def find_longest_match( diff --git a/neps/search_spaces/architecture/crossover.py b/neps/search_spaces/architecture/crossover.py index 83e104a1..a630e528 100644 --- a/neps/search_spaces/architecture/crossover.py +++ b/neps/search_spaces/architecture/crossover.py @@ -1,9 +1,12 @@ +from __future__ import annotations + import random -from typing import Callable, List, Tuple +from typing import TYPE_CHECKING, Callable import numpy as np -from .cfg import Grammar +if TYPE_CHECKING: + from .cfg import Grammar def simple_crossover( @@ -12,7 +15,7 @@ def simple_crossover( grammar: Grammar, patience: int = 50, return_crossover_subtrees: bool = False, -) -> Tuple[str, str]: +) -> tuple[str, str]: if return_crossover_subtrees: return grammar.crossover( parent1=parent1, @@ -28,10 +31,10 @@ def simple_crossover( def repetitive_search_space_crossover( - base_parent: Tuple[str, str], - motif_parents: Tuple[List[str], List[str]], + base_parent: tuple[str, str], + motif_parents: tuple[list[str], list[str]], base_grammar: Grammar, - motif_grammars: List[Grammar], + motif_grammars: list[Grammar], terminal_to_sublanguage_map: dict, number_of_repetitive_motifs_per_grammar: list, inner_crossover_strategy: Callable, @@ -54,7 +57,7 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): base_parent[1], terminal_to_sublanguage_map ) - 
random_draw = random.randint( + random_draw = random.randint( # noqa: S311 1 if fixed_macro_parent else 0, min( len(parent1_potential_motif_candidates), @@ -62,12 +65,6 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): ), ) if random_draw == 0: # crossover high level grammar, but keep repetitive motifs fixed - # parent1_motifs = _motifs_in_base_tree( - # child1_string_trees[0], terminal_to_sublanguage_map - # ) - # parent2_motifs = _motifs_in_base_tree( - # child2_string_trees[0], terminal_to_sublanguage_map - # ) ( _, _, @@ -81,96 +78,44 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): ) subtrees_child1 = list(subtrees_child1) subtrees_child2 = list(subtrees_child2) - # new_child1_motifs = _motifs_in_base_tree( - # subtrees_child2[1], terminal_to_sublanguage_map - # ) - # new_child2_motifs = _motifs_in_base_tree( - # subtrees_child1[1], terminal_to_sublanguage_map - # ) - - # old_child1_string_trees = deepcopy(child1_string_trees) - # tmp = number_of_repetitive_motifs_per_grammar[1] - # free_motifs = list(set(range(1, tmp + 1)) - set(parent1_motifs)) - # if len(free_motifs) > 0: - # substitute_terminals = list(terminal_to_sublanguage_map.keys()) - # if len(new_child1_motifs) > len(free_motifs): # too many new child motifs - # new_child1_motifs = random.sample( - # new_child1_motifs, - # k=len(free_motifs), - # ) - # elif len(new_child1_motifs) < len( - # free_motifs - # ): # more free spots than necessary - # free_motifs = random.sample( - # free_motifs, - # k=len(new_child1_motifs), - # ) - # for fm, nm in zip(free_motifs, new_child1_motifs): - # child1_string_trees[fm] = child2_string_trees[nm].replace( - # substitute_terminals[nm], substitute_terminals[fm] - # ) - # subtrees_child2[1] = subtrees_child2[1].replace( - # substitute_terminals[nm], substitute_terminals[fm] - # ) child1_string_trees[0] = ( subtrees_child1[0] + subtrees_child2[1] + subtrees_child1[2] ) - # free_motifs = list(set(range(1, tmp + 1)) - set(parent2_motifs)) - # if len(free_motifs) > 0: - # substitute_terminals = list(terminal_to_sublanguage_map.keys()) - # if len(new_child2_motifs) > len(free_motifs): - # new_child2_motifs = random.sample( - # new_child2_motifs, - # k=len(free_motifs), - # ) - # elif len(new_child2_motifs) < len(free_motifs): - # free_motifs = random.sample( - # free_motifs, - # k=len(new_child2_motifs), - # ) - # for fm, nm in zip(free_motifs, new_child2_motifs): - # child2_string_trees[fm] = old_child1_string_trees[nm].replace( - # substitute_terminals[nm], substitute_terminals[fm] - # ) - # subtrees_child1[1] = subtrees_child1[1].replace( - # substitute_terminals[nm], substitute_terminals[fm] - # ) child2_string_trees[0] = ( subtrees_child2[0] + subtrees_child1[1] + subtrees_child2[2] ) + elif multiple_repetitive: + # TODO more general procedure + coin_toss = random.randint(1, len(child1_string_trees) - 1) + motif_grammar_idx = next( + i + for i, x in enumerate(np.cumsum(number_of_repetitive_motifs_per_grammar)) + if x >= coin_toss + ) + ( + child1_string_trees[coin_toss], + child2_string_trees[coin_toss], + ) = inner_crossover_strategy( + child1_string_trees[coin_toss], + child2_string_trees[coin_toss], + motif_grammars[motif_grammar_idx], + ) else: - if multiple_repetitive: - # TODO more general procedure - coin_toss = random.randint(1, len(child1_string_trees) - 1) - motif_grammar_idx = next( - i - for i, x in enumerate(np.cumsum(number_of_repetitive_motifs_per_grammar)) - if x >= coin_toss - ) - ( - child1_string_trees[coin_toss], - 
child2_string_trees[coin_toss], - ) = inner_crossover_strategy( - child1_string_trees[coin_toss], - child2_string_trees[coin_toss], - motif_grammars[motif_grammar_idx], - ) - else: - parent1_random_draw = random.randint( - 0, len(parent1_potential_motif_candidates) - 1 - ) - parent2_random_draw = random.randint( - 0, len(parent2_potential_motif_candidates) - 1 - ) - ( - child1_string_trees[parent1_random_draw + 1], - child2_string_trees[parent2_random_draw + 1], - ) = inner_crossover_strategy( - child1_string_trees[parent1_random_draw + 1], - child2_string_trees[parent2_random_draw + 1], - motif_grammars[0], - ) + parent1_random_draw = random.randint( + 0, len(parent1_potential_motif_candidates) - 1 + ) + parent2_random_draw = random.randint( + 0, len(parent2_potential_motif_candidates) - 1 + ) + ( + child1_string_trees[parent1_random_draw + 1], + child2_string_trees[parent2_random_draw + 1], + ) = inner_crossover_strategy( + child1_string_trees[parent1_random_draw + 1], + child2_string_trees[parent2_random_draw + 1], + motif_grammars[0], + ) if any(not st for st in child1_string_trees) or any( not st for st in child2_string_trees diff --git a/neps/search_spaces/architecture/graph.py b/neps/search_spaces/architecture/graph.py index f776b231..6412083e 100644 --- a/neps/search_spaces/architecture/graph.py +++ b/neps/search_spaces/architecture/graph.py @@ -1,20 +1,26 @@ +from __future__ import annotations + import copy import inspect import logging import os import random import sys -from collections import Counter -from typing import Callable -from typing import Counter as CounterType import types +from collections import Counter +from pathlib import Path +from typing import ( + Callable, + Counter as CounterType, +) + import networkx as nx import torch from networkx.algorithms.dag import lexicographical_topological_sort -from pathlib import Path from torch import nn from neps.utils.types import AttrDict + from .primitives import AbstractPrimitive, Identity @@ -28,10 +34,9 @@ def log_formats(x): def _find_caller(): - """ - Returns: - str: module name of the caller - tuple: a hashable key to be used to identify different callers + """Returns: + str: module name of the caller + tuple: a hashable key to be used to identify different callers. """ frame = sys._getframe(2) while frame: @@ -42,6 +47,7 @@ def _find_caller(): mod_name = "detectron2" return mod_name, (code.co_filename, frame.f_lineno, code.co_name) frame = frame.f_back + return None _LOG_COUNTER: CounterType = Counter() @@ -49,8 +55,8 @@ def _find_caller(): def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): - """ - Log only for the first n times. + """Log only for the first n times. + Args: lvl (int): the logging level msg (str): @@ -75,7 +81,7 @@ def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): if "caller" in key: hash_key = hash_key + caller_key if "message" in key: - hash_key = hash_key + (msg,) + hash_key = (*hash_key, msg) _LOG_COUNTER[hash_key] += 1 if _LOG_COUNTER[hash_key] <= n: @@ -83,9 +89,7 @@ def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): def iter_flatten(iterable): - """ - Flatten a potentially deeply nested python list - """ + """Flatten a potentially deeply nested python list.""" # taken from https://rightfootin.blogspot.com/2006/09/more-on-python-flatten.html it = iter(iterable) for e in it: @@ -99,8 +103,7 @@ def iter_flatten(iterable): class Graph(torch.nn.Module, nx.DiGraph): - """ - Base class for defining a search space. 
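# Illustrative sketch (not from the patch) of the TYPE_CHECKING import pattern
# introduced in the crossover.py hunk above: Grammar is imported only for type
# checkers, and "from __future__ import annotations" keeps the annotation as a
# string, so no runtime (potentially circular) import is needed.  Decimal is a
# made-up stand-in for Grammar.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from decimal import Decimal


def describe(value: Decimal) -> str:
    # At runtime the annotation is never evaluated, so this works even though
    # Decimal was not imported here.
    return f"value={value}"


assert describe(3) == "value=3"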
Add nodes and edges + """Base class for defining a search space. Add nodes and edges as for a directed acyclic graph in `networkx`. Nodes can contain graphs as children, also edges can contain graphs as operations. @@ -163,19 +166,18 @@ class Graph(torch.nn.Module, nx.DiGraph): """ QUERYABLE = False - def __init__(self, name: str = None, scope: str = None): - """ - Initialise a graph. The edges are automatically filled with an EdgeData object + def __init__(self, name: str | None = None, scope: str | None = None): + """Initialise a graph. The edges are automatically filled with an EdgeData object which defines the default operation as Identity. The default combination operation is set as sum. Note: - When inheriting form `Graph` note that `__init__()` cannot take any parameters. - This is due to the way how networkx is implemented, i.e. graphs are reconstructed - internally and no parameters for init are considered. + When inheriting form `Graph` note that `__init__()` cannot take any + parameters. This is due to the way how networkx is implemented, i.e. graphs + are reconstructed internally and no parameters for init are considered. - Our recommended solution is to create static attributes before initialization and - then load them dynamically in `__init__()`. + Our recommended solution is to create static attributes before initialization + and then load them dynamically in `__init__()`. >>> def __init__(self): >>> num_classes = self.NUM_CLASSES @@ -207,7 +209,7 @@ def __init__(self, name: str = None, scope: str = None): # `input` is required for storing the results of incoming edges. # self._nxgraph.node_attr_dict_factory = lambda: dict({'input': {}, 'comb_op': sum}) - self.node_attr_dict_factory = lambda: dict({"input": {}, "comb_op": sum}) + self.node_attr_dict_factory = lambda: {"input": {}, "comb_op": sum} # remember to add all members also in `unparse()` self.name = name @@ -220,8 +222,7 @@ def __eq__(self, other): return self.name == other.name and self.scope == other.scope def __hash__(self): - """ - As it is very complicated to compare graphs (i.e. check all edge + """As it is very complicated to compare graphs (i.e. check all edge attributes, do the have shared attributes, ...) use just the name for comparison. @@ -234,27 +235,20 @@ def __hash__(self): return h def __repr__(self): - return "Graph {}-{:.07f}, scope {}, {} nodes".format( - self.name, self._id, self.scope, self.number_of_nodes() - ) + return f"Graph {self.name}-{self._id:.07f}, scope {self.scope}, {self.number_of_nodes()} nodes" def modules_str(self): - """ - Once the graph has been parsed, prints the modules as they appear in pytorch. - """ + """Once the graph has been parsed, prints the modules as they appear in pytorch.""" if self.is_parsed: result = "" - for g in self._get_child_graphs(single_instances=True) + [self]: - result += "Graph {}:\n {}\n==========\n".format( - g.name, torch.nn.Module.__repr__(g) - ) + for g in [*self._get_child_graphs(single_instances=True), self]: + result += f"Graph {g.name}:\n {torch.nn.Module.__repr__(g)}\n==========\n" return result else: return self.__repr__() def set_scope(self, scope: str, recursively=True): - """ - Sets the scope of this instance of the graph. + """Sets the scope of this instance of the graph. The function should be used in a builder-like pattern `'subgraph'=Graph().set_scope("scope")`. @@ -274,8 +268,7 @@ def set_scope(self, scope: str, recursively=True): return self def add_node(self, node_index, **attr): - """ - Adds a node to the graph. 
+ """Adds a node to the graph. Note that adding a node using an index that has been used already will override its attributes. @@ -288,8 +281,7 @@ def add_node(self, node_index, **attr): nx.DiGraph.add_node(self, node_index, **attr) def copy(self): - """ - Copy as defined in networkx, i.e. a shallow copy. + """Copy as defined in networkx, i.e. a shallow copy. Just handling recursively nested graphs seperately. """ @@ -301,7 +293,7 @@ def copy_dict(d): copied_dict[k] = v.copy() elif isinstance(v, list): copied_dict[k] = [i.copy() if isinstance(i, Graph) else i for i in v] - elif isinstance(v, torch.nn.Module) or isinstance(v, AbstractPrimitive): + elif isinstance(v, (AbstractPrimitive, torch.nn.Module)): copied_dict[k] = copy.deepcopy(v) return copied_dict @@ -317,9 +309,8 @@ def copy_dict(d): G.name = self.name return G - def set_input(self, node_idxs: list): - """ - Route the input from specific parent edges to the input nodes of + def set_input(self, node_idxs: list): # noqa: D417 + """Route the input from specific parent edges to the input nodes of this subgraph. Inputs are assigned in lexicographical order. Example: @@ -345,17 +336,15 @@ def set_input(self, node_idxs: list): """ num_innodes = sum(self.in_degree(n) == 0 for n in self.nodes) - assert num_innodes == len( - node_idxs - ), "Expecting node index for every input node. Excpected {}, got {}".format( - num_innodes, len(node_idxs) + assert num_innodes == len(node_idxs), ( + f"Expecting node index for every input node. Excpected {num_innodes}, " + f"got {len(node_idxs)}" ) self.input_node_idxs = node_idxs # type: ignore[assignment] return self def num_input_nodes(self) -> int: - """ - The number of input nodes, i.e. the nodes without an + """The number of input nodes, i.e. the nodes without an incoming edge. Returns: @@ -363,135 +352,6 @@ def num_input_nodes(self) -> int: """ return sum(self.in_degree(n) == 0 for n in self.nodes) - def _assign_x_to_nodes(self, x): - """ - Assign x to the input nodes of self. Depending whether on - edge or nodes. - - Performs also several sanity checks of the input. - - Args: - x (Tensor or dict): Input to be assigned. - """ - # We need dict in case of cell and int in case of motif - assert isinstance(x, dict) or isinstance(x, torch.Tensor) - - if self.input_node_idxs is None: - assert ( - self.num_input_nodes() == 1 - ), "There are more than one input nodes but input indeces are not defined." - input_node = [n for n in self.nodes if self.in_degree(n) == 0][0] - assert ( - len(list(self.predecessors(input_node))) == 0 - ), "Expecting node 1 to be the parent." - assert ( - "subgraph" not in self.nodes[input_node].keys() - ), "Expecting node 1 not to have a subgraph as it serves as input node." - assert isinstance(x, torch.Tensor) - self.nodes[input_node]["input"] = {0: x} - else: - # assign the input to the corresponding nodes - assert all( - [i in x.keys() for i in self.input_node_idxs] - ), "got x from an unexpected input edge" - if self.num_input_nodes() > len(x): - # here is the case where the same input is assigned to more than one node - # this can happen when there are cells with two inputs but at the very first - # layer of the network, there is just one output (i.e. the data inputed to the - # makro input node). Handle it and log a Info. 
This should happen only rarly - logger.debug( - f"We are using the same x for two inputs in graph {self.name}" - ) - input_node_iterator = iter(self.input_node_idxs) - for node_idx in lexicographical_topological_sort(self): - if self.in_degree(node_idx) == 0: - self.nodes[node_idx]["input"] = {0: x[next(input_node_iterator)]} - - def forward(self, x, *args): - """ - Forward some data through the graph. This is done recursively - in case there are graphs defined on nodes or as 'op' on edges. - - Args: - x (Tensor or dict): The input. If the graph sits on a node the - input can be a dict with {source_idx: Tensor} to be routed - to the defined input nodes. If the graph sits on an edge, - x is the feature tensor. - args: This is only required to handle cases where the graph sits - on an edge and receives an EdgeData object which will be ignored - """ - logger.debug(f"Graph {self.name} called. Input {log_formats(x)}.") - - # Assign x to the corresponding input nodes - self._assign_x_to_nodes(x) - - for node_idx in lexicographical_topological_sort(self): - node = self.nodes[node_idx] - logger.debug( - "Node {}-{}, current data {}, start processing...".format( - self.name, node_idx, log_formats(node) - ) - ) - - # node internal: process input if necessary - if ("subgraph" in node and "comb_op" not in node) or ( - "comb_op" in node and "subgraph" not in node - ): - log_first_n( - logging.WARN, "Comb_op is ignored if subgraph is defined!", n=1 - ) - # TODO: merge 'subgraph' and 'comb_op'. It is basicallly the same thing. Also in parse() - if "subgraph" in node: - x = node["subgraph"].forward(node["input"]) - else: - if len(node["input"].values()) == 1: - x = list(node["input"].values())[0] - else: - x = node["comb_op"]( - [node["input"][k] for k in sorted(node["input"].keys())] - ) - node["input"] = {} # clear the input as we have processed it - - if ( - len(list(self.neighbors(node_idx))) == 0 - and node_idx < list(lexicographical_topological_sort(self))[-1] - ): - # We have more than one output node. This is e.g. the case for - # auxillary losses. Attach them to the graph, handling must done - # by the user. - logger.debug( - "Graph {} has more then one output node. Storing output of non-maximum index node {} at graph dict".format( - self, node_idx - ) - ) - self.graph[f"out_from_{node_idx}"] = x - else: - # outgoing edges: process all outgoing edges - for neigbor_idx in self.neighbors(node_idx): - edge_data = self.get_edge_data(node_idx, neigbor_idx) - # inject edge data only for AbstractPrimitive, not Graphs - if isinstance(edge_data.op, Graph): - edge_output = edge_data.op.forward(x) - elif isinstance(edge_data.op, AbstractPrimitive): - logger.debug( - "Processing op {} at edge {}-{}".format( - edge_data.op, node_idx, neigbor_idx - ) - ) - edge_output = edge_data.op.forward(x) - else: - raise ValueError( - "Unknown class as op: {}. Expected either Graph or AbstactPrimitive".format( - edge_data.op - ) - ) - self.nodes[neigbor_idx]["input"].update({node_idx: edge_output}) - - logger.debug(f"Node {self.name}-{node_idx}, processing done.") - - logger.debug(f"Graph {self.name} exiting. 
Output {log_formats(x)}.") - return x - def to_pytorch(self, **kwargs) -> nn.Module: return self._to_pytorch(**kwargs) @@ -504,7 +364,7 @@ def _import_code(code: str, name: str): if not self.is_parsed: self.parse() - input_node = [n for n in self.nodes if self.in_degree(n) == 0][0] + input_node = next(n for n in self.nodes if self.in_degree(n) == 0) input_name = "x0" self.nodes[input_node]["input"] = {0: input_name} @@ -522,7 +382,7 @@ def _import_code(code: str, name: str): input_name = f"x{max_xidx + 1}" used_input_names.append(max_xidx + 1) forward_f.append(_forward_f) - x = f"x{max_xidx+1}" + x = f"x{max_xidx + 1}" else: if len(node["input"].values()) == 1: x = next(iter(node["input"].values())) @@ -532,7 +392,7 @@ def _import_code(code: str, name: str): "__name__" in dir(node["comb_op"]) and node["comb_op"].__name__ == "sum" ): - _forward_f = f"x{max_xidx+1}=sum([" + _forward_f = f"x{max_xidx + 1}=sum([" elif isinstance(node["comb_op"], torch.nn.Module): submodule_list.append(node["comb_op"]) _forward_f = f"x{max_xidx + 1}=self.module_list[{len(submodule_list) - 1}]([" @@ -543,7 +403,7 @@ def _import_code(code: str, name: str): _forward_f += inp + "," _forward_f = _forward_f[:-1] + "])" forward_f.append(_forward_f) - x = f"x{max_xidx+1}" + x = f"x{max_xidx + 1}" if int(x[1:]) not in used_input_names: used_input_names.append(int(x[1:])) node["input"] = {} # clear the input as we have processed it @@ -579,9 +439,7 @@ def _import_code(code: str, name: str): forward_f.append(_forward_f) else: raise ValueError( - "Unknown class as op: {}. Expected either Graph or AbstactPrimitive".format( - edge_data.op - ) + f"Unknown class as op: {edge_data.op}. Expected either Graph or AbstactPrimitive" ) self.nodes[neigbor_idx]["input"].update({node_idx: input_name}) @@ -615,8 +473,7 @@ def _import_code(code: str, name: str): return model def parse(self): - """ - Convert the graph into a neural network which can then + """Convert the graph into a neural network which can then be optimized by pytorch. """ for node_idx in lexicographical_topological_sort(self): @@ -626,12 +483,11 @@ def parse(self): f"{self.name}-subgraph_at({node_idx})", self.nodes[node_idx]["subgraph"], ) - else: - if isinstance(self.nodes[node_idx]["comb_op"], torch.nn.Module): - self.add_module( - f"{self.name}-comb_op_at({node_idx})", - self.nodes[node_idx]["comb_op"], - ) + elif isinstance(self.nodes[node_idx]["comb_op"], torch.nn.Module): + self.add_module( + f"{self.name}-comb_op_at({node_idx})", + self.nodes[node_idx]["comb_op"], + ) for neigbor_idx in self.neighbors(node_idx): edge_data = self.get_edge_data(node_idx, neigbor_idx) @@ -649,8 +505,7 @@ def parse(self): self.is_parsed = True def unparse(self): - """ - Undo the pytorch parsing by reconstructing the graph uusing the + """Undo the pytorch parsing by reconstructing the graph uusing the networkx data structures. This is done recursively also for child graphs. @@ -689,8 +544,7 @@ def unparse(self): return g def _get_child_graphs(self, single_instances: bool = False) -> list: - """ - Get all child graphs of the current graph. + """Get all child graphs of the current graph. 
Args: single_instances (bool): Whether to return multiple instances @@ -730,9 +584,7 @@ def _get_child_graphs(self, single_instances: bool = False) -> list: graphs.append(child_op._get_child_graphs()) else: logger.debug( - "Got embedded op, but is neither a graph nor a list: {}".format( - embedded_ops - ) + f"Got embedded op, but is neither a graph nor a list: {embedded_ops}" ) elif inspect.isclass(edge_data.op): assert not issubclass( @@ -744,7 +596,7 @@ def _get_child_graphs(self, single_instances: bool = False) -> list: else: raise ValueError(f"Unknown format of op: {edge_data.op}") - graphs = [g for g in iter_flatten(graphs)] + graphs = list(iter_flatten(graphs)) if single_instances: single: list = [] @@ -755,50 +607,9 @@ def _get_child_graphs(self, single_instances: bool = False) -> list: else: return sorted(graphs, key=lambda g: g.name) - def get_all_edge_data( - self, key: str, scope="all", private_edge_data: bool = False - ) -> list: - """ - Get edge attributes of this graph and all child graphs in one go. - - Args: - key (str): The key of the attribute - scope (str): The scope to be applied - private_edge_data (bool): Whether to return data from graph copies as well. - - Returns: - list: All data in a list. - """ - assert scope is not None - result = [] - for graph in self._get_child_graphs(single_instances=not private_edge_data) + [ - self - ]: - if ( - scope == "all" - or graph.scope == scope - or (isinstance(scope, list) and graph.scope in scope) - ): - for _, _, edge_data in graph.edges.data(): - if edge_data.has(key): - result.append(edge_data[key]) - return result - - def set_at_edges(self, key, value, shared=False): - """ - Sets the attribute for all edges in this and any child graph - """ - for graph in self._get_child_graphs(single_instances=shared) + [self]: - logger.debug(f"Updating edges of graph {graph.name}") - for _, _, edge_data in graph.edges.data(): - if not edge_data.is_final(): - edge_data.set(key, value, shared) - def compile(self): - """ - Instanciates the ops at the edges using the arguments specified at the edges - """ - for graph in self._get_child_graphs(single_instances=False) + [self]: + """Instanciates the ops at the edges using the arguments specified at the edges.""" + for graph in [*self._get_child_graphs(single_instances=False), self]: logger.debug(f"Compiling graph {graph.name}") for _, v, edge_data in graph.edges.data(): if not edge_data.is_final(): @@ -832,8 +643,7 @@ def compile(self): @staticmethod def _verify_update_function(update_func: Callable, private_edge_data: bool): - """ - Verify that the update function actually modifies only + """Verify that the update function actually modifies only shared/private edge data attributes based on setting of `private_edge_data`. @@ -844,7 +654,6 @@ def _verify_update_function(update_func: Callable, private_edge_data: bool): to all graph instances including copies or just to one instance per graph """ - test = EdgeData() test.set("shared", True, shared=True) test.set("op", [True]) @@ -881,8 +690,7 @@ def _verify_update_function(update_func: Callable, private_edge_data: bool): def update_edges( self, update_func: Callable, scope="all", private_edge_data: bool = False ): - """ - This updates the edge data of this graph and all child graphs. + """This updates the edge data of this graph and all child graphs. This is the preferred way to manipulate the edges after the definition of the graph, e.g. by optimizers who want to insert their own op. `update_func(current_edge_data)`. 
This way optimizers @@ -905,13 +713,14 @@ def update_edges( """ Graph._verify_update_function(update_func, private_edge_data) assert scope is not None - for graph in self._get_child_graphs(single_instances=not private_edge_data) + [ - self + for graph in [ + *self._get_child_graphs(single_instances=not private_edge_data), + self, ]: if ( - scope == "all" - or scope == graph.scope - or (isinstance(scope, list) and graph.scope in scope) + scope in ("all", graph.scope) + or isinstance(scope, list) + and graph.scope in scope ): logger.debug(f"Updating edges of graph {graph.name}") for u, v, edge_data in graph.edges.data(): @@ -923,8 +732,7 @@ def update_edges( def update_nodes( self, update_func: Callable, scope="all", single_instances: bool = True ): - """ - Update the nodes of the graph and its incoming and outgoing edges by iterating over the + """Update the nodes of the graph and its incoming and outgoing edges by iterating over the graph and applying `update_func` to each of it. This is the preferred way to change the search space once it has been defined. @@ -952,11 +760,11 @@ def update_nodes( with MixedOp or SampleOp) """ assert scope is not None - for graph in self._get_child_graphs(single_instances) + [self]: + for graph in [*self._get_child_graphs(single_instances), self]: if ( - scope == "all" - or graph.scope == scope - or (isinstance(scope, list) and graph.scope in scope) + scope in ("all", graph.scope) + or isinstance(scope, list) + and graph.scope in scope ): logger.debug(f"Updating nodes of graph {graph.name}") for node_idx in lexicographical_topological_sort(graph): @@ -973,11 +781,10 @@ def update_nodes( self._delete_flagged_edges() def _delete_flagged_edges(self): - """ - Delete edges which associated EdgeData is flagged as deleted. - """ - for graph in self._get_child_graphs(single_instances=False) + [ - self + """Delete edges which associated EdgeData is flagged as deleted.""" + for graph in [ + *self._get_child_graphs(single_instances=False), + self, ]: # we operate on shallow copies to_remove = [] for u, v, edge_data in graph.edges.data(): @@ -988,79 +795,16 @@ def _delete_flagged_edges(self): graph.remove_edges_from(to_remove) def clone(self): - """ - Deep copy of the current graph. + """Deep copy of the current graph. Returns: Graph: Deep copy of the graph. """ return copy.deepcopy(self) - def reset_weights(self, inplace: bool = False): - """ - Resets the weights for the 'op' at all edges. - - Args: - inplace (bool): Do the operation in place or - return a modified copy. - Returns: - Graph: Returns the modified version of the graph. - """ - - def weight_reset(m): - if isinstance(m, torch.nn.Conv2d) or isinstance(m, torch.nn.Linear): - m.reset_parameters() - - if inplace: - graph = self - else: - graph = self.clone() - - graph.apply(weight_reset) - - return graph - - def prepare_discretization(self): - """ - In some cases the search space is manipulated before the final - discretization is happening, e.g. DARTS. In such chases this should - be defined in the search space, so all optimizers can call it. - """ - - def prepare_evaluation(self): - """ - In some cases the evaluation architecture does not match the searched - one. An example is where the makro_model is extended to increase the - parameters. This is done here. - """ - - def get_dense_edges(self): - """ - Returns the edge indices (i, j) that would make a fully connected - DAG without circles such that i < j and i != j. Assumes nodes are - already created. - - Returns: - list: list of edge indices. 
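A quick standalone check of the dense edge enumeration removed here; the same pairs are produced by `_DenseNNodeDAG.get_edge_list` in `topologies.py` further down in this patch.

    nodes = [1, 2, 3, 4]
    dense_edges = [(i, j) for i in nodes for j in nodes if i < j]
    assert dense_edges == [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]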
- """ - edges = [] - nodes = sorted(list(self.nodes())) - for i in nodes: - for j in nodes: - if i != j and j > i: - edges.append((i, j)) - return edges - - def add_edges_densly(self): - """ - Adds edges to get a fully connected DAG without cycles - """ - self.add_edges_from(self.get_dense_edges()) - class EdgeData: - """ - Class that holds data for each edge. + """Class that holds data for each edge. Data can be shared between instances of the graph where the edges lives in. @@ -1071,10 +815,9 @@ class EdgeData: in a dict-like fashion with `[key]`. To set a new item use `.set()`. """ - def __init__(self, data: dict = None): - """ - Initializes a new EdgeData object. - 'op' is set as Identity() and private by default + def __init__(self, data: dict | None = None): + """Initializes a new EdgeData object. + 'op' is set as Identity() and private by default. Args: data (dict): Inject some initial data. Will be always private. @@ -1094,8 +837,7 @@ def __init__(self, data: dict = None): self.set(k, v, shared=False) def has(self, key: str): - """ - Checks whether `key` exists. + """Checks whether `key` exists. Args: key (str): The key to check. @@ -1105,7 +847,7 @@ def has(self, key: str): """ assert not key.startswith("_"), "Access to private keys not allowed!" - return key in self._private.keys() or key in self._shared.keys() + return key in self._private or key in self._shared def __getitem__(self, key: str): assert not str(key).startswith("_"), "Access to private keys not allowed!" @@ -1119,7 +861,7 @@ def get(self, key: str, default): def __getattr__(self, key: str): if key.startswith("__"): # Required for deepcopy, not sure why - raise AttributeError(key) # + raise AttributeError(key) assert not key.startswith("_"), "Access to private keys not allowed!" if key in self._private: return self._private[key] @@ -1135,14 +877,13 @@ def __setattr__(self, name: str, val): raise ValueError("not allowed. use set().") def __str__(self): - return f"private: <{str(self._private)}>, shared: <{str(self._shared)}>" + return f"private: <{self._private!s}>, shared: <{self._shared!s}>" def __repr__(self): return self.__str__() def update(self, data): - """ - Update the data in here. If the data is added as dict, + """Update the data in here. If the data is added as dict, then all variables will be handled as private. Args: @@ -1159,8 +900,7 @@ def update(self, data): raise ValueError(f"Unsupported type {data}") def remove(self, key: str): - """ - Removes an item from the EdgeData + """Removes an item from the EdgeData. Args: key (str): The key for the item to be removed. @@ -1173,8 +913,7 @@ def remove(self, key: str): raise KeyError(f"Tried to delete unkown key {key}") def copy(self): - """ - When a graph is copied to get multiple instances (e.g. when + """When a graph is copied to get multiple instances (e.g. when reusing subgraphs at more than one location) then this function will be called for all edges. @@ -1204,8 +943,7 @@ def copy(self): return new_self def set(self, key: str, value, shared=False): - """ - Used to assign a new item to the EdgeData object. + """Used to assign a new item to the EdgeData object. Args: key (str): The key. @@ -1214,9 +952,7 @@ def set(self, key: str, value, shared=False): be a shallow copy between different instances of EdgeData (and consequently between different instances of Graph). 
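A small usage sketch of the `EdgeData` API shown above, assuming the module path used throughout this patch; it exercises only behaviour visible in this diff.

    from neps.search_spaces.architecture.graph import EdgeData

    ed = EdgeData()
    ed.set("alpha", 0.5, shared=True)        # shallow-copied between graph instances
    ed.set("note", "private to this edge")   # private by default
    assert ed.has("alpha") and ed.alpha == 0.5

    ed.finalize()
    assert ed.is_final()
    # ed.set("alpha", 1.0)  # would now fail: "Trying to change finalized edge!"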
""" - assert isinstance(key, str), "Accepting only string keys, got {}".format( - type(key) - ) + assert isinstance(key, str), f"Accepting only string keys, got {type(key)}" assert not key.startswith("_"), "Access to private keys not allowed!" assert not self.is_final(), "Trying to change finalized edge!" if shared: @@ -1224,15 +960,13 @@ def set(self, key: str, value, shared=False): raise ValueError("Key {} alredy defined as non-shared") else: self._shared[key] = value + elif key in self._shared: + raise ValueError(f"Key {key} alredy defined as shared") else: - if key in self._shared: - raise ValueError(f"Key {key} alredy defined as shared") - else: - self._private[key] = value + self._private[key] = value def clone(self): - """ - Return a true deep copy of EdgeData. Even shared + """Return a true deep copy of EdgeData. Even shared items are not shared anymore. Returns: @@ -1241,20 +975,15 @@ def clone(self): return copy.deepcopy(self) def delete(self): - """ - Flag to delete the edge where this instance is attached to. - """ + """Flag to delete the edge where this instance is attached to.""" self._shared["_deleted"] = True def is_deleted(self): - """ - Returns true if the edge is flagged to be deleted - """ + """Returns true if the edge is flagged to be deleted.""" return self._shared["_deleted"] def finalize(self): - """ - Sets this edge as final. This means it cannot be changed + """Sets this edge as final. This means it cannot be changed anymore and will also not appear in the update functions of the graph. """ @@ -1262,9 +991,8 @@ def finalize(self): return self def is_final(self): - """ - Returns: - bool: True if the edge was finalized, False else + """Returns: + bool: True if the edge was finalized, False else. """ return self._private["_final"] diff --git a/neps/search_spaces/architecture/graph_grammar.py b/neps/search_spaces/architecture/graph_grammar.py index 1c9fa159..932de768 100644 --- a/neps/search_spaces/architecture/graph_grammar.py +++ b/neps/search_spaces/architecture/graph_grammar.py @@ -7,18 +7,23 @@ from typing import Any, ClassVar, Mapping from typing_extensions import override, Self from neps.utils.types import NotSet +from typing import TYPE_CHECKING, Any, ClassVar, Mapping +from typing_extensions import Self, override import networkx as nx import numpy as np -from nltk import Nonterminal -from ..parameter import ParameterWithPrior, MutatableParameter -from .cfg import Grammar -from .cfg_variants.constrained_cfg import ConstrainedGrammar +from neps.search_spaces.parameter import MutatableParameter, ParameterWithPrior +from neps.utils.types import NotSet + from .core_graph_grammar import CoreGraphGrammar from .crossover import repetitive_search_space_crossover, simple_crossover from .mutations import bananas_mutate, repetitive_search_space_mutation, simple_mutate +if TYPE_CHECKING: + from .cfg import Grammar + from .cfg_variants.constrained_cfg import ConstrainedGrammar + # TODO(eddiebergman): This is a halfway solution, but essentially a lot # of things `Parameter` does, does not fit nicely with a Graph based @@ -28,7 +33,9 @@ # The problem here is that the `Parameter` expects the `load_from` # and the `.value` to be the same type, which is not the case for # graph based parameters. 
-class GraphParameter(ParameterWithPrior[nx.DiGraph, str], MutatableParameter): +class GraphParameter( # noqa: D101 + ParameterWithPrior[nx.DiGraph, str], MutatableParameter +): # NOTE(eddiebergman): What I've managed to learn so far is that # these hyperparameters work mostly with strings externally, # i.e. setting the value through `load_from` or `set_value` should be a string. @@ -38,7 +45,8 @@ class GraphParameter(ParameterWithPrior[nx.DiGraph, str], MutatableParameter): # At serialization time, it doesn't actually serialize the .value but instead # relies on the string it was passed initially, I'm not actually sure if there's # a way to go from the graph object to the string in this code... - # Essentially on the outside, we need to ensure we don't pass ih the graph object itself + # Essentially on the outside, we need to ensure we don't pass ih the graph object + # itself DEFAULT_CONFIDENCE_SCORES: ClassVar[Mapping[str, float]] = {"not_in_use": 1.0} default_confidence_choice = "not_in_use" has_prior: bool @@ -46,26 +54,22 @@ class GraphParameter(ParameterWithPrior[nx.DiGraph, str], MutatableParameter): @property @abstractmethod - def id(self) -> str: - ... + def id(self) -> str: ... # NOTE(eddiebergman): Unlike traditional parameters, it seems @property @abstractmethod - def value(self) -> nx.DiGraph: - ... + def value(self) -> nx.DiGraph: ... # NOTE(eddiebergman): This is a function common to the three graph # parameters that is used for `load_from` @abstractmethod - def create_from_id(self, value: str) -> None: - ... + def create_from_id(self, value: str) -> None: ... # NOTE(eddiebergman): Function shared between graph parameters. # Used to `set_value()` @abstractmethod - def reset(self) -> None: - ... + def reset(self) -> None: ... @override def __eq__(self, other: Any) -> bool: @@ -75,8 +79,7 @@ def __eq__(self, other: Any) -> bool: return self.id == other.id @abstractmethod - def compute_prior(self, normalized_value: float) -> float: - ... + def compute_prior(self, normalized_value: float) -> float: ... @override def set_value(self, value: str | None) -> None: @@ -85,8 +88,8 @@ def set_value(self, value: str | None) -> None: # `self.value = None` if not isinstance(value, str): raise ValueError( - f"Expected a string for setting value a `GraphParameter`", - f" got {type(value)}" + "Expected a string for setting value a `GraphParameter`", + f" got {type(value)}", ) self.reset() self.normalized_value = value @@ -142,21 +145,22 @@ def load_from(self, value: str | Self) -> None: self.create_from_id(value) @abstractmethod - def mutate(self, parent: Self | None = None, *, - mutation_strategy: str = "bananas") -> Self: - ... + def mutate( # noqa: D102 + self, parent: Self | None = None, *, mutation_strategy: str = "bananas" + ) -> Self: ... @abstractmethod - def crossover(self, parent1: Self, parent2: Self | None = None) -> tuple[Self, Self]: - ... + def crossover( # noqa: D102 + self, parent1: Self, parent2: Self | None = None + ) -> tuple[Self, Self]: ... 
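Hedged usage sketch of the string-identifier contract described in the notes above: values are handled as identifier strings externally, while `.value` yields the parsed graph. `param` stands in for a concrete, already constructed `GraphParameter` subclass (e.g. a `GraphGrammar`); this snippet only exercises the interface declared in this diff.

    import networkx as nx

    sampled = param.sample()                   # fresh clone with a valid string tree
    assert isinstance(sampled.value, nx.DiGraph)

    restored = param.clone()
    restored.load_from(sampled.id)             # load_from accepts the id string (or Self)
    child = sampled.mutate(mutation_strategy="bananas")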
def _get_non_unique_neighbors(self, num_neighbours: int) -> list[Self]: raise NotImplementedError - def value_to_normalized(self, value: nx.DiGraph) -> float: + def value_to_normalized(self, value: nx.DiGraph) -> float: # noqa: D102 raise NotImplementedError - def normalized_to_value(self, normalized_value: float) -> nx.DiGraph: + def normalized_to_value(self, normalized_value: float) -> nx.DiGraph: # noqa: D102 raise NotImplementedError @override @@ -189,21 +193,25 @@ def clone(self) -> Self: class GraphGrammar(GraphParameter, CoreGraphGrammar): hp_name = "graph_grammar" - def __init__( + def __init__( # noqa: D107, PLR0913 self, grammar: Grammar, terminal_to_op_names: dict, - prior: dict = None, - terminal_to_graph_edges: dict = None, - edge_attr: bool = True, + prior: dict | None = None, + terminal_to_graph_edges: dict | None = None, + edge_attr: bool = True, # noqa: FBT001, FBT002 edge_label: str = "op_name", - zero_op: list = ["Zero", "zero"], - identity_op: list = ["Identity", "id"], - new_graph_repr_func: bool = False, - name: str = None, - scope: str = None, + zero_op: list | None = None, + identity_op: list | None = None, + new_graph_repr_func: bool = False, # noqa: FBT001, FBT002 + name: str | None = None, + scope: str | None = None, **kwargs, ): + if identity_op is None: + identity_op = ["Identity", "id"] + if zero_op is None: + zero_op = ["Zero", "zero"] if isinstance(grammar, list) and len(grammar) != 1: raise NotImplementedError("Does not support multiple grammars") @@ -236,7 +244,8 @@ def sample(self, *, user_priors: bool = False) -> Self: copy_self = self.clone() copy_self.reset() copy_self.string_tree = copy_self.grammars[0].sampler(1, user_priors=user_priors)[ - 0] + 0 + ] _ = copy_self.value # required for checking if graph is valid! return copy_self @@ -316,7 +325,7 @@ def compute_prior(self, *, log: bool = True) -> float: return self.grammars[0].compute_prior(self.string_tree, log=log) @property - def id(self) -> str: + def id(self) -> str: # noqa: D102 if self._function_id is None or self._function_id == "": if self.string_tree == "": raise ValueError("Cannot infer identifier!") @@ -327,7 +336,7 @@ def id(self) -> str: def id(self, value: str) -> None: self._function_id = value - def create_from_id(self, identifier: str) -> None: + def create_from_id(self, identifier: str) -> None: # noqa: D102 self.reset() self._function_id = identifier self.id = identifier @@ -335,724 +344,46 @@ def create_from_id(self, identifier: str) -> None: _ = self.value # required for checking if graph is valid! 
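Why the `GraphGrammar.__init__` signature above trades mutable defaults (`zero_op=["Zero", "zero"]`) for `None` plus an in-body fallback: a default list is created once and shared by every call. A minimal standalone illustration of the pitfall the refactor avoids.

    def bad(ops=[]):                 # the old pattern: one shared list for all calls
        ops.append("Zero")
        return ops

    def good(ops=None):              # the refactored pattern used above
        ops = ["Zero", "zero"] if ops is None else ops
        return ops

    bad()
    assert bad() == ["Zero", "Zero"]              # state leaked between calls
    assert good() == good() == ["Zero", "zero"]   # fresh list on every call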
@staticmethod - def id_to_string_tree(identifier: str) -> str: + def id_to_string_tree(identifier: str) -> str: # noqa: D102 return identifier @staticmethod - def string_tree_to_id(string_tree: str) -> str: + def string_tree_to_id(string_tree: str) -> str: # noqa: D102 return string_tree @property - def search_space_size(self) -> int: + def search_space_size(self) -> int: # noqa: D102 return self.grammars[0].compute_space_size @abstractmethod - def create_new_instance_from_id(self, identifier: str): + def create_new_instance_from_id(self, identifier: str): # noqa: D102 raise NotImplementedError - def reset(self) -> None: + def reset(self) -> None: # noqa: D102 self.clear_graph() self.string_tree = "" self.nxTree = None self._value = None self._function_id = "" - def compose_functions(self, flatten_graph: bool = True) -> nx.DiGraph: - return self._compose_functions(self.id, self.grammars[0], flatten_graph) - - def unparse_tree(self, identifier: str, as_composition: bool = True): - return self._unparse_tree(identifier, self.grammars[0], as_composition) - - def get_dictionary(self) -> dict[str, str]: - return {"graph_grammar": self.id} - - def create_nx_tree(self, string_tree: str) -> nx.DiGraph: - nxTree = self.from_stringTree_to_nxTree(string_tree, self.grammars[0]) - return self.prune_tree( - nxTree, terminal_to_torch_map_keys=self.terminal_to_op_names.keys() - ) - - -class GraphGrammarCell(GraphGrammar): - hp_name = "graph_grammar_cell" - - def __init__( + def compose_functions( # noqa: D102 self, - grammar: Grammar, - terminal_to_op_names: dict, - terminal_to_graph_edges: dict = None, - edge_attr: bool = True, - edge_label: str = "op_name", - zero_op: list = ["Zero", "zero"], - identity_op: list = ["Identity", "id"], - name: str = None, - scope: str = None, - **kwargs, - ): - super().__init__( - grammar, - terminal_to_op_names, - terminal_to_graph_edges, - edge_attr=edge_attr, - edge_label=edge_label, - zero_op=zero_op, - identity_op=identity_op, - name=name, - scope=scope, - **kwargs, - ) - - self.cell = None - - def reset(self) -> None: - super().reset() - self.cell = None - - @abstractmethod - def create_graph_from_string(self, child: str): - raise NotImplementedError - - -class GraphGrammarRepetitive(GraphParameter, CoreGraphGrammar): - hp_name = "graph_grammar_repetitive" - - def __init__( - self, - grammars: list[Grammar], - terminal_to_op_names: dict, - terminal_to_sublanguage_map: dict, - number_of_repetitive_motifs: int, - terminal_to_graph_edges: dict = None, - edge_attr: bool = True, - edge_label: str = "op_name", - zero_op: list = ["Zero", "zero"], - identity_op: list = ["Identity", "id"], - name: str = None, - scope: str = None, - ): - CoreGraphGrammar.__init__( - self, - grammars=grammars, - terminal_to_op_names=terminal_to_op_names, - terminal_to_graph_edges=terminal_to_graph_edges, - edge_attr=edge_attr, - edge_label=edge_label, - zero_op=zero_op, - identity_op=identity_op, - name=name, - scope=scope, - ) - GraphParameter.__init__(self, value=None, default=None, is_fidelity=False) - - self.id: str = "" - self.string_tree: str = "" - self.string_tree_list: list[str] = [] - self.nxTree: nx.DiGraph | None = None - self._value: nx.DiGraph | None = None - - self.full_grammar = self.get_full_grammar(self.grammars) - self.terminal_to_sublanguage_map = terminal_to_sublanguage_map - self.number_of_repetitive_motifs = number_of_repetitive_motifs - - @override - def mutate( - self, - parent: Self | None = None, - mutation_rate: float = 1.0, - mutation_strategy: str = "bananas", - ) 
-> Self: - raise NotImplementedError - if parent is None: - parent = self - - # bananas mutate - if mutation_strategy == "bananas": - inner_mutation_strategy = partial(bananas_mutate, mutation_rate=mutation_rate) - child_string_tree_list, is_same = repetitive_search_space_mutation( - base_parent=parent.string_tree_list[0], - motif_parents=parent.string_tree_list[1:], - base_grammar=self.grammars[0], - motif_grammars=self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - inner_mutation_strategy=inner_mutation_strategy, - ) - else: - child_string_tree_list, is_same = repetitive_search_space_mutation( - base_parent=parent.string_tree_list[0], - motif_parents=parent.string_tree_list[1:], - base_grammar=self.grammars[0], - motif_grammars=self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - inner_mutation_strategy=super().mutate, - ) - - if all(is_same): - raise ValueError("Parent is the same as child!") - - return self.create_graph_from_string(child_string_tree_list) - - @override - def crossover( - self, - parent1: Self, - parent2: Self | None = None, - ) -> tuple[Self, Self]: - raise NotImplementedError - if parent2 is None: - parent2 = self - children = repetitive_search_space_crossover( - base_parent=(parent1.string_tree_list[0], parent2.string_tree_list[0]), - motif_parents=(parent1.string_tree_list[1:], parent2.string_tree_list[1:]), - base_grammar=self.grammars[0], - motif_grammars=self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - inner_crossover_strategy=simple_crossover, - ) - if all(not c for c in children): - raise Exception("Cannot create crossover") - return [parent2.create_graph_from_string(child) for child in children] - - @override - def sample(self, *, user_priors: bool = False) -> Self: - copy_self = self.clone() - copy_self.reset() - copy_self.string_tree_list = [grammar.sampler(1)[0] for grammar in - copy_self.grammars] - copy_self.string_tree = copy_self.assemble_trees( - copy_self.string_tree_list[0], - copy_self.string_tree_list[1:], - terminal_to_sublanguage_map=copy_self.terminal_to_sublanguage_map, - ) - copy_self.id = "\n".join(copy_self.string_tree_list) - _ = copy_self.value # required for checking if graph is valid! 
- return copy_self - - @property - @override - def value(self) -> nx.DiGraph: - if self._value is None: - _val = self.from_stringTree_to_graph_repr( - self.string_tree, - self.full_grammar, - valid_terminals=self.terminal_to_op_names.keys(), - edge_attr=self.edge_attr, - ) - assert isinstance(_val, nx.DiGraph) - self._value = _val - return self._value - - @override - def compute_prior(self, *, log: bool = True) -> float: - prior_probs = [ - g.compute_prior(st, log=log) - for g, st in zip(self.grammars, self.string_tree_list) - ] - if log: - return sum(prior_probs) - else: - return np.prod(prior_probs) - - def __eq__(self, other: Any) -> bool: - if not isinstance(other, GraphGrammarRepetitive): - return NotImplemented - - return self.id == other.id - - def reset(self) -> None: - self.clear_graph() - self.string_tree_list = [] - self.string_tree = "" - self.nxTree = None - self._value = None - self.id = "" - - @staticmethod - def get_full_grammar(grammars): - full_grammar = deepcopy(grammars[0]) - rules = full_grammar.productions() - nonterminals = full_grammar.nonterminals - terminals = full_grammar.terminals - for g in grammars[1:]: - rules.extend(g.productions()) - nonterminals.extend(g.nonterminals) - terminals.extend(g.terminals) - return full_grammar - - @abstractmethod - def create_graph_from_string(self, child: list[str]): - raise NotImplementedError - - def get_dictionary(self) -> dict[str, str]: - return {"graph_grammar": "\n".join(self.string_tree_list)} - - def create_nx_tree(self, string_tree: str) -> nx.DiGraph: - nxTree = self.from_stringTree_to_nxTree(string_tree, self.full_grammar) - return self.prune_tree( - nxTree, terminal_to_torch_map_keys=self.terminal_to_op_names.keys() - ) - - def create_from_id(self, identifier: str | list[str]) -> None: - self.reset() - self.string_tree_list = ( - identifier.split("\n") if isinstance(identifier, str) else identifier - ) - self.string_tree = self.assemble_trees( - self.string_tree_list[0], - self.string_tree_list[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - ) - self.id = "\n".join(self.string_tree_list) - _ = self.value # required for checking if graph is valid! 
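For reference, the identifier convention used by the repetitive grammar classes removed in this hunk: one string tree per grammar, joined by newlines (`id = "\n".join(...)`, recovered by `split("\n")` in `create_from_id`). The trees below are made-up placeholders.

    string_tree_list = ["(MACRO (CELL) (CELL))", "(CELL (OP conv3x3))"]
    identifier = "\n".join(string_tree_list)
    assert identifier.split("\n") == string_tree_list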
- - @property - def search_space_size(self) -> int: - def recursive_worker( - nonterminal: Nonterminal, grammar, lower_level_motifs: int = 0 - ) -> int: - primitive_nonterminal = "OPS" - if str(nonterminal) == primitive_nonterminal: - return ( - lower_level_motifs * self.number_of_repetitive_motifs - + len(grammar.productions(lhs=Nonterminal(primitive_nonterminal))) - - self.number_of_repetitive_motifs - ) - potential_productions = grammar.productions(lhs=nonterminal) - _possibilites = 0 - for potential_production in potential_productions: - edges_nonterminals = [ - rhs_sym - for rhs_sym in potential_production.rhs() - if str(rhs_sym) in grammar.nonterminals - ] - possibilities_per_edge = [ - recursive_worker(e_nonterminal, grammar, lower_level_motifs) - for e_nonterminal in edges_nonterminals - ] - product = 1 - for p in possibilities_per_edge: - product *= p - _possibilites += product - return _possibilites - - lower_level_motifs = recursive_worker(self.grammars[1].start(), self.grammars[1]) - return recursive_worker( - self.grammars[0].start(), - self.grammars[0], - lower_level_motifs=lower_level_motifs, - ) - - -class GraphGrammarMultipleRepetitive(GraphParameter, CoreGraphGrammar): - hp_name = "graph_grammar_multiple_repetitive" + flatten_graph: bool = True, # noqa: FBT001, FBT002 + ) -> nx.DiGraph: + return self._compose_functions(self.id, self.grammars[0], flatten_graph) - def __init__( + def unparse_tree( # noqa: D102 self, - grammars: list[Grammar] | list[ConstrainedGrammar], - terminal_to_op_names: dict, - terminal_to_sublanguage_map: dict, - prior: list[dict] = None, - terminal_to_graph_edges: dict = None, - fixed_macro_grammar: bool = False, - edge_attr: bool = True, - edge_label: str = "op_name", - zero_op: list = ["Zero", "zero"], - identity_op: list = ["Identity", "id"], - name: str = None, - scope: str = None, - **kwargs, + identifier: str, + as_composition: bool = True, # noqa: FBT001, FBT002 ): - def _check_mapping(macro_grammar, motif_grammars, terminal_to_sublanguage_map): - for terminal, start_symbol in terminal_to_sublanguage_map.items(): - if terminal not in macro_grammar.terminals: - raise Exception(f"Terminal {terminal} not defined in macro grammar") - if not any( - start_symbol == str(grammar.start()) for grammar in motif_grammars - ): - raise Exception( - f"Start symbol {start_symbol} not defined in motif grammar" - ) - - def _identify_macro_grammar(grammar, terminal_to_sublanguage_map): - grammars = deepcopy(grammar) - motif_grammars = [] - for start_symbol in terminal_to_sublanguage_map.values(): - motif_grammars += [ - grammar - for grammar in grammars - if start_symbol == str(grammar.start()) - ] - grammars = [ - grammar - for grammar in grammars - if start_symbol != str(grammar.start()) - ] - if len(grammars) != 1: - raise Exception("Cannot identify macro grammar") - return grammars[0], motif_grammars - - if prior is not None: - assert len(grammars) == len( - prior - ), "At least one of the grammars has no prior defined!" 
- for g, p in zip(grammars, prior): - g.prior = p - self.has_prior = prior is not None - - self.macro_grammar, grammars = _identify_macro_grammar( - grammars, terminal_to_sublanguage_map - ) - _check_mapping(self.macro_grammar, grammars, terminal_to_sublanguage_map) - - self.fixed_macro_grammar = fixed_macro_grammar - if not self.fixed_macro_grammar: - grammars.insert(0, self.macro_grammar) - - self.terminal_to_sublanguage_map = OrderedDict(terminal_to_sublanguage_map) - if any( - k in terminal_to_op_names for k in self.terminal_to_sublanguage_map.keys() - ): - raise Exception( - f"Terminals {[k for k in self.terminal_to_sublanguage_map.keys()]} already defined in primitives mapping and cannot be used for repetitive substitutions" - ) - self.number_of_repetitive_motifs_per_grammar = [ - sum( - map( - (str(grammar.start())).__eq__, - self.terminal_to_sublanguage_map.values(), - ) - ) - if str(grammar.start()) in self.terminal_to_sublanguage_map.values() - else 1 - for grammar in grammars - ] - - CoreGraphGrammar.__init__( - self, - grammars=grammars, - terminal_to_op_names={ - **terminal_to_op_names, - **self.terminal_to_sublanguage_map, - }, - terminal_to_graph_edges=terminal_to_graph_edges, - edge_attr=edge_attr, - edge_label=edge_label, - zero_op=zero_op, - identity_op=identity_op, - name=name, - scope=scope, - **kwargs, - ) - GraphParameter.__init__(self, value=None, default=None, is_fidelity=False) - - self._function_id: str = "" - self.string_tree: str = "" - self.string_tree_list: list[str] = [] - self.nxTree: nx.DiGraph | None = None - self._value: nx.DiGraph | None = None - - if self.fixed_macro_grammar: - self.fixed_macro_string_tree = self.macro_grammar.sampler(1)[0] - - if self.fixed_macro_grammar: - self.full_grammar = self.get_full_grammar( - [self.macro_grammar] + self.grammars - ) - else: - self.full_grammar = self.get_full_grammar(self.grammars) - - @override - def sample(self, *, user_priors: bool = False) -> Self: - copy_self = self.clone() - copy_self.reset() - copy_self.string_tree_list = [ - grammar.sampler(1, user_priors=user_priors)[0] - for grammar, number_of_motifs in zip( - copy_self.grammars, copy_self.number_of_repetitive_motifs_per_grammar - ) - for _ in range(number_of_motifs) - ] - copy_self.string_tree = copy_self.assemble_string_tree(copy_self.string_tree_list) - _ = copy_self.value # required for checking if graph is valid! 
- return copy_self - - @property - @override - def value(self) -> nx.DiGraph: - if self._value is None: - if self.fixed_macro_grammar: - self._value = [] - string_list_idx = 0 - for grammar, number_of_motifs in zip( - self.grammars, self.number_of_repetitive_motifs_per_grammar - ): - for _ in range(number_of_motifs): - self._value.append( - self.from_stringTree_to_graph_repr( - self.string_tree_list[string_list_idx], - grammar, - valid_terminals=self.terminal_to_op_names.keys(), - edge_attr=self.edge_attr, - ) - ) - string_list_idx += 1 - self._value = self._value[0] # TODO trick - else: - self._value = self.from_stringTree_to_graph_repr( - self.string_tree, - self.full_grammar, - valid_terminals=self.terminal_to_op_names.keys(), - edge_attr=self.edge_attr, - ) - motif_trees = self.string_tree_list[1:] - repetitive_mapping = { - replacement: motif - for motif, replacement in zip( - self.terminal_to_sublanguage_map.keys(), motif_trees - ) - } - for subgraph in self._value[1].values(): - old_node_attributes = nx.get_node_attributes(subgraph, "op_name") - new_node_labels = { - k: (repetitive_mapping[v] if v in motif_trees else v) - for k, v in old_node_attributes.items() - } - nx.set_node_attributes(subgraph, new_node_labels, name="op_name") - return self._value - - @override - def mutate( - self, - parent: Self | None = None, - mutation_rate: float = 1.0, - mutation_strategy: str = "bananas", - ) -> Self: - if parent is None: - parent = self - - bananas_inner_mutation = partial(bananas_mutate, mutation_rate=mutation_rate) - child_string_tree_list, is_same = repetitive_search_space_mutation( - base_parent=self.fixed_macro_string_tree - if self.fixed_macro_grammar - else parent.string_tree_list[0], - motif_parents=parent.string_tree_list - if self.fixed_macro_grammar - else parent.string_tree_list[1:], - base_grammar=self.macro_grammar, - motif_grammars=self.grammars - if self.fixed_macro_grammar - else self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - number_of_repetitive_motifs_per_grammar=self.number_of_repetitive_motifs_per_grammar, - inner_mutation_strategy=bananas_inner_mutation - if mutation_strategy == "bananas" - else super().mutate, - fixed_macro_parent=self.fixed_macro_grammar, - ) - - if all(is_same): - raise ValueError("Parent is the same as child!") - - if self.fixed_macro_grammar: - child_string_tree_list = child_string_tree_list[1:] - - return self.create_new_instance_from_id( - self.string_tree_list_to_id(child_string_tree_list) - ) - - @override - def crossover( - self, - parent1: Self, - parent2: Self | None = None, - ) -> tuple[Self, Self]: - if parent2 is None: - parent2 = self - children = repetitive_search_space_crossover( - base_parent=(parent1.fixed_macro_string_tree, parent2.fixed_macro_string_tree) - if self.fixed_macro_grammar - else (parent1.string_tree_list[0], parent2.string_tree_list[0]), - motif_parents=(parent1.string_tree_list, parent2.string_tree_list) - if self.fixed_macro_grammar - else (parent1.string_tree_list[1:], parent2.string_tree_list[1:]), - base_grammar=self.macro_grammar, - motif_grammars=self.grammars - if self.fixed_macro_grammar - else self.grammars[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - number_of_repetitive_motifs_per_grammar=self.number_of_repetitive_motifs_per_grammar, - inner_crossover_strategy=simple_crossover, - fixed_macro_parent=self.fixed_macro_grammar, - multiple_repetitive=True, - ) - if all(not c for c in children): - raise Exception("Cannot create crossover") - - 
return tuple( - parent2.create_new_instance_from_id( - self.string_tree_list_to_id( - child[1:] if self.fixed_macro_grammar else child - ) - ) - for child in children - ) - - @override - def compute_prior(self, *, log: bool = True) -> float: - prior_probs = [ - g.compute_prior(st, log=log) - for g, st in zip(self.grammars, self.string_tree_list) - ] - if log: - return sum(prior_probs) - else: - return np.prod(prior_probs) - - @property - def id(self) -> str: - if self._function_id is None or self._function_id == "": - if len(self.string_tree_list) == 0: - raise ValueError("Cannot infer identifier") - self._function_id = self.string_tree_list_to_id(self.string_tree_list) - return self._function_id - - @id.setter - def id(self, value: str) -> None: - self._function_id = value - - @staticmethod - def id_to_string_tree_list(identifier: str) -> list[str]: - return identifier.split("\n") - - def id_to_string_tree(self, identifier: str) -> str: - string_tree_list = self.id_to_string_tree_list(identifier) - return self.assemble_string_tree(string_tree_list) - - @staticmethod - def string_tree_list_to_id(string_tree_list: list[str]) -> str: - return "\n".join(string_tree_list) - - def string_tree_to_id(self, string_tree: str) -> str: - raise NotImplementedError - - def assemble_string_tree(self, string_tree_list: list[str]) -> str: - if self.fixed_macro_grammar: - string_tree = self.assemble_trees( - self.fixed_macro_string_tree, - string_tree_list, - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - ) - else: - string_tree = self.assemble_trees( - string_tree_list[0], - string_tree_list[1:], - terminal_to_sublanguage_map=self.terminal_to_sublanguage_map, - ) - return string_tree - - def __eq__(self, other: Any) -> bool: - if not isinstance(other, GraphGrammarMultipleRepetitive): - return NotImplemented - return self.id == other.id - - def reset(self) -> None: - self.clear_graph() - self.string_tree_list = [] - self.string_tree = "" - self.nxTree = None - self._value = None - self._function_id = "" - - def compose_functions(self, flatten_graph: bool = True): - return self._compose_functions(self.id, self.full_grammar, flatten_graph) - - def unparse_tree(self, identifier: str, as_composition: bool = True): - return self._unparse_tree(identifier, self.full_grammar, as_composition) - - @staticmethod - def get_full_grammar(grammars): - full_grammar = deepcopy(grammars[0]) - rules = full_grammar.productions() - nonterminals = full_grammar.nonterminals - terminals = full_grammar.terminals - for g in grammars[1:]: - rules.extend(g.productions()) - nonterminals.extend(g.nonterminals) - terminals.extend(g.terminals) - return full_grammar - - @abstractmethod - def create_new_instance_from_id(self, child: str): - raise NotImplementedError + return self._unparse_tree(identifier, self.grammars[0], as_composition) - def get_dictionary(self) -> dict[str, str]: + def get_dictionary(self) -> dict[str, str]: # noqa: D102 return {"graph_grammar": self.id} - def create_nx_tree(self, string_tree: str) -> nx.DiGraph: - nxTree = self.from_stringTree_to_nxTree(string_tree, self.full_grammar) + def create_nx_tree(self, string_tree: str) -> nx.DiGraph: # noqa: D102 + nxTree = self.from_stringTree_to_nxTree(string_tree, self.grammars[0]) return self.prune_tree( nxTree, terminal_to_torch_map_keys=self.terminal_to_op_names.keys() ) - - def create_from_id(self, identifier: str) -> None: - self.reset() - self.id = identifier - self.string_tree_list = self.id_to_string_tree_list(self.id) - self.string_tree = 
self.id_to_string_tree(self.id) - _ = self.value # required for checking if graph is valid! - - @property - def search_space_size(self) -> int: - def recursive_worker( - nonterminal: Nonterminal, grammar, lower_level_motifs: dict = None - ) -> int: - if lower_level_motifs is None: - lower_level_motifs = {} - potential_productions = grammar.productions(lhs=nonterminal) - _possibilites = 0 - for potential_production in potential_productions: - edges_nonterminals = [ - rhs_sym - for rhs_sym in potential_production.rhs() - if str(rhs_sym) in grammar.nonterminals - ] - possibilities_per_edge = [ - recursive_worker(e_nonterminal, grammar, lower_level_motifs) - for e_nonterminal in edges_nonterminals - ] - possibilities_per_edge += [ - lower_level_motifs[str(rhs_sym)] - for rhs_sym in potential_production.rhs() - if str(rhs_sym) in lower_level_motifs.keys() - ] - product = 1 - for p in possibilities_per_edge: - product *= p - _possibilites += product - return _possibilites - - if self.fixed_macro_grammar: - if len(self.grammars) > 1: - raise Exception( - "Compute space size for fixed macro only works for one repetitive level" - ) - return np.prod( - [ - grammar.compute_space_size - for grammar, n_grammar in zip( - self.grammars, self.number_of_repetitive_motifs_per_grammar - ) - for _ in range(n_grammar) - ] - ) - else: - if len(self.grammars) > 2: - raise Exception( - "Compute space size for no fixed macro only works for one repetitive level" - ) - macro_space_size = self.grammars[0].compute_space_size - motif_space_size = self.grammars[1].compute_space_size - return ( - macro_space_size - // self.number_of_repetitive_motifs_per_grammar[1] - * motif_space_size - ) diff --git a/neps/search_spaces/architecture/mutations.py b/neps/search_spaces/architecture/mutations.py index f07e3347..c588836a 100644 --- a/neps/search_spaces/architecture/mutations.py +++ b/neps/search_spaces/architecture/mutations.py @@ -1,10 +1,13 @@ +from __future__ import annotations # noqa: D100 + import random -from typing import Callable, List, Tuple +from typing import TYPE_CHECKING, Callable -from .cfg import Grammar +if TYPE_CHECKING: + from .cfg import Grammar -def simple_mutate(parent_string_tree: str, grammar: Grammar) -> Tuple[str, bool]: +def simple_mutate(parent_string_tree: str, grammar: Grammar) -> tuple[str, bool]: # noqa: D103 # works if there is only one grammar # randomly choose a subtree from the parent and replace # with a new randomly generated subtree @@ -19,17 +22,17 @@ def simple_mutate(parent_string_tree: str, grammar: Grammar) -> Tuple[str, bool] return child_string_tree, parent_string_tree == child_string_tree -def bananas_mutate( +def bananas_mutate( # noqa: D103 parent_string_tree: str, grammar: Grammar, mutation_rate: float = 1.0, - mutation_prob: float = None, + mutation_prob: float | None = None, patience: int = 50, -) -> Tuple[str, bool]: +) -> tuple[str, bool]: split_tree = parent_string_tree.split(" ") swappable_indices = [ i - for i in range(0, len(split_tree)) + for i in range(len(split_tree)) if split_tree[i][1:] in grammar.swappable_nonterminals ] _mutation_prob = ( @@ -40,7 +43,7 @@ def bananas_mutate( idx = 0 while idx < len(swappable_indices): swap_idx = swappable_indices[idx] - if random.random() < _mutation_prob: + if random.random() < _mutation_prob: # noqa: S311 subtree_node = split_tree[swap_idx][1:] subtree_idx = swap_idx child_string_tree = grammar.mutate( @@ -54,7 +57,7 @@ def bananas_mutate( split_tree = child_string_tree.split(" ") swappable_indices = [ i - for i in range(0, 
len(split_tree)) + for i in range(len(split_tree)) if split_tree[i][1:] in grammar.swappable_nonterminals ] _mutation_prob = ( @@ -67,18 +70,18 @@ def bananas_mutate( return child_string_tree, child_string_tree == parent_string_tree -def repetitive_search_space_mutation( +def repetitive_search_space_mutation( # noqa: D103 base_parent: str, - motif_parents: List[str], + motif_parents: list[str], base_grammar: Grammar, - motif_grammars: List[Grammar], + motif_grammars: list[Grammar], terminal_to_sublanguage_map: dict, number_of_repetitive_motifs_per_grammar: list, inner_mutation_strategy: Callable, mutation_rate: float = 1.0, - mutation_prob: float = None, - fixed_macro_parent: bool = False, -) -> Tuple[List[str], List[bool]]: + mutation_prob: float | None = None, + fixed_macro_parent: bool = False, # noqa: FBT001, FBT002 +) -> tuple[list[str], list[bool]]: def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): return [ i @@ -97,7 +100,7 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): ) child_string_trees = [] - if not fixed_macro_parent and random.random() < mutation_prob: + if not fixed_macro_parent and random.random() < mutation_prob: # noqa: S311 child_string_trees.append(inner_mutation_strategy(base_parent, base_grammar)) indices = _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map) mutation_prob = ( @@ -116,22 +119,12 @@ def _motifs_in_base_tree(base_parent, terminal_to_sublanguage_map): motif_grammars, _number_of_repetitive_motifs_per_grammar ): for _ in range(number_of_motifs): - if parent_string_idx in indices and random.random() < mutation_prob: + if parent_string_idx in indices and random.random() < mutation_prob: # noqa: S311 child_string_trees.append( inner_mutation_strategy(motif_parents[parent_string_idx], grammar) ) else: child_string_trees.append((motif_parents[parent_string_idx], True)) parent_string_idx += 1 - # child_string_trees.extend( - # [ - # inner_mutation_strategy(parent_string_tree, grammar) - # if i in indices and random.random() < mutation_prob - # else (parent_string_tree, True) - # for i, (parent_string_tree, grammar) in enumerate( - # zip(motif_parents, motif_grammars) - # ) - # ] - # ) return [c[0] for c in child_string_trees], [c[1] for c in child_string_trees] diff --git a/neps/search_spaces/architecture/primitives.py b/neps/search_spaces/architecture/primitives.py index eebb828d..916c0fc7 100644 --- a/neps/search_spaces/architecture/primitives.py +++ b/neps/search_spaces/architecture/primitives.py @@ -1,3 +1,5 @@ +from __future__ import annotations # noqa: D100 + from abc import ABCMeta, abstractmethod import torch @@ -5,8 +7,7 @@ class _AbstractPrimitive(nn.Module, metaclass=ABCMeta): - """ - Use this class when creating new operations for edges. + """Use this class when creating new operations for edges. This is required because we are agnostic to operations at the edges. As a consequence, they can contain subgraphs @@ -24,15 +25,12 @@ def __init__(self, kwargs): @abstractmethod def forward(self, x): - """ - The forward processing of the operation. - """ + """The forward processing of the operation.""" raise NotImplementedError @abstractmethod def get_embedded_ops(self): - """ - Return any embedded ops so that they can be + """Return any embedded ops so that they can be analysed whether they contain a child graph, e.g. a 'motif' in the hierachical search space. 
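A standalone sketch of the per-nonterminal mutation probability used in `bananas_mutate` above: every swappable nonterminal token of the string tree is considered independently with probability `mutation_rate / #swappable` (unless `mutation_prob` is given). The grammar and tree below are made up.

    import random

    split_tree = "(DAG (OP conv3x3) (OP id))".split(" ")
    swappable_nonterminals = {"OP"}
    swappable = [i for i, tok in enumerate(split_tree) if tok[1:] in swappable_nonterminals]
    assert swappable == [1, 3]

    mutation_rate = 1.0
    mutation_prob = mutation_rate / len(swappable)   # 0.5 per swappable subtree
    to_mutate = [i for i in swappable if random.random() < mutation_prob]
    # each selected index would be handed to grammar.mutate(...) in the real code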
@@ -46,86 +44,88 @@ def get_op_name(self): return type(self).__name__ -class AbstractPrimitive(_AbstractPrimitive): - def forward(self, x): +class AbstractPrimitive(_AbstractPrimitive): # noqa: D101 + def forward(self, x): # noqa: D102 raise NotImplementedError - def get_embedded_ops(self): + def get_embedded_ops(self): # noqa: D102 return None class Identity(AbstractPrimitive): - """ - An implementation of the Identity operation. - """ + """An implementation of the Identity operation.""" - def __init__(self, **kwargs): + def __init__(self, **kwargs): # noqa: D107 super().__init__(locals()) - def forward(self, x): + def forward(self, x: object) -> object: # noqa: D102 return x class Zero(AbstractPrimitive): - """ - Implementation of the zero operation. It removes + """Implementation of the zero operation. It removes the connection by multiplying its input with zero. """ def __init__(self, stride, **kwargs): - """ - When setting stride > 1 then it is assumed that the + """When setting stride > 1 then it is assumed that the channels must be doubled. """ super().__init__(locals()) self.stride = int(stride) - def forward(self, x): + def forward(self, x): # noqa: D102 if self.stride == 1: return x.mul(0.0) - else: - return x[:, :, :: self.stride, :: self.stride].mul(0.0) + + return x[:, :, :: self.stride, :: self.stride].mul(0.0) def __repr__(self): return f"" class Zero1x1(AbstractPrimitive): - """ - Implementation of the zero operation. It removes + """Implementation of the zero operation. It removes the connection by multiplying its input with zero. """ def __init__(self, stride, **kwargs): - """ - When setting stride > 1 then it is assumed that the + """When setting stride > 1 then it is assumed that the channels must be doubled. """ super().__init__(locals()) self.stride = int(stride) - def forward(self, x): + def forward(self, x): # noqa: D102 if self.stride == 1: return x.mul(0.0) - else: - x = x[:, :, :: self.stride, :: self.stride].mul(0.0) - return torch.cat([x, x], dim=1) # double the channels TODO: ugly as hell + + x = x[:, :, :: self.stride, :: self.stride].mul(0.0) + return torch.cat([x, x], dim=1) # double the channels TODO: ugly as hell def __repr__(self): return f"" class SepConv(AbstractPrimitive): - """ - Implementation of Separable convolution operation as + """Implementation of Separable convolution operation as in the DARTS paper, i.e. 2 sepconv directly after another. 
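Quick behavioural check of the `Zero` primitive shown above, assuming the module path used throughout this patch: stride 1 only zeroes the input, stride 2 additionally subsamples it spatially.

    import torch
    from neps.search_spaces.architecture.primitives import Zero

    x = torch.randn(2, 8, 16, 16)
    assert Zero(stride=1)(x).abs().sum() == 0
    assert Zero(stride=2)(x).shape == (2, 8, 8, 8)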
""" - def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True, **kwargs): + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride: int, + padding: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) - C_in = int(C_in) - C_out = int(C_out) + c_in = int(c_in) + c_out = int(c_out) kernel_size = int(kernel_size) stride = int(stride) padding = int(padding) @@ -135,53 +135,60 @@ def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True, **kwa self.op = nn.Sequential( nn.ReLU(inplace=False), nn.Conv2d( - C_in, - C_in, + c_in, + c_in, kernel_size=kernel_size, stride=stride, padding=padding, - groups=C_in, + groups=c_in, bias=False, ), - nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False), - nn.BatchNorm2d(C_in, affine=affine), + nn.Conv2d(c_in, c_in, kernel_size=1, padding=0, bias=False), + nn.BatchNorm2d(c_in, affine=affine), nn.ReLU(inplace=False), nn.Conv2d( - C_in, - C_in, + c_in, + c_in, kernel_size=kernel_size, stride=1, padding=padding, - groups=C_in, + groups=c_in, bias=False, ), - nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size=1, padding=0, bias=False), + nn.BatchNorm2d(c_out, affine=affine), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class DilConv(AbstractPrimitive): - """ - Implementation of a dilated separable convolution as + """Implementation of a dilated separable convolution as used in the DARTS paper. """ - def __init__( - self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True, **kwargs + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride: int, + padding: int, + dilation: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, ): super().__init__(locals()) - C_in = int(C_in) - C_out = int(C_out) + c_in = int(c_in) + c_out = int(c_out) kernel_size = int(kernel_size) stride = int(stride) padding = int(padding) @@ -192,68 +199,66 @@ def __init__( self.op = nn.Sequential( nn.ReLU(inplace=False), nn.Conv2d( - C_in, - C_in, + c_in, + c_in, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, - groups=C_in, + groups=c_in, bias=False, ), - nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size=1, padding=0, bias=False), + nn.BatchNorm2d(c_out, affine=affine), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class Stem(AbstractPrimitive): - """ - This is used as an initial layer directly after the + """This is used as an initial layer directly after the image input. 
""" - def __init__(self, C_out, C_in=3, **kwargs): + def __init__(self, c_out: int, c_in: int = 3, **kwargs): # noqa: D107 super().__init__(locals()) - C_out = int(C_out) + c_out = int(c_out) self.seq = nn.Sequential( - nn.Conv2d(C_in, C_out, 3, padding=1, bias=False), nn.BatchNorm2d(C_out) + nn.Conv2d(c_in, c_out, 3, padding=1, bias=False), nn.BatchNorm2d(c_out) ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.seq(x) class Sequential(AbstractPrimitive): - """ - Implementation of `torch.nn.Sequential` to be used + """Implementation of `torch.nn.Sequential` to be used as op on edges. """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs): # noqa: D107 super().__init__(locals()) self.primitives = args self.op = nn.Sequential(*args) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) - def get_embedded_ops(self): + def get_embedded_ops(self): # noqa: D102 return list(self.primitives) -class MaxPool(AbstractPrimitive): - def __init__(self, kernel_size, stride, **kwargs): +class MaxPool(AbstractPrimitive): # noqa: D101 + def __init__(self, kernel_size: int, stride: int, **kwargs): # noqa: D107 super().__init__(locals()) kernel_size = int(kernel_size) @@ -261,35 +266,42 @@ def __init__(self, kernel_size, stride, **kwargs): self.maxpool = nn.MaxPool2d(kernel_size, stride=stride, padding=1) - def forward(self, x): - x = self.maxpool(x) - return x + def forward(self, x): # noqa: D102 + return self.maxpool(x) class MaxPool1x1(AbstractPrimitive): - """ - Implementation of MaxPool with an optional 1x1 convolution + """Implementation of MaxPool with an optional 1x1 convolution in case stride > 1. The 1x1 convolution is required to increase the number of channels. """ - def __init__(self, kernel_size, stride, C_in, C_out, affine=True, **kwargs): + def __init__( # noqa: D107 + self, + kernel_size: int, + stride: int, + c_in: int, + c_out: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) kernel_size = int(kernel_size) stride = int(stride) - C_in = int(C_in) - C_out = int(C_out) + c_in = int(c_in) + c_out = int(c_out) affine = bool(affine) self.stride = stride self.maxpool = nn.MaxPool2d(kernel_size, stride=stride, padding=1) if stride > 1: - assert C_in is not None and C_out is not None - self.conv = nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) + assert c_in is not None + assert c_out is not None + self.conv = nn.Conv2d(c_in, c_out, 1, stride=1, padding=0, bias=False) + self.bn = nn.BatchNorm2d(c_out, affine=affine) - def forward(self, x): + def forward(self, x): # noqa: D102 x = self.maxpool(x) if self.stride > 1: x = self.conv(x) @@ -298,34 +310,32 @@ def forward(self, x): class AvgPool(AbstractPrimitive): - """ - Implementation of Avergae Pooling. 
- """ + """Implementation of Avergae Pooling.""" - def __init__(self, kernel_size, stride, **kwargs): + def __init__(self, kernel_size: int, stride: int, **kwargs): # noqa: D107 stride = int(stride) super().__init__(locals()) - self.avgpool = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False) + self.avgpool = nn.AvgPool2d( + kernel_size=3, stride=stride, padding=1, count_include_pad=False + ) - def forward(self, x): - x = self.avgpool(x) - return x + def forward(self, x): # noqa: D102 + return self.avgpool(x) class AvgPool1x1(AbstractPrimitive): - """ - Implementation of Avergae Pooling with an optional + """Implementation of Avergae Pooling with an optional 1x1 convolution afterwards. The convolution is required to increase the number of channels if stride > 1. """ - def __init__( + def __init__( # noqa: D107 self, - kernel_size, - stride, - C_in, - C_out, - affine=True, + kernel_size: int, + stride: int, + c_in: int, + c_out: int, + affine: bool = True, # noqa: FBT001, FBT002 **kwargs, ): super().__init__(locals()) @@ -333,11 +343,12 @@ def __init__( self.stride = int(stride) self.avgpool = nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False) if stride > 1: - assert C_in is not None and C_out is not None - self.conv = nn.Conv2d(C_in, C_out, 1, stride=1, padding=0, bias=False) - self.bn = nn.BatchNorm2d(C_out, affine=affine) + assert c_in is not None + assert c_out is not None + self.conv = nn.Conv2d(c_in, c_out, 1, stride=1, padding=0, bias=False) + self.bn = nn.BatchNorm2d(c_out, affine=affine) - def forward(self, x): + def forward(self, x): # noqa: D102 x = self.avgpool(x) if self.stride > 1: x = self.conv(x) @@ -345,8 +356,16 @@ def forward(self, x): return x -class ReLUConvBN(AbstractPrimitive): - def __init__(self, C_in, C_out, kernel_size, stride=1, affine=True, **kwargs): +class ReLUConvBN(AbstractPrimitive): # noqa: D101 + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride: int = 1, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) kernel_size = int(kernel_size) stride = int(stride) @@ -355,113 +374,134 @@ def __init__(self, C_in, C_out, kernel_size, stride=1, affine=True, **kwargs): pad = 0 if int(stride) == 1 and kernel_size == 1 else 1 self.op = nn.Sequential( nn.ReLU(inplace=False), - nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=pad, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size, stride=stride, padding=pad, bias=False), + nn.BatchNorm2d(c_out, affine=affine), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class ConvBnReLU(AbstractPrimitive): - """ - Implementation of 2d convolution, followed by 2d batch normalization and ReLU activation. + """Implementation of 2d convolution, followed by 2d batch normalization and + ReLU activation. 
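Shape sanity check for a few of the renamed primitives (`C_in`/`C_out` -> `c_in`/`c_out`), assuming the constructors behave as defined in this patch; the sizes below are arbitrary.

    import torch
    from neps.search_spaces.architecture.primitives import ReLUConvBN, SepConv, Stem

    x = torch.randn(2, 3, 32, 32)
    h = Stem(c_out=16)(x)                                     # 3 -> 16 channels
    h = ReLUConvBN(c_in=16, c_out=16, kernel_size=3)(h)       # stride 1, padded
    y = SepConv(c_in=16, c_out=32, kernel_size=3, stride=2, padding=1)(h)
    assert h.shape == (2, 16, 32, 32)
    assert y.shape == (2, 32, 16, 16)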
""" - def __init__(self, C_in, C_out, kernel_size, stride=1, affine=True, **kwargs): + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride: int = 1, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) self.kernel_size = kernel_size pad = 0 if stride == 1 and kernel_size == 1 else 1 self.op = nn.Sequential( - nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=pad, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size, stride=stride, padding=pad, bias=False), + nn.BatchNorm2d(c_out, affine=affine), nn.ReLU(inplace=False), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class ConvBn(AbstractPrimitive): - """ - Implementation of 2d convolution, followed by 2d batch normalization and ReLU activation. + """Implementation of 2d convolution, followed by 2d batch normalization and ReLU + activation. """ - def __init__(self, C_in, C_out, kernel_size, stride=1, affine=True, **kwargs): + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + kernel_size: int, + stride=1, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, + ): super().__init__(locals()) self.kernel_size = kernel_size pad = 0 if stride == 1 and kernel_size == 1 else 1 self.op = nn.Sequential( - nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=pad, bias=False), - nn.BatchNorm2d(C_out, affine=affine), + nn.Conv2d(c_in, c_out, kernel_size, stride=stride, padding=pad, bias=False), + nn.BatchNorm2d(c_out, affine=affine), ) - def forward(self, x): + def forward(self, x): # noqa: D102 return self.op(x) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 op_name = super().get_op_name op_name += f"{self.kernel_size}x{self.kernel_size}" return op_name class Concat1x1(AbstractPrimitive): - """ - Implementation of the channel-wise concatination followed by a 1x1 convolution + """Implementation of the channel-wise concatination followed by a 1x1 convolution to retain the channel dimension. """ - def __init__( - self, num_in_edges, C_out, affine=True, **kwargs + def __init__( # noqa: D107 + self, + num_in_edges: int, + c_out: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, ): super().__init__(locals()) self.conv = nn.Conv2d( - num_in_edges * C_out, C_out, kernel_size=1, stride=1, padding=0, bias=False + num_in_edges * c_out, c_out, kernel_size=1, stride=1, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(C_out, affine=affine) + self.bn = nn.BatchNorm2d(c_out, affine=affine) def forward(self, x): - """ - Expecting a list of input tensors. Stacking them channel-wise - and applying 1x1 conv + """Expecting a list of input tensors. Stacking them channel-wise + and applying 1x1 conv. 
""" x = torch.cat(x, dim=1) x = self.conv(x) - x = self.bn(x) - return x + return self.bn(x) -class ResNetBasicblock(AbstractPrimitive): - def __init__( - self, C_in, C_out, stride, affine=True, **kwargs +class ResNetBasicblock(AbstractPrimitive): # noqa: D101 + def __init__( # noqa: D107 + self, + c_in: int, + c_out: int, + stride: int, + affine: bool = True, # noqa: FBT001, FBT002 + **kwargs, ): super().__init__(locals()) - assert stride == 1 or stride == 2, f"invalid stride {stride}" - self.conv_a = ReLUConvBN(C_in, C_out, 3, stride) - self.conv_b = ReLUConvBN(C_out, C_out, 3) + assert stride in (1, 2), f"invalid stride {stride}" + self.conv_a = ReLUConvBN(c_in, c_out, 3, stride) + self.conv_b = ReLUConvBN(c_out, c_out, 3) if stride == 2: self.downsample = nn.Sequential( - # nn.AvgPool2d(kernel_size=2, stride=2, padding=0), - nn.Conv2d(C_in, C_out, kernel_size=1, stride=2, padding=0, bias=False), - nn.BatchNorm2d(C_out), + nn.Conv2d(c_in, c_out, kernel_size=1, stride=2, padding=0, bias=False), + nn.BatchNorm2d(c_out), ) else: self.downsample = None - def forward(self, x): + def forward(self, x): # noqa: D102 basicblock = self.conv_a(x) basicblock = self.conv_b(basicblock) residual = self.downsample(x) if self.downsample is not None else x diff --git a/neps/search_spaces/architecture/topologies.py b/neps/search_spaces/architecture/topologies.py index b45db832..5bb040ed 100644 --- a/neps/search_spaces/architecture/topologies.py +++ b/neps/search_spaces/architecture/topologies.py @@ -1,3 +1,5 @@ +from __future__ import annotations # noqa: D100 + import inspect import queue from abc import ABCMeta, abstractmethod @@ -7,21 +9,23 @@ from .graph import Graph -class AbstractTopology(Graph, metaclass=ABCMeta): - edge_list: list = [] +class AbstractTopology(Graph, metaclass=ABCMeta): # noqa: D101 + edge_list: list = [] # noqa: RUF012 - def __init__(self, name: str = None, scope: str = None, merge_fn: Callable = sum): + def __init__( # noqa: D107 + self, name: str | None = None, scope: str | None = None, merge_fn: Callable = sum + ): super().__init__(name=name, scope=scope) self.merge_fn = merge_fn - def mutate(self): + def mutate(self): # noqa: D102 pass - def sample(self): + def sample(self): # noqa: D102 pass - def create_graph(self, vals: dict): + def create_graph(self, vals: dict): # noqa: C901, D102 def get_args_and_defaults(func): signature = inspect.signature(func) return list(signature.parameters.keys()), { @@ -36,18 +40,18 @@ def get_op_name_from_dict(val: dict): args: dict = {} arg_names, default_args = get_args_and_defaults(op) for arg_name in arg_names: - if arg_name == "self" or arg_name == "kwargs" or arg_name in args.keys(): + if arg_name in ("self", "kwargs") or arg_name in args: continue - if arg_name in val.keys(): + if arg_name in val: args[arg_name] = val[arg_name] - elif arg_name in default_args.keys(): + elif arg_name in default_args: args[arg_name] = default_args[arg_name] else: args[arg_name] = 42 if "groups" in args and args["groups"] != 1: - args["C_in"] = args["groups"] - args["C_out"] = args["groups"] + args["c_in"] = args["groups"] + args["c_out"] = args["groups"] return op(**args).get_op_name @@ -57,24 +61,23 @@ def get_op_name_from_dict(val: dict): if isinstance(val, dict): _val = val _val["op_name"] = get_op_name_from_dict(val) + elif isinstance(val, int): # for synthetic benchmarks + _val = {"op": val, "op_name": val} + elif hasattr(val, "get_op_name"): + _val = {"op": val, "op_name": val.get_op_name} + elif callable(val): + _val = {"op": val, "op_name": 
val.__name__} else: - if isinstance(val, int): # for synthetic benchmarks - _val = {"op": val, "op_name": val} - elif hasattr(val, "get_op_name"): - _val = {"op": val, "op_name": val.get_op_name} - elif callable(val): - _val = {"op": val, "op_name": val.__name__} - else: - raise Exception(f"Cannot extract op name from {val}") + raise Exception(f"Cannot extract op name from {val}") self.edges[u, v].update(_val) @property - def get_op_name(self): + def get_op_name(self): # noqa: D102 return type(self).__name__ - def __call__(self, x): - cur_node_idx = [node for node in self.nodes if self.in_degree(node) == 0][0] + def __call__(self, x): # noqa: D102 + cur_node_idx = next(node for node in self.nodes if self.in_degree(node) == 0) predecessor_inputs = {cur_node_idx: [x]} next_successors = queue.Queue() next_successors.put(cur_node_idx) @@ -103,18 +106,20 @@ def __call__(self, x): return inputs -class AbstractVariableTopology(AbstractTopology): - def __init__(self, name: str = None, scope: str = None, **kwargs): +class AbstractVariableTopology(AbstractTopology): # noqa: D101 + def __init__( # noqa: D107 + self, name: str | None = None, scope: str | None = None, **kwargs + ): super().__init__(name, scope, **kwargs) @staticmethod @abstractmethod - def get_edge_list(**kwargs): + def get_edge_list(**kwargs): # noqa: D102 raise NotImplementedError class _SequentialNEdge(AbstractTopology): - edge_list: list = [] + edge_list: list = [] # noqa: RUF012 def __init__(self, *edge_vals, number_of_edges: int, **kwargs): super().__init__(**kwargs) @@ -132,18 +137,18 @@ def get_edge_list(number_of_edges: int): LinearNEdge = _SequentialNEdge -def get_sequential_n_edge(number_of_edges: int): +def get_sequential_n_edge(number_of_edges: int): # noqa: D103 return partial(_SequentialNEdge, number_of_edges=number_of_edges) -class Residual(AbstractTopology): - edge_list = [ +class Residual(AbstractTopology): # noqa: D101 + edge_list = [ # noqa: RUF012 (1, 2), (1, 3), (2, 3), ] - def __init__(self, *edge_vals, **kwargs): + def __init__(self, *edge_vals, **kwargs): # noqa: D107 super().__init__(**kwargs) self.name = "residual" @@ -151,10 +156,10 @@ def __init__(self, *edge_vals, **kwargs): self.set_scope(self.name) -class Diamond(AbstractTopology): - edge_list = [(1, 2), (1, 3), (2, 4), (3, 4)] +class Diamond(AbstractTopology): # noqa: D101 + edge_list = [(1, 2), (1, 3), (2, 4), (3, 4)] # noqa: RUF012 - def __init__(self, *edge_vals, **kwargs): + def __init__(self, *edge_vals, **kwargs): # noqa: D107 super().__init__(**kwargs) self.name = "diamond" @@ -162,10 +167,10 @@ def __init__(self, *edge_vals, **kwargs): self.set_scope(self.name) -class DiamondMid(AbstractTopology): - edge_list = [(1, 2), (1, 3), (2, 3), (2, 4), (3, 4)] +class DiamondMid(AbstractTopology): # noqa: D101 + edge_list = [(1, 2), (1, 3), (2, 3), (2, 4), (3, 4)] # noqa: RUF012 - def __init__(self, *edge_vals, **kwargs): + def __init__(self, *edge_vals, **kwargs): # noqa: D107 super().__init__(**kwargs) self.name = "diamond_mid" @@ -174,7 +179,7 @@ def __init__(self, *edge_vals, **kwargs): class _DenseNNodeDAG(AbstractTopology): - edge_list: list = [] + edge_list: list = [] # noqa: RUF012 def __init__(self, *edge_vals, number_of_nodes: int, **kwargs): super().__init__(**kwargs) @@ -190,5 +195,5 @@ def get_edge_list(number_of_nodes: int): return [(i + 1, j + 1) for j in range(number_of_nodes) for i in range(j)] -def get_dense_n_node_dag(number_of_nodes: int): +def get_dense_n_node_dag(number_of_nodes: int): # noqa: D103 return partial(_DenseNNodeDAG, 
number_of_nodes=number_of_nodes) diff --git a/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py b/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py index fd01f7c1..77ce9e9f 100644 --- a/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py +++ b/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py @@ -64,9 +64,9 @@ def set_recursive_attribute(op_name, predecessor_values): - in_channels = 64 if predecessor_values is None else predecessor_values["C_out"] + in_channels = 64 if predecessor_values is None else predecessor_values["c_out"] out_channels = in_channels * 2 if op_name == "ResNetBasicblock" else in_channels - return dict(C_in=in_channels, C_out=out_channels) + return dict(c_in=in_channels, c_out=out_channels) def run_pipeline(some_architecture, some_float, some_integer, some_cat): @@ -79,7 +79,7 @@ def run_pipeline(some_architecture, some_float, some_integer, some_cat): model = some_architecture.to_pytorch() model = nn.Sequential( - ops.Stem(base_channels, C_in=in_channels), + ops.Stem(base_channels, c_in=in_channels), model, nn.AdaptiveAvgPool2d(1), nn.Flatten(), diff --git a/neps_examples/experimental/hierarchical_architecture.py b/neps_examples/experimental/hierarchical_architecture.py index 6751cc7a..55ed9144 100644 --- a/neps_examples/experimental/hierarchical_architecture.py +++ b/neps_examples/experimental/hierarchical_architecture.py @@ -54,12 +54,12 @@ def set_recursive_attribute(op_name, predecessor_values): - in_channels = 64 if predecessor_values is None else predecessor_values["C_out"] + in_channels = 64 if predecessor_values is None else predecessor_values["c_out"] out_channels = in_channels * 2 if op_name == "ResNetBasicblock" else in_channels - return dict(C_in=in_channels, C_out=out_channels) + return dict(c_in=in_channels, c_out=out_channels) -def run_pipeline(architecture): +def run_pipeline(architecture: neps.FunctionParameter): in_channels = 3 n_classes = 20 base_channels = 64 @@ -67,7 +67,7 @@ def run_pipeline(architecture): model = architecture.to_pytorch() model = nn.Sequential( - ops.Stem(base_channels, C_in=in_channels), + ops.Stem(base_channels, c_in=in_channels), model, nn.AdaptiveAvgPool2d(1), nn.Flatten(), diff --git a/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py b/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py index c79a7a01..0d2acfb0 100644 --- a/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py +++ b/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py @@ -59,9 +59,9 @@ def set_recursive_attribute(op_name, predecessor_values): - in_channels = 64 if predecessor_values is None else predecessor_values["C_out"] + in_channels = 64 if predecessor_values is None else predecessor_values["c_out"] out_channels = in_channels * 2 if op_name == "ResNetBasicblock" else in_channels - return dict(C_in=in_channels, C_out=out_channels) + return dict(c_in=in_channels, c_out=out_channels) def run_pipeline(architecture): @@ -74,7 +74,7 @@ def run_pipeline(architecture): model = architecture.to_pytorch() model = nn.Sequential( - ops.Stem(base_channels, C_in=in_channels), + ops.Stem(base_channels, c_in=in_channels), model, nn.AdaptiveAvgPool2d(1), nn.Flatten(), diff --git a/pyproject.toml b/pyproject.toml index 3e304c55..92765560 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,19 +3,19 @@ name = "neural-pipeline-search" version = "v0.12.2" description = "Neural 
Pipeline Search helps deep learning experts find the best neural pipeline." authors = [ - "Danny Stoll ", - "Neeratyoy Mallik ", - "Simon Schrodi", - "Eddie Bergman", - "Maciej Janowski", - "Samir Garibov", - "Tarek Abou Chakra", - "Daniel Rogalla", - "Carl Hvarfner", - "Binxin Ru", - "Nils Kober", - "Théophane Vallaeys", - "Frank Hutter", + "Danny Stoll ", + "Neeratyoy Mallik ", + "Simon Schrodi", + "Eddie Bergman", + "Maciej Janowski", + "Samir Garibov", + "Tarek Abou Chakra", + "Daniel Rogalla", + "Carl Hvarfner", + "Binxin Ru", + "Nils Kober", + "Théophane Vallaeys", + "Frank Hutter", ] readme = "README.md" license = "Apache-2.0" @@ -23,10 +23,10 @@ homepage = "https://github.com/automl/neps" repository = "https://github.com/automl/neps" documentation = "https://automl.github.io/neps/" keywords = [ - "Neural Pipeline Search", - "Neural Architecture Search", - "Hyperparameter Optimization", - "AutoML", + "Neural Pipeline Search", + "Neural Architecture Search", + "Hyperparameter Optimization", + "AutoML", ] classifiers = [ "Development Status :: 4 - Beta", @@ -97,33 +97,34 @@ src = ["neps"] # TODO(eddiebergman): Include more of these as we go on in migration exclude = [ - "neps/search_spaces/architecture/**/*.py", - "neps/search_spaces/yaml_search_space_utils.py", - "neps/utils/run_args_from_yaml.py", - "neps/utils/common.py", - "neps/api.py", - "tests", - "neps_examples", - ".bzr", - ".direnv", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".nox", - ".pants.d", - ".ruff_cache", - ".svn", - ".tox", - ".venv", - "__pypackages__", - "_build", - "buck-out", - "build", - "dist", - "node_modules", - "venv", - "docs", + "neps/optimizers/**/*.py", + "neps/search_spaces/architecture/**/*.py", + "neps/search_spaces/yaml_search_space_utils.py", + "neps/search_spaces/architecture", + "neps/utils/run_args_from_yaml.py", + "neps/api.py", + "tests", + "neps_examples", + ".bzr", + ".direnv", + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", + "docs", ] [tool.ruff.lint] @@ -137,52 +138,52 @@ extend-safe-fixes = ["ALL"] dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" select = [ - "A", - # "ANN", # Handled by mypy - "ARG", - "B", - "BLE", - "COM", - "C4", - "D", - # "DTZ", # One day I should know how to utilize timezones and dates... - "E", - # "EXE", Meh - "ERA", - "F", - "FBT", - "I", - # "ISC", # Favours implicit string concatenation - "INP", - # "INT", # I don't understand this one - "N", - "NPY", - "PD", - "PLC", - "PLE", - "PLR", - "PLW", - "PIE", - "PT", - "PTH", - # "PYI", # Specific to .pyi files for type stubs - "Q", - "PGH004", - "RET", - "RUF", - "C90", - "S", - # "SLF", # Private member accessed (sure, it's python) - "SIM", - # "TRY", # Good in principle, would take a lot of work to statisfy - "T10", - "T20", - "TID", - "TCH", - "UP", - "N", - "W", - "YTT", + "A", + # "ANN", # Handled by mypy + "ARG", + "B", + "BLE", + "COM", + "C4", + "D", + # "DTZ", # One day I should know how to utilize timezones and dates... 
+ "E", + # "EXE", Meh + "ERA", + "F", + "FBT", + "I", + # "ISC", # Favours implicit string concatenation + "INP", + # "INT", # I don't understand this one + "N", + "NPY", + "PD", + "PLC", + "PLE", + "PLR", + "PLW", + "PIE", + "PT", + "PTH", + # "PYI", # Specific to .pyi files for type stubs + "Q", + "PGH004", + "RET", + "RUF", + "C90", + "S", + # "SLF", # Private member accessed (sure, it's python) + "SIM", + # "TRY", # Good in principle, would take a lot of work to statisfy + "T10", + "T20", + "TID", + "TCH", + "UP", + "N", + "W", + "YTT", ] ignore = [ @@ -213,31 +214,31 @@ ignore = [ # Exclude a variety of commonly ignored directories. [tool.ruff.lint.per-file-ignores] "tests/*.py" = [ - "S101", - "D101", - "D102", - "D103", - "ANN001", - "ANN201", - "FBT001", - "D100", - "PD901", # X is a bad variable name. (pandas) - "TCH", - "N803", - "C901", # Too complex + "S101", + "D101", + "D102", + "D103", + "ANN001", + "ANN201", + "FBT001", + "D100", + "PD901", # X is a bad variable name. (pandas) + "TCH", + "N803", + "C901", # Too complex ] "__init__.py" = ["I002"] "neps_examples/*" = [ - "INP001", - "I002", - "E741", - "D101", - "D103", - "T20", - "D415", - "ERA001", - "E402", - "E501", + "INP001", + "I002", + "E741", + "D101", + "D103", + "T20", + "D415", + "ERA001", + "E402", + "E501", ] "docs/*" = ["INP001"] @@ -260,12 +261,12 @@ max-args = 10 # Changed from default of 5 [tool.pytest.ini_options] addopts = "--basetemp ./tests_tmpdir -m 'not ci_examples'" markers = [ - "ci_examples", - "core_examples", - "regression_all", - "runtime", - "neps_api", - "summary_csv", + "ci_examples", + "core_examples", + "regression_all", + "runtime", + "neps_api", + "summary_csv", ] filterwarnings = "ignore::DeprecationWarning:torch.utils.tensorboard.*:" @@ -296,10 +297,10 @@ check_untyped_defs = true # TODO(eddiebergman): Improve coverage on these modules [[tool.mypy.overrides]] module = [ - "neps.api", - "neps.optimizers.*", - "neps.search_spaces.architecture.*", - "neps.utils.run_args_from_yaml", + "neps.api", + "neps.optimizers.*", + "neps.search_spaces.architecture.*", + "neps.utils.run_args_from_yaml", ] ignore_errors = true From 6303a650d1bfe6adb05089c6b7073a41e9a1f47b Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 15:03:48 +0200 Subject: [PATCH 40/63] reapply stash --- neps/optimizers/bayesian_optimization/models/gp.py | 6 +++--- neps_examples/basic_usage/hyperparameters.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 0281cd1a..b3f5c2b2 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -7,9 +7,8 @@ from functools import reduce from typing import TYPE_CHECKING, Any, TypeVar -import gpytorch -import gpytorch.constraints import torch +import gpytorch.constraints from botorch.acquisition.analytic import SingleTaskGP from botorch.models.gp_regression import ( get_covar_module_with_dim_scaled_prior, @@ -18,7 +17,8 @@ from botorch.models.transforms.outcome import Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import ScaleKernel -from torch._dynamo.utils import product +from botorch.optim import optimize_acqf, optimize_acqf_mixed +from itertools import product from neps.search_spaces.encoding import ( CategoricalToIntegerTransformer, diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py 
index 6ea897f8..3f346949 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -48,5 +48,5 @@ def run_pipeline(float1, float2, float3, integer1, integer2): root_directory="results/hyperparameters_example", post_run_summary=True, max_evaluations_total=50, - use_prior=True, + use_priors=True, ) From 93943503b61d71c53aaa504782d9093031fc63a8 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 18 Sep 2024 18:20:51 +0200 Subject: [PATCH 41/63] refactor: Simpler ifbo --- neps/optimizers/base_optimizer.py | 18 +- .../acquisition_functions/mf_pi.py | 8 +- .../freeze_thaw_sampler.py | 2 +- .../kernels/grakel_replace/utils.py | 4 +- .../grakel_replace/weisfeiler_lehman.py | 2 +- .../bayesian_optimization/models/ftpfn.py | 95 +-- .../bayesian_optimization/optimizer.py | 11 +- .../bayesian_optimization.yaml | 2 +- .../optimizers/default_searchers/mobster.yaml | 2 +- neps/optimizers/default_searchers/pibo.yaml | 2 +- .../default_searchers/priorband_bo.yaml | 2 +- neps/optimizers/multi_fidelity/ifbo.py | 547 +++++---------- neps/optimizers/multi_fidelity/mf_bo.py | 76 +-- neps/optimizers/multi_fidelity/utils.py | 45 ++ .../multi_fidelity_prior/priorband.py | 4 - neps/utils/common.py | 4 +- ...erarchical_architecture_hierarchical_GP.py | 143 ---- .../user_priors_from_arbitrary_densities.py | 151 ----- .../testing_scripts/default_neps.py | 5 +- tests/test_settings/test_settings.py | 621 +++++++++--------- .../test_yaml_run_args/test_yaml_run_args.py | 75 ++- 21 files changed, 671 insertions(+), 1148 deletions(-) delete mode 100644 neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py delete mode 100644 neps_examples/experimental/user_priors_from_arbitrary_densities.py diff --git a/neps/optimizers/base_optimizer.py b/neps/optimizers/base_optimizer.py index a80f9f75..8dd9e96f 100644 --- a/neps/optimizers/base_optimizer.py +++ b/neps/optimizers/base_optimizer.py @@ -6,7 +6,7 @@ from dataclasses import asdict, dataclass from typing import TYPE_CHECKING, Any -from neps.state.trial import Trial +from neps.state.trial import Report, Trial from neps.utils.data_loading import _get_cost, _get_learning_curve, _get_loss from neps.utils.types import ERROR, ConfigResult, RawConfig, ResultDict @@ -144,16 +144,14 @@ def update_state_post_evaluation( # state["key"] = "value" return state - def get_loss( - self, result: ERROR | ResultDict | float | Trial.Report - ) -> float | ERROR: + def get_loss(self, result: ERROR | ResultDict | float | Report) -> float | ERROR: """Calls result.utils.get_loss() and passes the error handling through. Please use self.get_loss() instead of get_loss() in all optimizer classes. """ # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call. We can also then just use `None` instead of the string `"error"` - if isinstance(result, Trial.Report): + if isinstance(result, Report): return result.loss if result.loss is not None else "error" return _get_loss( @@ -162,16 +160,14 @@ def get_loss( ignore_errors=self.ignore_errors, ) - def get_cost( - self, result: ERROR | ResultDict | float | Trial.Report - ) -> float | ERROR: + def get_cost(self, result: ERROR | ResultDict | float | Report) -> float | ERROR: """Calls result.utils.get_cost() and passes the error handling through. Please use self.get_cost() instead of get_cost() in all optimizer classes. 
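The get_loss/get_cost helpers above normalise several result representations (an error marker, a raw dict, a float, or a Report object) into a single float-or-"error" value. A small sketch of that dispatch with a hypothetical stand-in for Report; the real class lives in neps.state.trial and is not reproduced here.

    # Hypothetical stand-in illustrating the result-normalisation dispatch.
    from __future__ import annotations
    from dataclasses import dataclass

    @dataclass
    class FakeReport:
        loss: float | None = None

    def normalise_loss(result) -> float | str:
        if isinstance(result, FakeReport):
            return result.loss if result.loss is not None else "error"
        if isinstance(result, dict):
            return float(result["loss"])
        if isinstance(result, (int, float)):
            return float(result)
        return "error"

    assert normalise_loss(FakeReport(loss=0.25)) == 0.25
    assert normalise_loss({"loss": 0.5}) == 0.5
    assert normalise_loss("error") == "error"

One thing to double-check in the hunk below: the unchanged body of get_cost still returns result.loss for Report objects, which looks like it was meant to be result.cost.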
""" # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call - if isinstance(result, Trial.Report): + if isinstance(result, Report): return result.loss if result.loss is not None else "error" return _get_cost( @@ -181,7 +177,7 @@ def get_cost( ) def get_learning_curve( - self, result: str | dict | float | Trial.Report + self, result: str | dict | float | Report ) -> list[float] | Any: """Calls result.utils.get_loss() and passes the error handling through. Please use self.get_loss() instead of get_loss() in all optimizer classes. @@ -189,7 +185,7 @@ def get_learning_curve( # TODO(eddiebergman): This is a forward change for whenever we can have optimizers # use `Trial` and `Report`, they already take care of this and save having to do this # `_get_loss` at every call - if isinstance(result, Trial.Report): + if isinstance(result, Report): return result.learning_curve return _get_learning_curve( diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py index ba2e886b..75c7f1e3 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py @@ -125,10 +125,12 @@ def set_state( surrogate_model: Any, observations: MFObservedData, b_step: int | float, - **kwargs, - ): + seed: int = 42, + ) -> None: # set RNG - self.rng = np.random.RandomState(seed=42) + self.rng = np.random.RandomState(seed=seed) + + # TODO: wut is this? for _i in range(len(observations.completed_runs)): self.rng.uniform(-4, -1) self.rng.randint(1, 51) diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py index 3021bfe0..ea22c5b1 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py @@ -120,7 +120,7 @@ def sample( acquisition_function: Callable | None = None, n: int | None = None, set_new_sample_fidelity: int | float | None = None, - ) -> pd.DataFrame: + ) -> pd.Series: """Samples a new set and returns the total set of observed + new configs.""" assert self.observations is not None assert self.pipeline_space is not None diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py index e0ad94f3..fe8f8d06 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py @@ -37,7 +37,7 @@ def calculate_kernel_matrix_as_tensor( K = se_kernel.forward(X, X) if se_kernel is not None else X @ X.t() if normalize: K_diag = torch.sqrt(torch.diag(K)) - K_diag_outer = torch.ger(K_diag, K_diag) + K_diag_outer = torch.outer(K_diag, K_diag) return K / K_diag_outer else: assert Y.shape[1] == X.shape[1], ( @@ -51,7 +51,7 @@ def calculate_kernel_matrix_as_tensor( Kyy = calculate_kernel_matrix_as_tensor( Y, Y, oa=oa, se_kernel=se_kernel, normalize=False ) - K_diag_outer = torch.ger( + K_diag_outer = torch.outer( torch.sqrt(torch.diag(Kyy)), torch.sqrt(torch.diag(Kxx)) ) return K / K_diag_outer diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py 
b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py index f62d0ca0..be35c02a 100644 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py @@ -620,7 +620,7 @@ def generate_graphs_transform(WL_labels_inverse, nl): if self.normalize: X_diag, Y_diag = self.diagonal() if self.as_tensor: - div_ = torch.sqrt(torch.ger(Y_diag, X_diag)) + div_ = torch.sqrt(torch.outer(Y_diag, X_diag)) K /= div_ else: old_settings = np.seterr(divide="ignore") diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 95b02ba0..6f697033 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -58,11 +58,21 @@ def _download_workaround_for_ifbo_issue_10(path: Path | None, version: str) -> P return target_path +def _cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor: + if len(x.shape) == 3 and x.shape[1] == 1: + return x + if len(x.shape) == 2: + return x.reshape(x.shape[0], 1, x.shape[1]) + if len(x.shape) == 1: + return x.reshape(x.shape[0], 1) + raise ValueError(f"Shape not recognized: {x.shape}") + + _CACHED_FTPFN_MODEL: dict[tuple[str, str], FTPFN] = {} -class FTPFNSurrogate: - """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" +class FTPFNModel: + """Wrapper around the IfBO model.""" def __init__( self, @@ -85,57 +95,64 @@ def __init__( _CACHED_FTPFN_MODEL[key] = ftpfn self.ftpfn = ftpfn - self.target_path = self.ftpfn.target_path - self.version = self.ftpfn.version - self.train_x: torch.Tensor | None = None - self.train_y: torch.Tensor | None = None - - @property - def device(self): - return self.ftpfn.device - - def _get_logits(self, test_x: torch.Tensor) -> torch.Tensor: - assert self.train_x is not None, "Train data is not set." - assert self.train_y is not None, "Train data is not set." + self.device = self.ftpfn.device + + def _get_logits( + self, train_x: torch.Tensor, train_y: torch.Tensor, test_x: torch.Tensor + ) -> torch.Tensor: return self.ftpfn.model( - self._cast_tensor_shapes(self.train_x), - self._cast_tensor_shapes(self.train_y), - self._cast_tensor_shapes(test_x), + _cast_tensor_shapes(train_x), + _cast_tensor_shapes(train_y), + _cast_tensor_shapes(test_x), ) - def _cast_tensor_shapes(self, x: torch.Tensor) -> torch.Tensor: - if len(x.shape) == 3 and x.shape[1] == 1: - return x - if len(x.shape) == 2: - return x.reshape(x.shape[0], 1, x.shape[1]) - if len(x.shape) == 1: - return x.reshape(x.shape[0], 1) - raise ValueError(f"Shape not recognized: {x.shape}") - @torch.no_grad() - def get_mean_performance(self, test_x: torch.Tensor) -> torch.Tensor: - logits = self._get_logits(test_x).squeeze() + def get_mean_performance( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + ) -> torch.Tensor: + logits = self._get_logits(train_x, train_y, test_x).squeeze() return self.ftpfn.model.criterion.mean(logits) @torch.no_grad() - def get_pi(self, test_x: torch.Tensor, y_best: torch.Tensor) -> torch.Tensor: - logits = self._get_logits(test_x) + def get_pi( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + # TODO: just calculate from train_y? 
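The module-level _cast_tensor_shapes helper introduced above normalises feature and target tensors into the (n, 1, d) and (n, 1) layouts before they are handed to the FT-PFN transformer. A quick self-contained check of the three accepted shapes; the helper body mirrors the hunk above, only the example tensors are made up.

    import torch

    def cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor:
        # Same logic as _cast_tensor_shapes above.
        if len(x.shape) == 3 and x.shape[1] == 1:
            return x
        if len(x.shape) == 2:
            return x.reshape(x.shape[0], 1, x.shape[1])
        if len(x.shape) == 1:
            return x.reshape(x.shape[0], 1)
        raise ValueError(f"Shape not recognized: {x.shape}")

    assert cast_tensor_shapes(torch.zeros(5, 7)).shape == (5, 1, 7)     # feature matrix
    assert cast_tensor_shapes(torch.zeros(5)).shape == (5, 1)           # target vector
    assert cast_tensor_shapes(torch.zeros(5, 1, 7)).shape == (5, 1, 7)  # already batched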
+ y_best: torch.Tensor, + ) -> torch.Tensor: + logits = self._get_logits(train_x, train_y, test_x) return self.ftpfn.model.criterion.pi( - logits.squeeze(), best_f=(1 - y_best).unsqueeze(1) + logits.squeeze(), + best_f=(1 - y_best).unsqueeze(1), ) @torch.no_grad() - def get_ei(self, test_x: torch.Tensor, y_best: torch.Tensor) -> torch.Tensor: - logits = self._get_logits(test_x) + def get_ei( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + y_best: torch.Tensor, + ) -> torch.Tensor: + logits = self._get_logits(train_x, train_y, test_x) return self.ftpfn.model.criterion.ei( logits.squeeze(), best_f=(1 - y_best).unsqueeze(1) ) @torch.no_grad() def get_lcb( - self, test_x: torch.Tensor, beta: float = (1 - 0.682) / 2 + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + beta: float = (1 - 0.682) / 2, ) -> torch.Tensor: - logits = self._get_logits(test_x) + logits = self._get_logits(train_x, train_y, test_x) return self.ftpfn.model.criterion.ucb( logits=logits, best_f=None, @@ -145,9 +162,13 @@ def get_lcb( @torch.no_grad() def get_ucb( - self, test_x: torch.Tensor, beta: float = (1 - 0.682) / 2 + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + beta: float = (1 - 0.682) / 2, ) -> torch.Tensor: - logits = self._get_logits(test_x) + logits = self._get_logits(train_x, train_y, test_x) return self.ftpfn.model.criterion.ucb( logits=logits, best_f=None, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index d9bd10e3..4188a5fe 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -180,7 +180,6 @@ def __init__( # noqa: D417 device: torch.device | None = None, encoder: TensorEncoder | None = None, treat_fidelity_as_hyperparameters: bool = False, - **kwargs: Any, # TODO: Remove ): """Initialise the BO loop. @@ -250,9 +249,9 @@ def ask( "Seed is not yet implemented for BayesianOptimization" ) - n_trials_completed = len(trials) + n_trials_sampled = len(trials) space = self.pipeline_space - config_id = str(n_trials_completed + 1) + config_id = str(n_trials_sampled + 1) # Fill intitial design data if we don't have any... if self.initial_design_ is None: @@ -278,8 +277,8 @@ def ask( self.initial_design_.extend(configs) # If we havn't passed the intial design phase - if n_trials_completed < len(self.initial_design_): - config = self.initial_design_[n_trials_completed] + if n_trials_sampled < len(self.initial_design_): + config = self.initial_design_[n_trials_sampled] sample = SampledConfig(id=config_id, config=config, previous_config_id=None) return sample, optimizer_state @@ -346,7 +345,7 @@ def ask( # the probability of it being sampled from the prior. 
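For context on the prior-weighting comment above: in piBO-style acquisition weighting, the prior density is raised to an exponent that decays as more configurations are evaluated, so the prior guides early sampling and fades out later. The exact form computed by _pibo_exp_term is not visible in this hunk; the sketch below uses the beta / n decay from the piBO paper purely as an illustration and is not NePS's implementation.

    # Illustrative only: a generic piBO-style weighting, not NePS's _pibo_exp_term.
    import torch

    def pibo_weighted_acq(acq: torch.Tensor, prior_density: torch.Tensor,
                          n_sampled: int, beta: float = 10.0) -> torch.Tensor:
        exponent = beta / max(n_sampled, 1)  # decays as more trials are sampled
        return acq * prior_density.pow(exponent)

    acq = torch.tensor([0.2, 0.5])
    prior = torch.tensor([0.9, 0.1])
    early = pibo_weighted_acq(acq, prior, n_sampled=1)     # prior reshapes the ranking
    late = pibo_weighted_acq(acq, prior, n_sampled=1000)   # prior is nearly neutral
    assert early.argmax().item() == 0
    assert late.argmax().item() == 1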
if self.prior: pibo_exp_term = _pibo_exp_term( - n_trials_completed, + n_trials_sampled, self.encoder.ncols, self.n_initial_design, ) diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml index fb43f97b..9b5a3f37 100644 --- a/neps/optimizers/default_searchers/bayesian_optimization.yaml +++ b/neps/optimizers/default_searchers/bayesian_optimization.yaml @@ -1,6 +1,6 @@ strategy: bayesian_optimization # Arguments that can be modified by the user -surrogate_model: gp # or {"gp_hierarchy"} +surrogate_model: gp acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false acquisition_sampler: mutation # or {"random", "evolution"} diff --git a/neps/optimizers/default_searchers/mobster.yaml b/neps/optimizers/default_searchers/mobster.yaml index 9ce821b3..81afaabb 100644 --- a/neps/optimizers/default_searchers/mobster.yaml +++ b/neps/optimizers/default_searchers/mobster.yaml @@ -8,7 +8,7 @@ sample_default_first: false sample_default_at_target: false # arguments for model -surrogate_model: gp # or {"gp_hierarchy"} +surrogate_model: gp acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false acquisition_sampler: random # or {"mutation", "evolution"} diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml index 8b514ba8..0dc7a7db 100644 --- a/neps/optimizers/default_searchers/pibo.yaml +++ b/neps/optimizers/default_searchers/pibo.yaml @@ -1,6 +1,6 @@ strategy: pibo # Arguments that can be modified by the user -surrogate_model: gp # or {"gp_hierarchy"} +surrogate_model: gp acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false acquisition_sampler: mutation # or {"random", "evolution"} diff --git a/neps/optimizers/default_searchers/priorband_bo.yaml b/neps/optimizers/default_searchers/priorband_bo.yaml index 5a9fd3a9..3deb61d7 100644 --- a/neps/optimizers/default_searchers/priorband_bo.yaml +++ b/neps/optimizers/default_searchers/priorband_bo.yaml @@ -16,7 +16,7 @@ inc_style: dynamic model_based: true # crucial argument to set to allow model-search modelling_type: joint initial_design_size: 10 -surrogate_model: gp # or {"gp_hierarchy"} +surrogate_model: gp acquisition: EI # or {"LogEI", "AEI"} log_prior_weighted: false acquisition_sampler: mutation # or {"random", "evolution"} diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index c1a87862..e8b34d25 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,462 +1,223 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING, Any -from typing_extensions import override +from typing import TYPE_CHECKING, Any, Mapping import numpy as np -import pandas as pd -from neps.optimizers.base_optimizer import BaseOptimizer -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, +from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig +from neps.optimizers.bayesian_optimization.acquisition_functions.mf_pi import MFPI_Random +from neps.optimizers.bayesian_optimization.acquisition_samplers.freeze_thaw_sampler import ( + FreezeThawSampler, ) -from neps.optimizers.multi_fidelity.mf_bo import FreezeThawModel, PFNSurrogate +from neps.optimizers.multi_fidelity.mf_bo import PFNSurrogate from neps.optimizers.multi_fidelity.utils import MFObservedData from 
neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace -from neps.utils.common import instance_from_map +from neps.state.trial import Trial if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) from neps.state.optimizer import BudgetInfo - from neps.utils.types import ConfigResult + + +def _adjust_fidelity_for_freeze_thaw_steps( + pipeline_space: SearchSpace, + step_size: int, +) -> SearchSpace: + """Adjusts the fidelity range to be divisible by `step_size` for Freeze-Thaw.""" + assert pipeline_space.fidelity is not None + + # Check if the fidelity range is divided into equal sized steps by `step_size` + fid_range = pipeline_space.fidelity.upper - pipeline_space.fidelity.lower + remainder = fid_range % step_size + if remainder == 0: + return pipeline_space + + # Adjust the fidelity lower bound to be divisible by `step_size` into equal steps + # Pushing the lower bound of the fidelity space by an offset to ensure equal-sized steps + offset = step_size - remainder + pipeline_space.fidelity.lower += offset + + warnings.warn( + f"Adjusted fidelity lower bound to {pipeline_space.fidelity.lower} " + f"for equal-sized steps of {step_size}.", + UserWarning, + stacklevel=3, + ) + return pipeline_space + + +# TODO: Maybe make this a part of searchspace functionality +def get_budget_value( + space: SearchSpace, + step_size: int, + budget_level: int | float, +) -> int | float: + assert space.fidelity is not None + match space.fidelity: + case IntegerParameter(): + return int(step_size * budget_level + space.fidelity.lower) + case FloatParameter(): + return step_size * budget_level + space.fidelity.lower + case _: + raise NotImplementedError( + f"Fidelity parameter: {space.fidelity}" + f"must be one of the types: " + f"[IntegerParameter, FloatParameter], but is type:" + f"{type(space.fidelity)}" + ) class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" - acquisition: str = "MFPI-random" - def __init__( self, pipeline_space: SearchSpace, - budget: int | None = None, - step_size: int | float = 1, - optimal_assignment: bool = False, # pylint: disable=unused-argument + step_size: int = 1, use_priors: bool = False, sample_default_first: bool = False, sample_default_at_target: bool = False, - loss_value_on_error: None | float = None, - cost_value_on_error: None | float = None, patience: int = 100, - ignore_errors: bool = False, - logger=None, # arguments for model - surrogate_model: str | Any = "ftpfn", surrogate_model_args: dict | None = None, - domain_se_kernel: str | None = None, - graph_kernels: list | None = None, - hp_kernels: list | None = None, - acquisition: str | BaseAcquisition = acquisition, - acquisition_args: dict | None = None, - acquisition_sampler: str | AcquisitionSampler = "freeze-thaw", - acquisition_sampler_args: dict | None = None, - model_policy: Any = PFNSurrogate, initial_design_size: int = 1, ): """Initialise. 
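Since _adjust_fidelity_for_freeze_thaw_steps above only documents itself through its warning message, here is a worked numeric example of the adjustment and of get_budget_value, using plain integers instead of a SearchSpace.

    # Plain-number version of the adjustment above: lower=1, upper=10, step_size=2.
    def adjusted_lower(lower: int, upper: int, step_size: int) -> int:
        remainder = (upper - lower) % step_size
        if remainder == 0:
            return lower
        return lower + (step_size - remainder)  # push lower up so the range splits evenly

    assert adjusted_lower(1, 10, 2) == 2   # range 9 -> offset 1 -> four steps of size 2
    assert adjusted_lower(1, 10, 3) == 1   # range 9 already divisible by 3, unchanged

    # get_budget_value then maps a discrete budget level back onto the fidelity scale:
    lower, step_size = 2, 2
    assert lower + step_size * 3 == 8      # budget level 3 on the adjusted scale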
Args: pipeline_space: Space in which to search - budget: Maximum budget use_priors: Allows random samples to be generated from a default Samples generated from a Gaussian centered around the default value sampling_policy: The type of sampling procedure to use promotion_policy: The type of promotion procedure to use - loss_value_on_error: Setting this and cost_value_on_error to any float will - supress any error during bayesian optimization and will use given loss - value instead. default: None - cost_value_on_error: Setting this and loss_value_on_error to any float will - supress any error during bayesian optimization and will use given cost - value instead. default: None - logger: logger object, or None to use the neps logger sample_default_first: Whether to sample the default configuration first initial_design_size: Number of configurations to sample before starting optimization """ + assert self.pipeline_space.fidelity is not None + # Adjust pipeline space fidelity steps to be equally spaced - pipeline_space = self._adjust_fidelity_for_freeze_thaw_steps( - pipeline_space, step_size - ) - # Super constructor call - super().__init__( - pipeline_space=pipeline_space, - budget=budget, - patience=patience, - loss_value_on_error=loss_value_on_error, - cost_value_on_error=cost_value_on_error, - ignore_errors=ignore_errors, - logger=logger, - ) - self.raw_tabular_space = None # placeholder, can be populated using pre_load_hook - self._budget_list: list[int | float] = [] - self.step_size: int | float = step_size - self.min_budget = self.pipeline_space.fidelity.lower - # TODO: generalize this to work with real data (not benchmarks) - self.max_budget = self.pipeline_space.fidelity.upper - self._initial_design_size = initial_design_size + pipeline_space = _adjust_fidelity_for_freeze_thaw_steps(pipeline_space, step_size) + super().__init__(pipeline_space=pipeline_space, patience=patience) - # TODO: Write use cases for these parameters - self._model_update_failed = False + self.step_size = step_size + self.use_priors = use_priors + self.surrogate_model_args = surrogate_model_args self.sample_default_first = sample_default_first self.sample_default_at_target = sample_default_at_target - self.surrogate_model_name = surrogate_model + self._initial_design_size = initial_design_size - self.use_priors = use_priors - self.total_fevals: int = 0 + self.min_budget: int | float = self.pipeline_space.fidelity.lower + self.max_budget: int | float = self.pipeline_space.fidelity.upper - self.observed_configs = MFObservedData( - columns=["config", "perf", "learning_curves"], - index_names=["config_id", "budget_id"], - ) + fidelity_name = self.pipeline_space.fidelity_name + assert isinstance(fidelity_name, str) + self.fidelity_name: str = fidelity_name - # Preparing model - self.graph_kernels, self.hp_kernels = get_default_kernels( - pipeline_space=pipeline_space, - domain_se_kernel=domain_se_kernel, - graph_kernels=graph_kernels, - hp_kernels=hp_kernels, - optimal_assignment=optimal_assignment, - ) - self.surrogate_model_args = ( - {} if surrogate_model_args is None else surrogate_model_args - ) - self._prep_model_args(self.hp_kernels, self.graph_kernels, pipeline_space) + self._model_update_failed = False - # TODO: Better solution than branching based on the surrogate name is needed - if surrogate_model in ["gp", "gp_hierarchy"]: - model_policy = FreezeThawModel - elif surrogate_model == "ftpfn": - model_policy = PFNSurrogate - else: - raise ValueError("Invalid model option selected!") + def ask( + self, + trials: 
Mapping[str, Trial], + budget_info: BudgetInfo, + optimizer_state: dict[str, Any], + seed: int | None = None, + ) -> tuple[SampledConfig, dict[str, Any]]: + if seed is not None: + raise NotImplementedError("Seed is not yet implemented for IFBO") + + observed_configs = MFObservedData.from_trials(trials) - # The surrogate model is initalized here - self.model_policy = model_policy( - pipeline_space=pipeline_space, - surrogate_model=surrogate_model, + in_initial_design_phase = ( + len(observed_configs.completed_runs) < self._initial_design_size + ) + if in_initial_design_phase: + # TODO: Copy BO setup where we can sample SOBOL or from Prior + self.logger.debug("Sampling from initial design...") + config = self.pipeline_space.sample( + patience=self.patience, user_priors=True, ignore_fidelity=False + ) + _config_dict = config.hp_values() + _config_dict.update({self.fidelity_name: self.min_budget}) + config.set_hyperparameters_from_dict(_config_dict) + _config_id = observed_configs.next_config_id() + return SampledConfig( + config=config.hp_values(), id=_config_id, previous_config_id=None + ), optimizer_state + + # TODO: Maybe just remove `PFNSurrogate` as a whole and use FTPFN directly... + # this depends on whether we can actually create a proper surrogate model abstraction + # TODO: Really all of this should just be passed into an __init__ instead of 3 stage process + model_policy = PFNSurrogate( + pipeline_space=self.pipeline_space, surrogate_model_args=self.surrogate_model_args, step_size=self.step_size, ) - self.acquisition_args = {} if acquisition_args is None else acquisition_args - self.acquisition_args.update( - { - "pipeline_space": self.pipeline_space, - "surrogate_model_name": self.surrogate_model_name, - } - ) - self.acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - kwargs=self.acquisition_args, - ) - self.acquisition_sampler_args = ( - {} if acquisition_sampler_args is None else acquisition_sampler_args - ) - self.acquisition_sampler_args.update( - {"patience": self.patience, "pipeline_space": self.pipeline_space} - ) - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs=self.acquisition_sampler_args, - ) - self.count = 0 - - def _adjust_fidelity_for_freeze_thaw_steps( - self, pipeline_space: SearchSpace, step_size: int - ) -> SearchSpace: - """Adjusts the fidelity range to be divisible by `step_size` for Freeze-Thaw.""" - if not pipeline_space.has_fidelity: - return pipeline_space - # Check if the fidelity range is divided into equal sized steps by `step_size` - remainder = ( - pipeline_space.fidelity.upper - pipeline_space.fidelity.lower - ) % step_size - if remainder == 0: - return pipeline_space - # Adjust the fidelity lower bound to be divisible by `step_size` into equal steps - offset = step_size - remainder - # Pushing the lower bound of the fidelity space by an offset to ensure equal-sized steps - pipeline_space.fidelity.lower += offset - warnings.warn( - f"Adjusted fidelity lower bound to {pipeline_space.fidelity.lower} " - f"for equal-sized steps of {step_size}." 
- ) - return pipeline_space - - def _prep_model_args(self, hp_kernels, graph_kernels, pipeline_space): - if self.surrogate_model_name in ["gp", "gp_hierarchy"]: - # setup for GP implemented in NePS - self.surrogate_model_args.update( - { - # domain_se_kernel=domain_se_kernel, - "hp_kernels": hp_kernels, - "graph_kernels": graph_kernels, - } - ) - if not self.surrogate_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - # if "vectorial_features" not in self.surrogate_model_args: - self.surrogate_model_args["vectorial_features"] = ( - pipeline_space.raw_tabular_space.get_vectorial_dim() - if pipeline_space.has_tabular - else pipeline_space.get_vectorial_dim() - ) + model_policy.observed_configs = observed_configs + model_policy.update_model() - def get_budget_level(self, config: SearchSpace) -> int: - """Calculates the discretized (int) budget level for a given configuration.""" - return int( - np.ceil((config.fidelity.value - config.fidelity.lower) / self.step_size) + # TODO: Replace with more efficient samplers we have from BO + # TODO: Just make this take in everything at __init__ instead of a 2 stage init + acquisition_sampler = FreezeThawSampler( + pipeline_space=self.pipeline_space, patience=self.patience ) - - def get_budget_value(self, budget_level: int | float) -> int | float: - if isinstance(self.pipeline_space.fidelity, IntegerParameter): - budget_val = int( - self.step_size * budget_level + self.pipeline_space.fidelity.lower - ) - elif isinstance(self.pipeline_space.fidelity, FloatParameter): - budget_val = ( - self.step_size * budget_level + self.pipeline_space.fidelity.lower - ) - else: - raise NotImplementedError( - f"Fidelity parameter: {self.pipeline_space.fidelity}" - f"must be one of the types: " - f"[IntegerParameter, FloatParameter], but is type:" - f"{type(self.pipeline_space.fidelity)}" - ) - self._budget_list.append(budget_val) - return budget_val - - def total_budget_spent(self) -> int | float: - """Calculates the toal budget spent so far, in the unit of fidelity specified. - - This is calculated as a function of the fidelity range provided, that takes into - account the minimum budget and the step size. - """ - if len(self.observed_configs.df) == 0: - return 0 - - n_configs = len(self.observed_configs.seen_config_ids) - total_budget_level = sum(self.observed_configs.seen_budget_levels) - total_initial_budget_spent = n_configs * self.pipeline_space.fidelity.lower - return total_initial_budget_spent + total_budget_level * self.step_size - - def is_init_phase(self) -> bool: - return self.num_train_configs < self._initial_design_size - - @property - def num_train_configs(self): - return len(self.observed_configs.completed_runs) - - @override - def load_optimization_state( - self, - previous_results: dict[str, ConfigResult], - pending_evaluations: dict[str, SearchSpace], - budget_info: BudgetInfo | None, - optimizer_state: dict[str, Any], - ) -> None: - """This is basically the fit method. 
- - Args: - previous_results (dict[str, ConfigResult]): [description] - pending_evaluations (dict[str, ConfigResult]): [description] - """ - self.observed_configs = MFObservedData( - columns=["config", "perf", "learning_curves"], - index_names=["config_id", "budget_id"], + acquisition_sampler.set_state( + self.pipeline_space, observed_configs, self.step_size ) - # previous optimization run exists and needs to be loaded - self._load_previous_observations(previous_results) - self.total_fevals = len(previous_results) + len(pending_evaluations) - # account for pending evaluations - self._handle_pending_evaluations(pending_evaluations) + samples = acquisition_sampler.sample(set_new_sample_fidelity=self.min_budget) - # an aesthetic choice more than a functional choice - self.observed_configs.df = self.observed_configs.df.sort_index( - level=self.observed_configs.df.index.names + # TODO: See if we can get away from `set_state` style things + # and just instantiate it with what it needs + acquisition = MFPI_Random( + pipeline_space=self.pipeline_space, surrogate_model_name="ftpfn" ) - # TODO: can we do better than keeping a copy of the observed configs? - # TODO: can we not hide this in load_results and have something that pops out - # more, like a set_state or policy_args - self.model_policy.observed_configs = self.observed_configs - # fit any model/surrogates - init_phase = self.is_init_phase() - if not init_phase: - self._fit_models() - - @classmethod - def _get_config_id_split(cls, config_id: str) -> tuple[str, str]: - # assumes config IDs of the format `[unique config int ID]_[int rung ID]` - ids = config_id.split("_") - _config, _budget = ids[0], ids[1] - return _config, _budget - - def _load_previous_observations(self, previous_results): - def index_data_split(config_id: str, config_val): - _config_id, _budget_id = IFBO._get_config_id_split(config_id) - index = int(_config_id), int(_budget_id) - _data = [ - config_val.config, - self.get_loss(config_val.result), - self.get_learning_curve(config_val.result), - ] - return index, _data - - if len(previous_results) > 0: - index_row = [ - tuple(index_data_split(config_id, config_val)) - for config_id, config_val in previous_results.items() - ] - indices, rows = zip(*index_row, strict=False) - self.observed_configs.add_data(data=list(rows), index=list(indices)) - - def _handle_pending_evaluations(self, pending_evaluations): - for config_id, config_val in pending_evaluations.items(): - _config, _budget_level = config_id.split("_") - index = (int(_config), int(_budget_level)) - - if index not in self.observed_configs.df.index: - # TODO: Validate this - self.observed_configs.add_data( - [config_val, np.nan, [np.nan]], index=index - ) - else: - self.observed_configs.update_data( - { - self.observed_configs.config_col: config_val, - self.observed_configs.perf_col: np.nan, - self.observed_configs.lc_col_name: [np.nan], - }, - index=index, - ) - - def _fit_models(self): - # TODO: Once done with development catch the model update exceptions - # and skip model based suggestions if failed (karibbov) - self._prep_model_args(self.hp_kernels, self.graph_kernels, self.pipeline_space) - self.model_policy.set_state(self.pipeline_space, self.surrogate_model_args) - self.model_policy.update_model() - self.acquisition.set_state( + acquisition.set_state( self.pipeline_space, - self.model_policy.surrogate_model, - self.observed_configs, + model_policy.surrogate_model, + observed_configs, self.step_size, ) - self.acquisition_sampler.set_state( - self.pipeline_space, 
self.observed_configs, self.step_size - ) - def _randomly_promote(self) -> tuple[SearchSpace, int]: - """Samples the initial design. + # `_samples` should have new configs with fidelities set to as required + acq, _samples = acquisition.eval(x=samples, asscalar=True) + # NOTE: len(samples) need not be equal to len(_samples) as `samples` contain + # all (partials + new) configurations obtained from the sampler, but + # in `_samples`, configs are removed that have reached maximum epochs allowed - With an unbiased coin toss (p=0.5) it decides whether to sample a new - configuration or continue a partial configuration, until initial_design_size - configurations have been sampled. - """ - # sampling a configuration ID from the observed ones - _config_ids = np.unique( - self.observed_configs.df.index.get_level_values("config_id").values - ) - _config_id = np.random.choice(_config_ids) - # extracting the config - config = self.observed_configs.df.loc[ - _config_id, self.observed_configs.config_col - ].iloc[0] - # extracting the budget level - budget = self.observed_configs.df.loc[_config_id].index.values[-1] - # calculating fidelity value - new_fidelity = self.get_budget_value(budget + 1) - # setting the config fidelity - config.update_hp_values({config.fidelity_name: new_fidelity}) - return config, _config_id - - def get_config_and_ids( # pylint: disable=no-self-use - self, - ) -> tuple[SearchSpace, str, str | None]: - """...and this is the method that decides which point to query. + best_idx = acq.argmax() + _config_id = best_idx - Returns: - [type]: [description] - """ - config_id = None - previous_config_id = None - if self.is_init_phase(): - # sample a new config till initial design size is satisfied - self.logger.info("sampling...") - config = self.pipeline_space.sample( - patience=self.patience, user_priors=True, ignore_fidelity=False - ) - _config_dict = config.hp_values() - _config_dict.update({config.fidelity_name: self.min_budget}) - config.set_hyperparameters_from_dict(_config_dict) - _config_id = self.observed_configs.next_config_id() - elif self.is_init_phase() or self._model_update_failed: - # promote a config randomly if initial design size is satisfied but the - # initial design budget has not been exhausted - self.logger.info("promoting...") - config, _config_id = self._randomly_promote() + # NOTE: `samples` and `_samples` should share the same index values, hence, + # avoid using `.iloc` and work with `.loc` on these pandas DataFrame/Series + config: SearchSpace = samples.loc[_config_id] + config = config.clone() + + # IMPORTANT: setting the fidelity value appropriately + if best_idx > max(observed_configs.seen_config_ids): + next_fid_value = self.min_budget else: - if self.count == 0: - self.logger.info("\nPartial learning curves as initial design:\n") - self.logger.info(f"{self.observed_configs.get_learning_curves()}\n") - self.count += 1 - # main acquisition call here after initial design is turned off - self.logger.info("acquiring...") - # generates candidate samples for acquisition calculation - samples = self.acquisition_sampler.sample( - set_new_sample_fidelity=self.pipeline_space.fidelity.lower - ) # fidelity values here should be the observations or min. 
fidelity - - # calculating acquisition function values for the candidate samples - acq, _samples = self.acquisition.eval( # type: ignore[attr-defined] - x=samples, asscalar=True + max_observed_fids = ( + observed_configs.get_max_observed_fidelity_level_per_config() ) - acq = pd.Series(acq, index=_samples.index) - - # maximizing acquisition function - best_idx = acq.sort_values().index[-1] - # extracting the config ID for the selected maximizer - _config_id = best_idx # samples.index[_samples.index.values[_idx]] - # `_samples` should have new configs with fidelities set to as required - # NOTE: len(samples) need not be equal to len(_samples) as `samples` contain - # all (partials + new) configurations obtained from the sampler, but - # in `_samples`, configs are removed that have reached maximum epochs allowed - # NOTE: `samples` and `_samples` should share the same index values, hence, - # avoid using `.iloc` and work with `.loc` on these pandas DataFrame/Series - - # assigning config hyperparameters - config = samples.loc[_config_id] - # IMPORTANT: setting the fidelity value appropriately - _fid_value = ( - config.fidelity.lower - if best_idx > max(self.observed_configs.seen_config_ids) - else ( - self.get_budget_value( - self.observed_configs.get_max_observed_fidelity_level_per_config().loc[ - best_idx - ] - ) - + self.step_size # ONE-STEP FIDELITY QUERY for freeze-thaw - ) + best_configs_max_fid = max_observed_fids.loc[best_idx] + budget_value = get_budget_value( + space=self.pipeline_space, + step_size=self.step_size, + budget_level=best_configs_max_fid, ) - config.update_hp_values({config.fidelity_name: _fid_value}) - # generating correct IDs - if _config_id in self.observed_configs.seen_config_ids: - config_id = f"{_config_id}_{self.get_budget_level(config)}" - previous_config_id = f"{_config_id}_{self.get_budget_level(config) - 1}" + next_fid_value = budget_value + self.step_size + + config.update_hp_values({self.fidelity_name: next_fid_value}) + + # Lastly, we need to generate config id for it. 
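The id bookkeeping below follows the <config>_<budget-level> convention that MFObservedData.from_trials, elsewhere in this patch, parses back apart with config_id.split("_"). A quick numeric check of the level computation with min_budget=2 and step_size=2; note also that the previous id below is built from budget_value - 1, where budget_level - 1 looks like the intended quantity.

    # Worked example of the budget-level arithmetic used for config ids below.
    import numpy as np

    min_budget, step_size = 2, 2
    next_fid_value = 8
    budget_level = int(np.ceil((next_fid_value - min_budget) / step_size))
    assert budget_level == 3

    config_id = f"7_{budget_level}"        # continuing existing config 7 one step deeper
    previous_config_id = f"7_{budget_level - 1}"
    assert (config_id, previous_config_id) == ("7_3", "7_2")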
+ budget_level = int(np.ceil((next_fid_value - self.min_budget) / self.step_size)) + if _config_id in observed_configs.seen_config_ids: + config_id = f"{_config_id}_{budget_level}" + previous_config_id = f"{_config_id}_{budget_value - 1}" else: - config_id = f"{self.observed_configs.next_config_id()}_{self.get_budget_level(config)}" + config_id = f"{observed_configs.next_config_id()}_{budget_level}" - return config.hp_values(), config_id, previous_config_id # type: ignore + return SampledConfig( + config=config.hp_values(), id=config_id, previous_config_id=previous_config_id + ), optimizer_state diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index 790c833a..729e3718 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -6,7 +6,9 @@ import torch from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping +from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate from neps.optimizers.multi_fidelity.utils import ( + MFObservedData, get_tokenized_data, get_training_data_for_freeze_thaw, ) @@ -183,66 +185,30 @@ def sample_new_config( return config -class FreezeThawModel: - """Designed to work with model search in unit step multi-fidelity algorithms.""" +class PFNSurrogate: + """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" def __init__( self, - pipeline_space, + pipeline_space: SearchSpace, surrogate_model: str = "ftpfn", surrogate_model_args: dict | None = None, step_size: int = 1, ): - self.observed_configs = None + self.train_x = None + self.train_y = None + self.observed_configs: MFObservedData | None = None self.pipeline_space = pipeline_space self.surrogate_model_name = surrogate_model self.surrogate_model_args = ( surrogate_model_args if surrogate_model_args is not None else {} ) - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - self.surrogate_model_name, - name="surrogate model", - kwargs=self.surrogate_model_args, - ) - self.step_size = step_size - - def _fantasize_pending(self, train_x, train_y, pending_x): - raise NotImplementedError("Fantasization not implemented yet!") - - def _fit(self, train_x, train_y, train_lcs): - raise NotImplementedError("Predict not implemented yet!") - - def _predict(self, test_x) -> torch.Tensor: - raise NotImplementedError("Predict not implemented yet!") - def set_state( - self, - pipeline_space, - surrogate_model_args, - **kwargs, # pylint: disable=unused-argument - ): - self.pipeline_space = pipeline_space - self.surrogate_model_args = ( - surrogate_model_args if surrogate_model_args is not None else {} - ) - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - self.surrogate_model_name, - name="surrogate model", - kwargs=self.surrogate_model_args, - ) - - -class PFNSurrogate(FreezeThawModel): - """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.train_x = None - self.train_y = None + # TODO: Lift this into the responsility of the caller of this function. 
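A note on the wrapper below: _fit performs no gradient training, since the FT-PFN is a prior-fitted network, so "fitting" only means storing (train_x, train_y) as the context that later forward passes condition on. Also worth reconciling: ftpfn.py above renames FTPFNSurrogate to FTPFNModel and makes its methods take train_x/train_y per call, while this file still imports FTPFNSurrogate and calls get_mean_performance(test_x) without context arguments. A minimal sketch of the stateless call pattern; FakeFTPFN is a made-up stand-in, not the real ifbo API.

    # Made-up stand-in showing the "context in, prediction out" usage pattern.
    import torch

    class FakeFTPFN:
        def get_mean_performance(self, train_x, train_y, test_x):
            # A real FT-PFN conditions a transformer on (train_x, train_y); here we
            # just return the train mean so the sketch runs without model weights.
            return torch.ones(test_x.shape[0]) * train_y.mean()

    model = FakeFTPFN()
    train_x, train_y = torch.rand(10, 5), torch.rand(10)
    test_x = torch.rand(3, 5)
    mean = model.get_mean_performance(train_x, train_y, test_x)
    assert mean.shape == (3,)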
+ self.surrogate_model = FTPFNSurrogate(**surrogate_model_args) + self.step_size = step_size - def update_model(self): + def update_model(self) -> None: # tokenize the observations idxs, steps, configs, performance = get_training_data_for_freeze_thaw( self.observed_configs.df.loc[self.observed_configs.completed_runs_index], @@ -295,7 +261,7 @@ def update_model(self): # refit the model, on completed runs + fantasized pending runs self._fit(train_x, train_y) - def _fit(self, train_x: torch.Tensor, train_y: torch.Tensor): # pylint: disable=unused-argument + def _fit(self, train_x: torch.Tensor, train_y: torch.Tensor) -> None: # no training required,, only preprocessing the training data as context during inference assert self.surrogate_model is not None, "Surrogate model not set!" self.surrogate_model.train_x = train_x @@ -306,10 +272,12 @@ def _predict(self, test_x: torch.Tensor) -> torch.Tensor: self.surrogate_model.train_x is not None and self.surrogate_model.train_y is not None ), "Model not trained yet!" - if self.surrogate_model_name == "ftpfn": - mean = self.surrogate_model.get_mean_performance(test_x) - if mean.is_cuda: - mean = mean.cpu() - return mean - # check neps/optimizers/bayesian_optimization/models/__init__.py for options - raise ValueError(f"Surrogate model {self.surrogate_model_name} not supported!") + return self.surrogate_model.get_mean_performance(test_x) + + def set_state( + self, + pipeline_space, + surrogate_model_args, + **kwargs, # pylint: disable=unused-argument + ): + self.pipeline_space = pipeline_space diff --git a/neps/optimizers/multi_fidelity/utils.py b/neps/optimizers/multi_fidelity/utils.py index 0158fbdf..8e7b4910 100644 --- a/neps/optimizers/multi_fidelity/utils.py +++ b/neps/optimizers/multi_fidelity/utils.py @@ -4,6 +4,7 @@ from collections.abc import Sequence from copy import deepcopy from typing import TYPE_CHECKING, Any +from typing_extensions import Self import numpy as np import pandas as pd @@ -353,6 +354,50 @@ def get_max_observed_fidelity_level_per_config(self) -> pd.Series: def token_ids(self) -> np.ndarray: return self.df.index.values + @classmethod + def from_trials(cls, trials: Mapping[str, Trial]) -> Self: + observed_configs = MFObservedData( + columns=["config", "perf", "learning_curves"], + index_names=["config_id", "budget_id"], + ) + + def _data(trial: Trial) -> Any: + # Considered pending + if report is None: + loss = np.nan + lc = [np.nan] + else: + loss = report.loss if report.loss is not None else "error" + lc = ( + report.learning_curve + if report.learning_curve is not None + else "error" + ) + + return [trial.config, loss, lc] + + # previous optimization run exists and needs to be loaded + def index_data_split( + config_id: str, trial: Trial + ) -> tuple[tuple[int, int], list]: + _config_id, _budget_id = config_id.split("_") + index = int(_config_id), int(_budget_id) + return index, _data(trial) + + if len(trials) > 0: + index_row = [ + tuple(index_data_split(trial_id, trial)) + for trial_id, trial in trials.items() + ] + indices, rows = zip(*index_row, strict=True) + observed_configs.add_data(data=list(rows), index=list(indices)) + + # an aesthetic choice more than a functional choice + observed_configs.df = observed_configs.df.sort_index( + level=self.observed_configs.df.index.names, inplace=True + ) + return observed_configs + if __name__ == "__main__": # TODO: Either delete these or convert them to tests (karibbov) diff --git a/neps/optimizers/multi_fidelity_prior/priorband.py 
b/neps/optimizers/multi_fidelity_prior/priorband.py index bdf3a567..f4bc067b 100644 --- a/neps/optimizers/multi_fidelity_prior/priorband.py +++ b/neps/optimizers/multi_fidelity_prior/priorband.py @@ -296,8 +296,6 @@ def __init__( initial_design_size: int | None = None, model_policy: typing.Any = ModelPolicy, surrogate_model: str | typing.Any = "gp", - domain_se_kernel: str | None = None, - hp_kernels: list | None = None, surrogate_model_args: dict | None = None, acquisition: str | BaseAcquisition = "EI", log_prior_weighted: bool = False, @@ -339,8 +337,6 @@ def __init__( bo_args = { "surrogate_model": surrogate_model, - "domain_se_kernel": domain_se_kernel, - "hp_kernels": hp_kernels, "surrogate_model_args": surrogate_model_args, "acquisition": acquisition, "log_prior_weighted": log_prior_weighted, diff --git a/neps/utils/common.py b/neps/utils/common.py index 2c6f9d35..d0fb2137 100644 --- a/neps/utils/common.py +++ b/neps/utils/common.py @@ -53,7 +53,7 @@ def load_checkpoint( if not checkpoint_path.exists(): return None - checkpoint = torch.load(checkpoint_path) + checkpoint = torch.load(checkpoint_path, weights_only=True) if model is not None and "model_state_dict" in checkpoint: model.load_state_dict(checkpoint["model_state_dict"]) @@ -141,7 +141,7 @@ def load_lightning_checkpoint( assert len(ckpt_files) == 1 checkpoint_path = ckpt_files[0] - checkpoint = torch.load(checkpoint_path) + checkpoint = torch.load(checkpoint_path, weights_only=True) return checkpoint_path, checkpoint diff --git a/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py b/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py deleted file mode 100644 index 0d2acfb0..00000000 --- a/neps_examples/experimental/hierarchical_architecture_hierarchical_GP.py +++ /dev/null @@ -1,143 +0,0 @@ -import logging -import time - -from torch import nn - -import neps -from neps.optimizers.bayesian_optimization.kernels import GraphKernelMapping -from neps.optimizers.bayesian_optimization.models.gp_hierarchy import ( - ComprehensiveGPHierarchy, -) -from neps.search_spaces.architecture import primitives as ops -from neps.search_spaces.architecture import topologies as topos - -primitives = { - "id": ops.Identity(), - "conv3x3": {"op": ops.ReLUConvBN, "kernel_size": 3, "stride": 1, "padding": 1}, - "conv1x1": {"op": ops.ReLUConvBN, "kernel_size": 1}, - "avg_pool": {"op": ops.AvgPool1x1, "kernel_size": 3, "stride": 1}, - "downsample": {"op": ops.ResNetBasicblock, "stride": 2}, - "residual": topos.Residual, - "diamond": topos.Diamond, - "linear": topos.get_sequential_n_edge(2), - "diamond_mid": topos.DiamondMid, -} - -structure = { - "S": [ - "diamond D2 D2 D1 D1", - "diamond D1 D2 D2 D1", - "diamond D1 D1 D2 D2", - "linear D2 D1", - "linear D1 D2", - "diamond_mid D1 D2 D1 D2 D1", - "diamond_mid D2 D2 Cell D1 D1", - ], - "D2": [ - "diamond D1 D1 D1 D1", - "linear D1 D1", - "diamond_mid D1 D1 Cell D1 D1", - ], - "D1": [ - "diamond D1Helper D1Helper Cell Cell", - "diamond Cell Cell D1Helper D1Helper", - "diamond D1Helper Cell Cell D1Helper", - "linear D1Helper Cell", - "linear Cell D1Helper", - "diamond_mid D1Helper D1Helper Cell Cell Cell", - "diamond_mid Cell D1Helper D1Helper D1Helper Cell", - ], - "D1Helper": ["linear Cell downsample"], - "Cell": [ - "residual OPS OPS OPS", - "diamond OPS OPS OPS OPS", - "linear OPS OPS", - "diamond_mid OPS OPS OPS OPS OPS", - ], - "OPS": ["conv3x3", "conv1x1", "avg_pool", "id"], -} - - -def set_recursive_attribute(op_name, predecessor_values): - in_channels = 64 if 
predecessor_values is None else predecessor_values["c_out"] - out_channels = in_channels * 2 if op_name == "ResNetBasicblock" else in_channels - return dict(c_in=in_channels, c_out=out_channels) - - -def run_pipeline(architecture): - start = time.time() - - in_channels = 3 - n_classes = 20 - base_channels = 64 - out_channels = 512 - - model = architecture.to_pytorch() - model = nn.Sequential( - ops.Stem(base_channels, c_in=in_channels), - model, - nn.AdaptiveAvgPool2d(1), - nn.Flatten(), - nn.Linear(out_channels, n_classes), - ) - - number_of_params = sum(p.numel() for p in model.parameters()) - y = abs(1.5e7 - number_of_params) / 1.5e7 - - end = time.time() - - return { - "loss": y, - "info_dict": { - "test_score": y, - "train_time": end - start, - }, - } - - -pipeline_space = dict( - architecture=neps.FunctionParameter( - set_recursive_attribute=set_recursive_attribute, - structure=structure, - primitives=primitives, - name="makrograph", - return_graph_per_hierarchy=True, - ) -) - -early_hierarchies_considered = "0_1_2_3" -hierarchy_considered = [int(hl) for hl in early_hierarchies_considered.split("_")] -graph_kernels = ["wl"] * (len(hierarchy_considered) + 1) -wl_h = [2, 1] + [2] * (len(hierarchy_considered) - 1) -graph_kernels = [ - GraphKernelMapping[kernel]( - h=wl_h[j], - oa=False, - se_kernel=None, - ) - for j, kernel in enumerate(graph_kernels) -] -surrogate_model = ComprehensiveGPHierarchy -surrogate_model_args = { - "graph_kernels": graph_kernels, - "hp_kernels": [], - "verbose": False, - "hierarchy_consider": hierarchy_considered, - "d_graph_features": 0, - "vectorial_features": None, -} - -logging.basicConfig(level=logging.INFO) -neps.run( - run_pipeline=run_pipeline, - pipeline_space=pipeline_space, - root_directory="results/hierarchical_architecture_example_new", - max_evaluations_total=15, - searcher="bayesian_optimization", - surrogate_model=surrogate_model, - surrogate_model_args=surrogate_model_args, -) - -previous_results, pending_configs = neps.status( - "results/hierarchical_architecture_example_new" -) diff --git a/neps_examples/experimental/user_priors_from_arbitrary_densities.py b/neps_examples/experimental/user_priors_from_arbitrary_densities.py deleted file mode 100644 index 4c734cd2..00000000 --- a/neps_examples/experimental/user_priors_from_arbitrary_densities.py +++ /dev/null @@ -1,151 +0,0 @@ -import neps - -def run_pipeline(some_float, some_integer, some_cat): - if some_cat != "a": - y = some_float + some_integer - else: - y = -some_float - some_integer - return y - -# ======================================================================================== -# Current API -# User prior is given as a default value and a confidence level specified in the parameter itself -pipeline_space = dict( - some_float=neps.FloatParameter( - lower=1, upper=1000, log=True, default=900, default_confidence="medium" - ), - some_integer=neps.IntegerParameter( - lower=0, upper=50, default=35, default_confidence="low" - ), - some_cat=neps.CategoricalParameter( - choices=["a", "b", "c"], default="a", default_confidence="high" - ) -) -neps.run( - run_pipeline=run_pipeline, - pipeline_space=pipeline_space, - root_directory="results", - max_evaluations_total=15, -) - -# ======================================================================================== -# New API, variant 01 -# User prior is passed to neps.run and not specified in the pipeline_space -# The prior is given as one of the following: -# 1) A (non-factorized) density function that returns the likelihood of a given 
parameter configuration -# 2) A dicttionary of marginal densities for each parameter. Then the factorized density is used. -# 3) A dictionary of default values and confidence levels for each parameter. Then a gaussian prior is used. - -pipeline_space = dict( - some_float=neps.FloatParameter(lower=1, upper=1000, log=True), - some_integer=neps.IntegerParameter(lower=0, upper=50), - some_cat=neps.CategoricalParameter(choices=["a", "b", "c"]) -) - -# 1) A (non-factorized) density function that returns the likelihood of a given parameter configuration -def prior_01(some_float, some_integer, some_cat): - # some exponential distribution - if some_cat != "a": - return np.exp(-(some_float + some_integer - 1)) - else: - return np.exp(-(-some_float - some_integer + 1050)) - -# 2) A dictionary of marginal densities for each parameter. Then the factorized density is used. -prior_02 = dict( - some_float=lambda x: 1/400 if 800 < x < 1000 else 1/1600, # prior on interval [800, 1000] - some_integer=lambda k: 30**k/np.math.factorial(k) * np.exp(-k), # poisson prior on integers k=30 - some_cat=lambda x: 1/2*(x=="b") + 1/3*(x=="c") + 1/6*(x=="a") -) - -# 3) A dictionary of default values and confidence levels for each parameter. Then a gaussian prior is used. -prior_03 = dict( - some_float=dict(default=900, default_confidence="medium"), - some_integer=dict(default=35, default_confidence="low"), - some_cat=dict(default="a", default_confidence="high") -) - -# Combination of 2) and 3) -prior_04 = dict( - some_float=dict(default=900, default_confidence="medium"), - some_integer=lambda k: 30**k/np.math.factorial(k) * np.exp(-k), # poisson prior on integers k=30 - some_cat=dict(default="a", default_confidence="high") -) - -# Pass the prior to neps.run - -neps.run( - prior=prior_01, # or prior_02 or prior_03 or prior_04 - run_pipeline=run_pipeline, - pipeline_space=pipeline_space, - root_directory="results", - max_evaluations_total=15, -) - -# ======================================================================================== -# New API, variant 02 -# User prior is specfied in the pipeline_space and not directly passed to neps.run -# Same possibiities for priors as in variant 01 - -# 1) A (non-factorized) density function that returns the likelihood of a given parameter configuration -def prior_01(some_float, some_integer, some_cat): - # some exponential distribution - if some_cat != "a": - return np.exp(-(some_float + some_integer - 1)) - else: - return np.exp(-(-some_float - some_integer + 1050)) - -pipeline_space_01 = dict( - some_float=neps.FloatParameter(lower=1, upper=1000, log=True), - some_integer=neps.IntegerParameter(lower=0, upper=50), - some_cat=neps.CategoricalParameter(choices=["a", "b", "c"]), - _prior=prior_01 -) - -# 2) A dictionary of marginal densities for each parameter. Then the factorized density is used. -pipeline_space_02 = dict( - some_float=neps.FloatParameter( - lower=1, upper=1000, log=True, - prior_fun=lambda x: 1/400 if 800 < x < 1000 else 1/1600 - ), - some_integer=neps.IntegerParameter(lower=0, upper=50, - prior_fun=lambda k: 30**k/np.math.factorial(k) * np.exp(-k) -), - some_cat=neps.CategoricalParameter(choices=["a", "b", "c"], - prior_fun=lambda x: 1/2*(x=="b") + 1/3*(x=="c") + 1/6*(x=="a") - ) -) - -# 3) A dictionary of default values and confidence levels for each parameter. Then a gaussian prior is used. 
-# Same as in the current API -pipeline_space_03 = dict( - some_float=neps.FloatParameter( - lower=1, upper=1000, log=True, default=900, default_confidence="medium" - ), - some_integer=neps.IntegerParameter( - lower=0, upper=50, default=35, default_confidence="low" - ), - some_cat=neps.CategoricalParameter( - choices=["a", "b", "c"], default="a", default_confidence="high" - ) -) - -# Combination of 2) and 3) -pipeline_space_04 = dict( - some_float=neps.FloatParameter( - lower=1, upper=1000, log=True, default=900, default_confidence="medium", - ), - some_integer=neps.IntegerParameter( - lower=0, upper=50, - prior_fun=lambda k: 30**k/np.math.factorial(k) * np.exp(-k) - ), - some_cat=neps.CategoricalParameter( - choices=["a", "b", "c"], default="a", default_confidence="high") -) - -# Pass the pipeline_space to neps.run -neps.run( - run_pipeline=run_pipeline, - pipeline_space=pipeline_space_01, # or pipeline_space_02 or pipeline_space_03 or pipeline_space_04 - root_directory="results", - max_evaluations_total=15, -) diff --git a/tests/test_neps_api/testing_scripts/default_neps.py b/tests/test_neps_api/testing_scripts/default_neps.py index 5384042a..370c6255 100644 --- a/tests/test_neps_api/testing_scripts/default_neps.py +++ b/tests/test_neps_api/testing_scripts/default_neps.py @@ -2,9 +2,6 @@ import neps from neps.optimizers.bayesian_optimization.kernels import GraphKernelMapping -from neps.optimizers.bayesian_optimization.models.gp_hierarchy import ( - ComprehensiveGPHierarchy, -) pipeline_space_fidelity_priors = dict( val1=neps.FloatParameter(lower=-10, upper=10, default=1), @@ -63,7 +60,7 @@ def run_pipeline(val1, val2): ) for j, kernel in enumerate(graph_kernels) ] -surrogate_model = ComprehensiveGPHierarchy +surrogate_model = surrogate_model_args = { "graph_kernels": graph_kernels, "hp_kernels": [], diff --git a/tests/test_settings/test_settings.py b/tests/test_settings/test_settings.py index fcdac758..fe649563 100644 --- a/tests/test_settings/test_settings.py +++ b/tests/test_settings/test_settings.py @@ -2,8 +2,12 @@ import pytest import neps from neps.utils.run_args import get_run_args_from_yaml -from tests.test_yaml_run_args.test_yaml_run_args import (run_pipeline, hook1, hook2, - pipeline_space) +from tests.test_yaml_run_args.test_yaml_run_args import ( + run_pipeline, + hook1, + hook2, + pipeline_space, +) from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization from typing import Union, Callable, Dict, List, Type @@ -16,276 +20,291 @@ @pytest.mark.neps_api -@pytest.mark.parametrize("func_args, yaml_args, expected_output", [ - ( - { # only essential arguments provided by func_args, no yaml - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "run_args": Default(None), - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": 10, - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {}, - } - , - Default(None), - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": False, - "post_run_summary": 
True, - "development_stage_id": None, - "task_id": None, - "max_evaluations_total": 10, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": "default", - "searcher_kwargs": {} - } - ), - ({ # only required elements of run_args - "run_pipeline": Default(None), - "root_directory": Default(None), - "pipeline_space": Default(None), - "run_args": Default(None), - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": Default(None), - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {}, - }, - "/run_args_required.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": None, - "task_id": None, - "max_evaluations_total": 10, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": "default", - "searcher_kwargs": {} - }), - ({ # required via func_args, optional via yaml - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "run_args": "tests/path/to/run_args", # will be ignored by Settings - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": 10, - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {}, - }, - "/run_args_optional.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": True, - "post_run_summary": False, - "development_stage_id": None, - "task_id": None, - "max_evaluations_total": 10, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": "hyperband", - "searcher_kwargs": {} - }), - ({ # overwrite all yaml values - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "run_args": "test", - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": 5, - "task_id": None, - "max_evaluations_total": 17, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - 
"pre_load_hooks": None, - "searcher": "default", - "searcher_kwargs": {}, - } - , - "/overwrite_run_args.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": 5, - "task_id": None, - "max_evaluations_total": 17, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": "default", - "searcher_kwargs": {}, - } - ), - ({ # optimizer args special case - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "run_args": "test", - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": 5, - "task_id": None, - "max_evaluations_total": 17, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": Default("default"), - "searcher_kwargs": {"initial_design_type": "max_budget", - "use_priors": False, - "random_interleave_prob": 0.0, - "sample_default_first": False, - "sample_default_at_target": False}, - } - , - "/run_args_optimizer_settings.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": False, - "post_run_summary": True, - "development_stage_id": 5, - "task_id": None, - "max_evaluations_total": 17, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - "max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": { - "strategy": "hyperband", - "eta": 3, - "initial_design_type": "max_budget", - "use_priors": False, - "random_interleave_prob": 0.0, - "sample_default_first": False, - "sample_default_at_target": False}, - "searcher_kwargs": {"initial_design_type": "max_budget", - "use_priors": False, - "random_interleave_prob": 0.0, - "sample_default_first": False, - "sample_default_at_target": False}, - }), -({ # load optimizer with args - "run_pipeline": Default(None), - "root_directory": Default(None), - "pipeline_space": Default(None), - "run_args": Default(None), - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": Default(None), - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {"random_interleave_prob": 0.2, - "initial_design_size": 9}, - } - , - "/run_args_optimizer_outside.yaml", - { - "run_pipeline": run_pipeline, - "root_directory": "path/to/root_directory", - "pipeline_space": pipeline_space, - "overwrite_working_directory": True, - "post_run_summary": True, - "development_stage_id": None, - "task_id": None, - "max_evaluations_total": 10, - "max_evaluations_per_run": None, - "continue_until_max_evaluation_completed": False, - 
"max_cost_total": None, - "ignore_errors": False, - "loss_value_on_error": None, - "cost_value_on_error": None, - "pre_load_hooks": None, - "searcher": my_bayesian, - "searcher_kwargs": {"acquisition": "EI", - "acquisition_sampler": "random", - "random_interleave_prob": 0.2, - "initial_design_size": 9, - "surrogate_model": "gp" - }, - }) -]) +@pytest.mark.parametrize( + "func_args, yaml_args, expected_output", + [ + ( + { # only essential arguments provided by func_args, no yaml + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "run_args": Default(None), + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": 10, + "max_evaluations_per_run": Default(None), + "continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": Default(None), + "searcher": Default("default"), + "searcher_kwargs": {}, + }, + Default(None), + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": None, + "task_id": None, + "max_evaluations_total": 10, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "default", + "searcher_kwargs": {}, + }, + ), + ( + { # only required elements of run_args + "run_pipeline": Default(None), + "root_directory": Default(None), + "pipeline_space": Default(None), + "run_args": Default(None), + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": Default(None), + "max_evaluations_per_run": Default(None), + "continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": Default(None), + "searcher": Default("default"), + "searcher_kwargs": {}, + }, + "/run_args_required.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": None, + "task_id": None, + "max_evaluations_total": 10, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "default", + "searcher_kwargs": {}, + }, + ), + ( + { # required via func_args, optional via yaml + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "run_args": "tests/path/to/run_args", # will be ignored by Settings + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": 10, + "max_evaluations_per_run": Default(None), + 
"continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": Default(None), + "searcher": Default("default"), + "searcher_kwargs": {}, + }, + "/run_args_optional.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": True, + "post_run_summary": False, + "development_stage_id": None, + "task_id": None, + "max_evaluations_total": 10, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "hyperband", + "searcher_kwargs": {}, + }, + ), + ( + { # overwrite all yaml values + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "run_args": "test", + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": 5, + "task_id": None, + "max_evaluations_total": 17, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "default", + "searcher_kwargs": {}, + }, + "/overwrite_run_args.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": 5, + "task_id": None, + "max_evaluations_total": 17, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": "default", + "searcher_kwargs": {}, + }, + ), + ( + { # optimizer args special case + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "run_args": "test", + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": 5, + "task_id": None, + "max_evaluations_total": 17, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": Default("default"), + "searcher_kwargs": { + "initial_design_type": "max_budget", + "use_priors": False, + "random_interleave_prob": 0.0, + "sample_default_first": False, + "sample_default_at_target": False, + }, + }, + "/run_args_optimizer_settings.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": False, + "post_run_summary": True, + "development_stage_id": 5, + "task_id": None, + "max_evaluations_total": 17, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": { + "strategy": "hyperband", + "eta": 3, + "initial_design_type": "max_budget", + "use_priors": False, + "random_interleave_prob": 0.0, + 
"sample_default_first": False, + "sample_default_at_target": False, + }, + "searcher_kwargs": { + "initial_design_type": "max_budget", + "use_priors": False, + "random_interleave_prob": 0.0, + "sample_default_first": False, + "sample_default_at_target": False, + }, + }, + ), + ( + { # load optimizer with args + "run_pipeline": Default(None), + "root_directory": Default(None), + "pipeline_space": Default(None), + "run_args": Default(None), + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": Default(None), + "max_evaluations_per_run": Default(None), + "continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": Default(None), + "searcher": Default("default"), + "searcher_kwargs": { + "random_interleave_prob": 0.2, + "initial_design_size": 9, + }, + }, + "/run_args_optimizer_outside.yaml", + { + "run_pipeline": run_pipeline, + "root_directory": "path/to/root_directory", + "pipeline_space": pipeline_space, + "overwrite_working_directory": True, + "post_run_summary": True, + "development_stage_id": None, + "task_id": None, + "max_evaluations_total": 10, + "max_evaluations_per_run": None, + "continue_until_max_evaluation_completed": False, + "max_cost_total": None, + "ignore_errors": False, + "loss_value_on_error": None, + "cost_value_on_error": None, + "pre_load_hooks": None, + "searcher": my_bayesian, + "searcher_kwargs": { + "acquisition": "EI", + "acquisition_sampler": "random", + "random_interleave_prob": 0.2, + "initial_design_size": 9, + }, + }, + ), + ], +) def test_check_settings(func_args: Dict, yaml_args: str, expected_output: Dict) -> None: """ Check if expected settings are set @@ -299,33 +318,37 @@ def test_check_settings(func_args: Dict, yaml_args: str, expected_output: Dict) @pytest.mark.neps_api -@pytest.mark.parametrize("func_args, yaml_args, error", [ - ( - { - "root_directory": Default(None), - "pipeline_space": Default(None), - "run_args": Default(None), - "overwrite_working_directory": Default(False), - "post_run_summary": Default(True), - "development_stage_id": Default(None), - "task_id": Default(None), - "max_evaluations_total": Default(None), - "max_evaluations_per_run": Default(None), - "continue_until_max_evaluation_completed": Default(False), - "max_cost_total": Default(None), - "ignore_errors": Default(False), - "loss_value_on_error": Default(None), - "cost_value_on_error": Default(None), - "pre_load_hooks": Default(None), - "searcher": Default("default"), - "searcher_kwargs": {}, - }, - Default(None), - ValueError - ) -]) -def test_settings_initialization_error(func_args: Dict, yaml_args: Union[str, Default], - error: Exception) -> None: +@pytest.mark.parametrize( + "func_args, yaml_args, error", + [ + ( + { + "root_directory": Default(None), + "pipeline_space": Default(None), + "run_args": Default(None), + "overwrite_working_directory": Default(False), + "post_run_summary": Default(True), + "development_stage_id": Default(None), + "task_id": Default(None), + "max_evaluations_total": Default(None), + "max_evaluations_per_run": Default(None), + "continue_until_max_evaluation_completed": Default(False), + "max_cost_total": Default(None), + "ignore_errors": Default(False), + "loss_value_on_error": Default(None), + "cost_value_on_error": Default(None), + "pre_load_hooks": 
Default(None), + "searcher": Default("default"), + "searcher_kwargs": {}, + }, + Default(None), + ValueError, + ) + ], +) +def test_settings_initialization_error( + func_args: Dict, yaml_args: Union[str, Default], error: Exception +) -> None: """ Test if Settings raises Error when essential arguments are missing """ diff --git a/tests/test_yaml_run_args/test_yaml_run_args.py b/tests/test_yaml_run_args/test_yaml_run_args.py index 5a0c5d22..8200b2fd 100644 --- a/tests/test_yaml_run_args/test_yaml_run_args.py +++ b/tests/test_yaml_run_args/test_yaml_run_args.py @@ -5,11 +5,12 @@ from typing import Union, Callable, Dict, List, Type BASE_PATH = "tests/test_yaml_run_args/" -pipeline_space = dict(lr=neps.FloatParameter(lower=1e-3, upper=0.1), - optimizer=neps.CategoricalParameter(choices=["adam", "sgd", - "adamw"]), - epochs=neps.IntegerParameter(lower=1, upper=10), - batch_size=neps.ConstantParameter(value=64)) +pipeline_space = dict( + lr=neps.FloatParameter(lower=1e-3, upper=0.1), + optimizer=neps.CategoricalParameter(choices=["adam", "sgd", "adamw"]), + epochs=neps.IntegerParameter(lower=1, upper=10), + batch_size=neps.ConstantParameter(value=64), +) def run_pipeline(): @@ -44,8 +45,9 @@ def check_run_args(yaml_path_run_args: str, expected_output: Dict) -> None: """ output = get_run_args_from_yaml(BASE_PATH + yaml_path_run_args) - def are_functions_equivalent(f1: Union[Callable, List[Callable]], - f2: Union[Callable, List[Callable]]) -> bool: + def are_functions_equivalent( + f1: Union[Callable, List[Callable]], f2: Union[Callable, List[Callable]] + ) -> bool: """ Compares functions or lists of functions for equivalence by their bytecode, useful when identical functions have different memory addresses. This method @@ -111,8 +113,10 @@ def are_functions_equivalent(f1: Union[Callable, List[Callable]], "loss_value_on_error": 4.2, "cost_value_on_error": 3.7, "ignore_errors": True, - "searcher": {"strategy": "bayesian_optimization", - "initial_design_size": 5, "surrogate_model": "gp"}, + "searcher": { + "strategy": "bayesian_optimization", + "initial_design_size": 5, + }, "pre_load_hooks": [hook1, hook2], }, ), @@ -133,8 +137,10 @@ def are_functions_equivalent(f1: Union[Callable, List[Callable]], "loss_value_on_error": 2.4, "cost_value_on_error": 2.1, "ignore_errors": False, - "searcher": {"strategy": "bayesian_optimization", - "initial_design_size": 5, "surrogate_model": "gp"}, + "searcher": { + "strategy": "bayesian_optimization", + "initial_design_size": 5, + }, "pre_load_hooks": [hook1], }, ), @@ -147,8 +153,10 @@ def are_functions_equivalent(f1: Union[Callable, List[Callable]], "overwrite_working_directory": True, "post_run_summary": False, "continue_until_max_evaluation_completed": False, - "searcher": {"strategy": "bayesian_optimization", - "initial_design_size": 5, "surrogate_model": "gp"}, + "searcher": { + "strategy": "bayesian_optimization", + "initial_design_size": 5, + }, }, ), ( @@ -164,26 +172,27 @@ def are_functions_equivalent(f1: Union[Callable, List[Callable]], }, ), ("run_args_empty.yaml", {}), - ("run_args_optional_loading_format.yaml", { - "run_pipeline": run_pipeline, - "pipeline_space": pipeline_space, - "root_directory": "test_yaml", - "max_evaluations_total": 20, - "max_cost_total": 4.2, - "overwrite_working_directory": True, - "post_run_summary": False, - "development_stage_id": 9, - "max_evaluations_per_run": 5, - "continue_until_max_evaluation_completed": True, - "loss_value_on_error": 2.4, - "cost_value_on_error": 2.1, - "ignore_errors": False, - "searcher": 
BayesianOptimization, - "searcher_kwargs": {'initial_design_size': 5, - 'surrogate_model': 'gp'}, - "pre_load_hooks": [hook1] - - }) + ( + "run_args_optional_loading_format.yaml", + { + "run_pipeline": run_pipeline, + "pipeline_space": pipeline_space, + "root_directory": "test_yaml", + "max_evaluations_total": 20, + "max_cost_total": 4.2, + "overwrite_working_directory": True, + "post_run_summary": False, + "development_stage_id": 9, + "max_evaluations_per_run": 5, + "continue_until_max_evaluation_completed": True, + "loss_value_on_error": 2.4, + "cost_value_on_error": 2.1, + "ignore_errors": False, + "searcher": BayesianOptimization, + "searcher_kwargs": {"initial_design_size": 5}, + "pre_load_hooks": [hook1], + }, + ), ], ) def test_yaml_config(yaml_path: str, expected_output: Dict) -> None: From d1c7a859d0eccfbb259b4fe0a1f708c84ae9da61 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Tue, 24 Sep 2024 19:03:38 +0200 Subject: [PATCH 42/63] refactor: Ifbo --- neps/optimizers/base_optimizer.py | 6 +- .../freeze_thaw_sampler.py | 56 --- .../bayesian_optimization/models/__init__.py | 6 +- .../bayesian_optimization/models/ftpfn.py | 6 +- .../bayesian_optimization/optimizer.py | 100 +---- neps/optimizers/intial_design.py | 130 ++++++ neps/optimizers/multi_fidelity/ifbo.py | 424 +++++++++++------- neps/optimizers/multi_fidelity/utils.py | 78 ++-- neps/sampling/priors.py | 59 ++- neps/sampling/samplers.py | 29 +- neps/search_spaces/domain.py | 45 +- neps/search_spaces/encoding.py | 73 ++- neps/state/neps_state.py | 8 +- neps/state/trial.py | 2 +- 14 files changed, 669 insertions(+), 353 deletions(-) create mode 100644 neps/optimizers/intial_design.py diff --git a/neps/optimizers/base_optimizer.py b/neps/optimizers/base_optimizer.py index 8dd9e96f..41dc6962 100644 --- a/neps/optimizers/base_optimizer.py +++ b/neps/optimizers/base_optimizer.py @@ -19,7 +19,7 @@ class SampledConfig: id: Trial.ID config: Mapping[str, Any] - previous_config_id: Trial.ID | None + previous_config_id: Trial.ID | None = None class BaseOptimizer: @@ -76,7 +76,7 @@ def ask( trials: Mapping[str, Trial], budget_info: BudgetInfo | None, optimizer_state: dict[str, Any], - ) -> tuple[SampledConfig, dict[str, Any]]: + ) -> SampledConfig | tuple[SampledConfig, dict[str, Any]]: """Sample a new configuration. !!! 
note @@ -134,7 +134,7 @@ def ask( config, config_id, previous_config_id = self.get_config_and_ids() return SampledConfig( id=config_id, config=config, previous_config_id=previous_config_id - ), optimizer_state + ) def update_state_post_evaluation( self, state: dict[str, Any], report: Trial.Report diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py index ea22c5b1..cae5bee8 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py @@ -59,62 +59,6 @@ def _sample_new( new_configs, index=range(index_from, index_from + len(new_configs)) ) - def _sample_new_unique( - self, - index_from: int, - n: int | None = None, - patience: int = 10, - ignore_fidelity: bool = False, - ) -> pd.Series: - n = n if n is not None else self.samples_to_draw - assert ( - patience > 0 and n > 0 - ), "Patience and `samples_to_draw` must be larger than 0" - - assert self.observations is not None - assert self.pipeline_space is not None - - existing_configs = self.observations.all_configs_list() - new_configs = [] - for _ in range(n): - # Sample patience times for an unobserved configuration - for _ in range(patience): - _config = self.pipeline_space.sample( - patience=self.patience, - user_priors=False, - ignore_fidelity=ignore_fidelity, - ) - # # Convert continuous into tabular if the space is tabular - # _config = continuous_to_tabular(_config, self.tabular_space) - # Iterate over all observed configs - for config in existing_configs: - if _config.is_equal_value( - config, include_fidelity=not ignore_fidelity - ): - # if the sampled config already exists - # do the next iteration of patience - break - else: - # If the new sample is not equal to any previous - # then it's a new config - new_config = _config - break - else: - # TODO: use logger.warn here instead (karibbov) - warnings.warn( - f"Couldn't find an unobserved configuration in {patience} " - f"iterations. 
Using an observed config instead" - ) - # patience budget exhausted use the last sampled config anyway - new_config = _config - - # append the new config to the list - new_configs.append(new_config) - - return pd.Series( - new_configs, index=range(index_from, index_from + len(new_configs)) - ) - def sample( self, acquisition_function: Callable | None = None, diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 35ae2120..49ac7258 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,5 +1,9 @@ -from .ftpfn import FTPFNSurrogate +from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate +# TODO: Need the GP back here +# * What actually uses the GP SurrogateModelMapping = { "ftpfn": FTPFNSurrogate, } + +__all__ = ["FTPFNSurrogate", "SurrogateModelMapping"] diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 6f697033..2041396f 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -71,14 +71,14 @@ def _cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor: _CACHED_FTPFN_MODEL: dict[tuple[str, str], FTPFN] = {} -class FTPFNModel: +class FTPFNSurrogate: """Wrapper around the IfBO model.""" def __init__( self, target_path: Path | None = None, version: str = "0.0.1", - **kwargs: Any, + device: torch.device | None = None, ): if target_path is None: # TODO: We also probably want to link this to the actual root directory @@ -91,7 +91,7 @@ def __init__( key = (str(target_path), version) ftpfn = _CACHED_FTPFN_MODEL.get(key) if ftpfn is None: - ftpfn = FTPFN(target_path=target_path, version=version) + ftpfn = FTPFN(target_path=target_path, version=version, device=device) _CACHED_FTPFN_MODEL[key] = ftpfn self.ftpfn = ftpfn diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 4188a5fe..11eca577 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -22,7 +22,8 @@ default_single_obj_gp, optimize_acq, ) -from neps.sampling import Prior, Sampler +from neps.optimizers.intial_design import make_initial_design +from neps.sampling import Prior from neps.search_spaces.encoding import TensorEncoder from neps.search_spaces.hyperparameters.categorical import CategoricalParameter @@ -129,43 +130,6 @@ def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float: raise ValueError("No cost budget provided!") -# TODO: This needs to be moved to the search space class, however -# to not break the current prior based APIs used elsewhere, we can -# just manually create this here. -# We use confidence here where `0` means no confidence and `1` means -# absolute confidence. 
This gets translated in to std's and weights -# accordingly in a `CenteredPrior` -def _make_prior( - parameters: dict[str, CategoricalParameter | FloatParameter | IntegerParameter], -) -> Prior: - _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} - - domains: dict[str, Domain] = {} - centers: dict[str, tuple[Any, float]] = {} - categoricals: set[str] = set() - for name, hp in parameters.items(): - domains[name] = hp.domain # type: ignore - - if isinstance(hp, CategoricalParameter): - categoricals.add(name) - - if hp.default is None: - continue - - confidence_str = hp.default_confidence_choice - confidence_score = _mapping[confidence_str] - center = hp._default_index if isinstance(hp, CategoricalParameter) else hp.default - - centers[name] = (center, confidence_score) - - # Uses truncnorms for numerical and weighted choices categoricals - return Prior.make_centered( - domains=domains, - centers=centers, - categoricals=categoricals, - ) - - class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" @@ -179,6 +143,7 @@ def __init__( # noqa: D417 sample_default_first: bool = False, device: torch.device | None = None, encoder: TensorEncoder | None = None, + seed: int | None = None, treat_fidelity_as_hyperparameters: bool = False, ): """Initialise the BO loop. @@ -186,8 +151,7 @@ def __init__( # noqa: D417 Args: pipeline_space: Space in which to search initial_design_size: Number of samples used before using the surrogate model. - If None, it will take `int(log(N) ** 2)` samples where `N` is the number - of parameters in the search space. + If None, it will use the number of parameters in the search space. use_priors: Whether to use priors set on the hyperparameters during search. use_cost: Whether to consider reported "cost" from configurations in decision making. If True, the optimizer will weigh potential candidates by how much @@ -199,6 +163,7 @@ def __init__( # noqa: D417 If using `cost`, cost must be provided in the reports of the trials. sample_default_first: Whether to sample the default configuration first. + seed: Seed to use for the random number generator of samplers. device: Device to use for the optimization. encoder: Encoder to use for encoding the configurations. If None, it will will use the default encoder. @@ -221,17 +186,11 @@ def __init__( # noqa: D417 if treat_fidelity_as_hyperparameters: params.update(pipeline_space.fidelities) - if initial_design_size is None: - # As we have fairly regularized GPs, who start with a more smooth landscape - # model, we don't need a high level of initial samples. 
- ndims = len(params) - initial_design_size = max(2, int(math.log(ndims) ** 2)) - elif initial_design_size < 1: - raise ValueError("Initial_design_size to be at least 1") - self.encoder = TensorEncoder.default(params) if encoder is None else encoder + self.prior = Prior.from_parameters(params) if use_priors is True else None + self.treat_fidelity_as_hyperparameters = treat_fidelity_as_hyperparameters + self.seed = seed self.use_cost = use_cost - self.prior = _make_prior(params) if use_priors is True else None self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size @@ -243,44 +202,34 @@ def ask( budget_info: BudgetInfo, optimizer_state: dict[str, Any], seed: int | None = None, - ) -> tuple[SampledConfig, dict[str, Any]]: + ) -> SampledConfig: if seed is not None: raise NotImplementedError( "Seed is not yet implemented for BayesianOptimization" ) n_trials_sampled = len(trials) - space = self.pipeline_space config_id = str(n_trials_sampled + 1) - # Fill intitial design data if we don't have any... + # If we havn't passed the intial design phase if self.initial_design_ is None: - self.initial_design_ = [] - - if self.sample_default_first: - config = space.sample_default_configuration() - self.initial_design_.append(config.hp_values()) - - sampler = ( - self.prior if self.prior else Sampler.sobol(self.encoder.ncols, seed=seed) - ) - n_samples = self.n_initial_design - len(self.initial_design_) - - x = sampler.sample( - n_samples * 2, - to=self.encoder.domains, + self.initial_design_ = make_initial_design( + space=self.pipeline_space, + encoder=self.encoder, + sample_default_first=self.sample_default_first, + sampler=self.prior if self.prior is not None else "sobol", seed=seed, - device=self.device, + sample_size=( + "ndim" if self.n_initial_design is None else self.n_initial_design + ), + sample_fidelity=( + "max" if not self.treat_fidelity_as_hyperparameters else True + ), ) - uniq_x = torch.unique(x, dim=0) - configs = self.encoder.unpack(uniq_x[:n_samples]) - self.initial_design_.extend(configs) - # If we havn't passed the intial design phase if n_trials_sampled < len(self.initial_design_): config = self.initial_design_[n_trials_sampled] - sample = SampledConfig(id=config_id, config=config, previous_config_id=None) - return sample, optimizer_state + return SampledConfig(id=config_id, config=config) # Now we actually do the BO loop, start by encoding the data # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... @@ -347,7 +296,7 @@ def ask( pibo_exp_term = _pibo_exp_term( n_trials_sampled, self.encoder.ncols, - self.n_initial_design, + len(self.initial_design_), ) # If the amount of weight derived from the pibo exponent becomes @@ -398,5 +347,4 @@ def ask( assert len(candidates) == 1, "Expected only one candidate!" 
config = self.encoder.unpack(candidates)[0] - sample = SampledConfig(id=config_id, config=config, previous_config_id=None) - return sample, optimizer_state + return SampledConfig(id=config_id, config=config) diff --git a/neps/optimizers/intial_design.py b/neps/optimizers/intial_design.py new file mode 100644 index 00000000..f2109f00 --- /dev/null +++ b/neps/optimizers/intial_design.py @@ -0,0 +1,130 @@ +from collections.abc import Sequence +from dataclasses import dataclass, field + +from typing import Literal, Any, Mapping + +from neps.sampling import Sampler +from neps.sampling.priors import Prior +from neps.search_spaces.encoding import TensorEncoder +from neps.search_spaces.search_space import SearchSpace +import torch + + +def make_initial_design( + space: SearchSpace, + encoder: TensorEncoder, + sampler: Literal["sobol", "prior", "uniform"] | Sampler, + sample_size: int | Literal["ndim"] | None = "ndim", + sample_default_first: bool = True, + sample_fidelity: ( + Literal["min", "max", True] | int | float | dict[str, int | float] + ) = True, + seed: int | None = None, +) -> list[dict[str, Any]]: + """Generate the initial design of the optimization process. + + Args: + space: The search space to use. + encoder: The encoder to use for encoding/decoding configurations. + sampler: The sampler to use for the initial design. + + If set to "sobol", a Sobol sequence will be used. + If set to "uniform", a uniform random sampler will be used. + If set to "prior", a prior sampler will be used, based on the defaults, + and confidence scores of the hyperparameters. + If set to a custom sampler, the sampler will be used directly. + + sample_size: + The number of configurations to sample. + + If "ndim", the number of configurations will be equal to the number of dimensions. + If None, no configurations will be sampled. + + sample_default_first: Whether to sample the default configuration first. + sample_fidelity: + At what fidelity to sample the configurations, including the default. + + If set to "min" or "max", the configuration will be sampled + at the minimum or maximum fidelity, respectively. If set to an integer + or a float, the configuration will be sampled at that fidelity. + When specified as a dictionary, the keys should be the names of the + fidelity parameters and the values should be the target fidelities. + If set to `True`, the configuration will have its fidelity randomly sampled. + seed: The seed to use for the random number generator of samplers. + + """ + configs: list[dict[str, Any]] = [] + + # First, we establish what fidelity to apply to them. + match sample_fidelity: + case "min": + fids = {name: fid.lower for name, fid in space.fidelities.items()} + case "max": + fids = {name: fid.upper for name, fid in space.fidelities.items()} + case True: + fids = {name: hp.sample_value() for name, hp in space.fidelities.items()} + case int() | float(): + if len(space.fidelities) != 1: + raise ValueError( + "The target fidelity should be specified as a dictionary" + " if there are multiple fidelities or no fidelity should" + " be specified." 
+ " Current search space has fidelities: " + f"{list(space.fidelities.keys())}" + ) + name = next(iter(space.fidelities.keys())) + fids = {name: sample_fidelity} + case Mapping(): + missing_keys = set(space.fidelities.keys()) - set(sample_fidelity.keys()) + if any(missing_keys): + raise ValueError( + f"Missing target fidelities for the following fidelities: " + f"{missing_keys}" + ) + fids = sample_fidelity + case _: + raise ValueError( + "Invalid value for `sample_default_at_target`. " + "Expected 'min', 'max', True, int, float, or dict." + ) + + if sample_default_first: + # TODO: No way to pass a seed to the sampler + default = { + name: hp.default if hp.default is not None else hp.sample_value() + for name, hp in space.hyperparameters.items() + } + configs.append({**default, **fids}) + + params = {**space.numerical, **space.categoricals} + ndims = len(params) + + if sample_size == "ndim": + sample_size = ndims + elif sample_size is not None and not sample_size > 0: + raise ValueError( + "The sample size should be a positive integer if passing an int." + ) + + print("sample", sample_size, ndims) + if sample_size is not None: + match sampler: + case "sobol": + sampler = Sampler.sobol(ndim=len(params)) + case "uniform": + sampler = Sampler.uniform(ndim=len(params)) + case "prior": + sampler = Prior.from_parameters(params) + case _: + sampler = sampler + + encoded_configs = sampler.sample( + sample_size * 2, + to=encoder.domains, + seed=seed, + ) + uniq_x = torch.unique(encoded_configs, dim=0) + sample_configs = encoder.unpack(uniq_x[:sample_size]) + configs.extend([{**config, **fids} for config in sample_configs]) + + return configs diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index e8b34d25..066a0f99 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,70 +1,61 @@ -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING, Any, Mapping +from typing import Any, Mapping +import math import numpy as np +import torch from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.acquisition_functions.mf_pi import MFPI_Random -from neps.optimizers.bayesian_optimization.acquisition_samplers.freeze_thaw_sampler import ( - FreezeThawSampler, -) -from neps.optimizers.multi_fidelity.mf_bo import PFNSurrogate -from neps.optimizers.multi_fidelity.utils import MFObservedData +from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate +from neps.optimizers.intial_design import make_initial_design +from neps.sampling.samplers import Sampler +from neps.search_spaces.domain import Domain +from neps.search_spaces.encoding import CategoricalToUnitNorm, TensorEncoder from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace from neps.state.trial import Trial +from neps.state.optimizer import BudgetInfo + + +def _select_trials(trials: Mapping[str, Trial]) -> Mapping[str, Trial]: + return { + trial_id: trial + for trial_id, trial in trials.items() + if trial.state + not in ( + Trial.State.FAILED, + Trial.State.CRASHED, + Trial.State.UNKNOWN, + Trial.State.CORRUPTED, + ) + } -if TYPE_CHECKING: - from neps.state.optimizer import BudgetInfo - - -def _adjust_fidelity_for_freeze_thaw_steps( - pipeline_space: SearchSpace, - step_size: int, -) -> SearchSpace: - """Adjusts the fidelity range to be divisible by `step_size` for Freeze-Thaw.""" - assert pipeline_space.fidelity is not None - - 
# Check if the fidelity range is divided into equal sized steps by `step_size` - fid_range = pipeline_space.fidelity.upper - pipeline_space.fidelity.lower - remainder = fid_range % step_size - if remainder == 0: - return pipeline_space - - # Adjust the fidelity lower bound to be divisible by `step_size` into equal steps - # Pushing the lower bound of the fidelity space by an offset to ensure equal-sized steps - offset = step_size - remainder - pipeline_space.fidelity.lower += offset - - warnings.warn( - f"Adjusted fidelity lower bound to {pipeline_space.fidelity.lower} " - f"for equal-sized steps of {step_size}.", - UserWarning, - stacklevel=3, - ) - return pipeline_space - - -# TODO: Maybe make this a part of searchspace functionality -def get_budget_value( - space: SearchSpace, - step_size: int, - budget_level: int | float, -) -> int | float: - assert space.fidelity is not None - match space.fidelity: - case IntegerParameter(): - return int(step_size * budget_level + space.fidelity.lower) - case FloatParameter(): - return step_size * budget_level + space.fidelity.lower - case _: - raise NotImplementedError( - f"Fidelity parameter: {space.fidelity}" - f"must be one of the types: " - f"[IntegerParameter, FloatParameter], but is type:" - f"{type(space.fidelity)}" - ) + +def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: + # Does a lexsort, same as if we sorted by (config_id, budget), where + # theyre are sorted according to increasing config_id and then increasing budget. + # x[i2] -> sorted by config id and budget + i1 = torch.argsort(x[:, 1]) + i2 = i1[torch.argsort(x[i1][:, 0], stable=True)] + sorted_x = x[i2] + + # Now that it's sorted, we essentially want to count the occurence of each id into counts + _, counts = torch.unique_consecutive(sorted_x[:, 0], return_counts=True) + + # Now we can use these counts to get to the last occurence of each id + # The -1 is because we want to index from 0 but sum starts at 1. + ii = counts.cumsum(0) - 1 + return sorted_x[ii] + + +# NOTE: Ifbo was trained using 32 bit +FTPFN_DTYPE = torch.float32 + + +def tokenize( + ids: torch.Tensor, + budgets: torch.Tensor, + configs: torch.Tensor, +) -> torch.Tensor: + return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) class IFBO(BaseOptimizer): @@ -77,10 +68,11 @@ def __init__( use_priors: bool = False, sample_default_first: bool = False, sample_default_at_target: bool = False, - patience: int = 100, # arguments for model surrogate_model_args: dict | None = None, - initial_design_size: int = 1, + initial_design_size: int | None = None, + device: torch.device | None = None, + **kwargs: Any, # TODO: Remove this ): """Initialise. @@ -92,29 +84,66 @@ def __init__( promotion_policy: The type of promotion procedure to use sample_default_first: Whether to sample the default configuration first initial_design_size: Number of configurations to sample before starting optimization - """ - assert self.pipeline_space.fidelity is not None - # Adjust pipeline space fidelity steps to be equally spaced - pipeline_space = _adjust_fidelity_for_freeze_thaw_steps(pipeline_space, step_size) - super().__init__(pipeline_space=pipeline_space, patience=patience) + If None, the number of configurations will be equal to the number of dimensions. 
+ + device: Device to use for the model + """ + assert pipeline_space.fidelity is not None + assert isinstance(pipeline_space.fidelity_name, str) + super().__init__(pipeline_space=pipeline_space) self.step_size = step_size self.use_priors = use_priors - self.surrogate_model_args = surrogate_model_args self.sample_default_first = sample_default_first self.sample_default_at_target = sample_default_at_target - - self._initial_design_size = initial_design_size - - self.min_budget: int | float = self.pipeline_space.fidelity.lower - self.max_budget: int | float = self.pipeline_space.fidelity.upper - - fidelity_name = self.pipeline_space.fidelity_name - assert isinstance(fidelity_name, str) - self.fidelity_name: str = fidelity_name - - self._model_update_failed = False + self.surrogate_model_args = surrogate_model_args or {} + self.device = device + self.n_initial_design: int | None = initial_design_size + + self._min_budget: int | float = pipeline_space.fidelity.lower + self._max_budget: int | float = pipeline_space.fidelity.upper + self._fidelity_name: str = pipeline_space.fidelity_name + self._ftpfn_encoder: TensorEncoder = TensorEncoder.default( + { + **self.pipeline_space.numerical, + **self.pipeline_space.categoricals, + }, + custom_transformers={ + cat_name: CategoricalToUnitNorm(choices=cat.choices) + for cat_name, cat in self.pipeline_space.categoricals.items() + }, + ) + self._initial_design: list[dict[str, Any]] | None = None + + # TODO: We want it to be evenly divided by step size, so we need + # to add something to the minimum fidelity to ensure this. + + # NOTE: The PFN model expects fidelities to be normalized between 0 and 1, + # hence, we make sure to do min-max normalization but include the number of bins. + # Also, it expects it specifically in column 1 so we can't include it with the configs + maybe_bins = math.ceil((self._max_budget - self._min_budget) / self.step_size) + 1 + match pipeline_space.fidelity: + case IntegerParameter(): + assert pipeline_space.fidelity.domain.cardinality is not None + bins = max(maybe_bins, pipeline_space.fidelity.domain.cardinality) + case FloatParameter(): + bins = maybe_bins + case _: + raise NotImplementedError( + f"Fidelity type {type(pipeline_space.fidelity)} not supported" + ) + + # Domain of fidelity values, i.e. what is given in the configs that we + # give to the user to evaluate at. 
+        self._fid_domain = pipeline_space.fidelity.domain
+
+        # Domain in which we should pass budgets to ifbo model
+        self._budget_domain = Domain.float(1 / self._max_budget, 1)
+
+        # Domain from which we assign an index to each budget
+        # Automatically takes care of rounding
+        self._budget_index_domain = Domain.indices(bins)
 
     def ask(
         self,
@@ -122,102 +151,187 @@ def ask(
         budget_info: BudgetInfo,
         optimizer_state: dict[str, Any],
         seed: int | None = None,
-    ) -> tuple[SampledConfig, dict[str, Any]]:
+    ) -> SampledConfig:
         if seed is not None:
            raise NotImplementedError("Seed is not yet implemented for IFBO")
 
-        observed_configs = MFObservedData.from_trials(trials)
+        trials = _select_trials(trials)
 
-        in_initial_design_phase = (
-            len(observed_configs.completed_runs) < self._initial_design_size
-        )
-        if in_initial_design_phase:
-            # TODO: Copy BO setup where we can sample SOBOL or from Prior
-            self.logger.debug("Sampling from initial design...")
-            config = self.pipeline_space.sample(
-                patience=self.patience, user_priors=True, ignore_fidelity=False
+        ids = [
+            int(trial.metadata.id.split("_", maxsplit=1)[0]) for trial in trials.values()
+        ]
+        n_unique_ids = len(set(ids))
+        new_id = max(ids) + 1 if len(ids) > 0 else 0
+
+        # If we haven't passed the initial design phase
+        if self._initial_design is None:
+            self._initial_design = make_initial_design(
+                space=self.pipeline_space,
+                encoder=self._ftpfn_encoder,
+                sample_default_first=self.sample_default_first,
+                sampler="sobol",
+                seed=seed,
+                sample_fidelity="min",
+                sample_size=(
+                    "ndim" if self.n_initial_design is None else self.n_initial_design
+                ),
             )
-            _config_dict = config.hp_values()
-            _config_dict.update({self.fidelity_name: self.min_budget})
-            config.set_hyperparameters_from_dict(_config_dict)
-            _config_id = observed_configs.next_config_id()
-            return SampledConfig(
-                config=config.hp_values(), id=_config_id, previous_config_id=None
-            ), optimizer_state
-
-        # TODO: Maybe just remove `PFNSurrogate` as a whole and use FTPFN directly...
- # this depends on whether we can actually create a proper surrogate model abstraction - # TODO: Really all of this should just be passed into an __init__ instead of 3 stage process - model_policy = PFNSurrogate( - pipeline_space=self.pipeline_space, - surrogate_model_args=self.surrogate_model_args, - step_size=self.step_size, + + if n_unique_ids < len(self._initial_design): + config = self._initial_design[n_unique_ids] + return SampledConfig(id=f"{new_id}_0", config=config) + + # Otherwise, we proceed to surrogate phase + ftpfn = FTPFNSurrogate( + target_path=self.surrogate_model_args.get("target_path", None), + version=self.surrogate_model_args.get("version", "0.0.1"), + device=self.device, ) - model_policy.observed_configs = observed_configs - model_policy.update_model() - # TODO: Replace with more efficient samplers we have from BO - # TODO: Just make this take in everything at __init__ instead of a 2 stage init - acquisition_sampler = FreezeThawSampler( - pipeline_space=self.pipeline_space, patience=self.patience + # NOTE: `0` is reserved in PFN, we add an additional +1 to all ids + train_ids = torch.tensor(ids, dtype=FTPFN_DTYPE, device=self.device) + 1 + train_configs = self._ftpfn_encoder.encode([t.config for t in trials.values()]) + + train_fidelities = [t.config[self._fidelity_name] for t in trials.values()] + train_budgets = self._budget_domain.cast( + torch.tensor(train_fidelities, device=self.device, dtype=FTPFN_DTYPE), + frm=self._fid_domain, ) - acquisition_sampler.set_state( - self.pipeline_space, observed_configs, self.step_size + x_train = tokenize(ids=train_ids, budgets=train_budgets, configs=train_configs) + x_train = x_train.to(FTPFN_DTYPE) + + # TODO: Document that it's on the user to ensure these are already all bounded + # We could possibly include some bounded transform to assert this. + minimize_ys = torch.tensor( + [ + trial.report.loss + if trial.report is not None and trial.report.loss is not None + else np.nan + for trial in trials.values() + ], + device=self.device, + dtype=FTPFN_DTYPE, ) + if not all(0 <= y <= 1.0 for y in minimize_ys): + raise RuntimeError( + "ifBO requires that all loss values reported lie in the interval [0, 1]" + " but recieved loss value outside of that range!" + f"\n{minimize_ys}" + ) + # Invert the ys + maximize_ys = 1 - minimize_ys + maximize_best_y = maximize_ys.max().item() + is_pending = minimize_ys.isnan() + + maximize_ys[is_pending] = ftpfn.get_mean_performance( + train_x=x_train[~is_pending], + train_y=maximize_ys[~is_pending], + test_x=x_train[is_pending], + ) + + rng = np.random.RandomState(seed) + n_rand = 1_000 # TODO: parametrize + + # TODO: Could also sample from a prior... + uniform = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) - samples = acquisition_sampler.sample(set_new_sample_fidelity=self.min_budget) + # We sample the horizon in terms of step numbers to take + lower_index = self._budget_index_domain.lower + upper_index = self._budget_index_domain.upper + # The plus 1 here is because we want to sample that at least one step + # should be taken. 
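As a rough numeric illustration of the index-to-budget mapping used in the next few lines: the bounds below are made up (an integer fidelity 1..20 with step_size=1, giving bins=20), and it assumes `Domain.cast_one` amounts to a plain linear rescaling between the two domains' bounds.

# Illustration only -- the numbers and the linear-rescaling assumption are not from the patch.
bins = 20
max_budget = 20
budget_lo, budget_hi = 1 / max_budget, 1.0       # the [1/max_budget, 1] budget domain
index_lo, index_hi = 0, bins - 1                 # the Domain.indices(bins) index domain

def index_to_budget(i: float) -> float:
    unit = (i - index_lo) / (index_hi - index_lo)        # index -> [0, 1]
    return budget_lo + unit * (budget_hi - budget_lo)    # [0, 1] -> budget domain

horizon_steps = 5                                 # a sampled horizon of 5 index steps
print(index_to_budget(horizon_steps))             # ~0.30, later added to the budget column and clamped at 1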
+ horizon_index_increment = rng.randint(lower_index, upper_index) + 1 - # TODO: See if we can get away from `set_state` style things - # and just instantiate it with what it needs - acquisition = MFPI_Random( - pipeline_space=self.pipeline_space, surrogate_model_name="ftpfn" + # We then normalize it to FTPFN normalized budget domain + horizon = self._budget_domain.cast_one( + horizon_index_increment, + frm=self._budget_index_domain, ) - acquisition.set_state( - self.pipeline_space, - model_policy.surrogate_model, - observed_configs, - self.step_size, + + # Now let's create the random configurations + rand_configs = uniform.sample( + n=n_rand, + to=self._ftpfn_encoder.domains, + seed=None, # TODO + device=self.device, + ).to(FTPFN_DTYPE) + + # We give them all the special 0 id, as well as set the budget accordinly + acq_new = tokenize( + ids=torch.zeros(n_rand, dtype=FTPFN_DTYPE, device=self.device), + budgets=torch.zeros(n_rand, dtype=FTPFN_DTYPE, device=self.device), + configs=rand_configs, ) - # `_samples` should have new configs with fidelities set to as required - acq, _samples = acquisition.eval(x=samples, asscalar=True) - # NOTE: len(samples) need not be equal to len(_samples) as `samples` contain - # all (partials + new) configurations obtained from the sampler, but - # in `_samples`, configs are removed that have reached maximum epochs allowed + # Construct all our samples for acqusition: + # 1. Take all non-pending configs + acq_train = x_train[~is_pending].clone().detach() + + # 2. We only want to include the configuration rows + # that are at their highest budget, + # i.e. don't include config_0_0 and config_0_1 + acq_train = _remove_duplicates(acq_train) + + # 3. Sub select all that are at a partial budget i.e. can evaluate further + # Note, it's important to do this after the above + partial_eval_mask = acq_train[:, 1] < 1 + acq_train = acq_train[partial_eval_mask] + + # 4. Add in the new sampled configurations + acq_samples = torch.vstack([acq_train, acq_new]) + + # 5. 
Add on the horizon to the budget, and clamping to maximum + # Note that we hold onto the intermediate unclamped budget for later + unclamped_budgets = acq_samples[:, 1] + horizon + acq_samples[:, 1] = torch.clamp(unclamped_budgets, max=1) + + # Now get the PI of these samples + lu = 10 ** rng.uniform(-4, -1) + f_inc = maximize_best_y * (1 - lu) + pi_new_samples = ftpfn.get_pi( + train_x=x_train.to(FTPFN_DTYPE), + train_y=maximize_ys.to(FTPFN_DTYPE), + test_x=acq_samples.to(FTPFN_DTYPE), + y_best=torch.full( + size=(len(acq_samples),), + fill_value=f_inc, + dtype=FTPFN_DTYPE, + ), + ) + best_ix = pi_new_samples.argmax() - best_idx = acq.argmax() - _config_id = best_idx + # Extract out the row which had the best PI + best_id = int(acq_samples[best_ix, 0].round().item()) + best_vector = acq_samples[best_ix, 2:].unsqueeze(0) + best_config = self._ftpfn_encoder.unpack(best_vector)[0] - # NOTE: `samples` and `_samples` should share the same index values, hence, - # avoid using `.iloc` and work with `.loc` on these pandas DataFrame/Series - config: SearchSpace = samples.loc[_config_id] - config = config.clone() + if best_id == 0: + # A newly sampled configuration was deemed more promising + config_id = f"{new_id}_0" + best_config[self._fidelity_name] = self._min_budget + previous_config_id = None + return SampledConfig(config_id, best_config, previous_config_id) - # IMPORTANT: setting the fidelity value appropriately - if best_idx > max(observed_configs.seen_config_ids): - next_fid_value = self.min_budget else: - max_observed_fids = ( - observed_configs.get_max_observed_fidelity_level_per_config() + # To calculate the next step to take in fidelity space, we remove the horizon + previous_budget_of_acquired_config = unclamped_budgets[best_ix] - horizon + + # Then we transform this: + # 1. Back to budget_index space + # 2. Increment it by one + # 3. Transform back to fidelity space + budget_ix = self._budget_index_domain.cast_one( + float(previous_budget_of_acquired_config), frm=self._budget_domain ) - best_configs_max_fid = max_observed_fids.loc[best_idx] - budget_value = get_budget_value( - space=self.pipeline_space, - step_size=self.step_size, - budget_level=best_configs_max_fid, + budget_ix += 1 + fid_value = self._fid_domain.cast_one( + budget_ix, frm=self._budget_index_domain ) - next_fid_value = budget_value + self.step_size - config.update_hp_values({self.fidelity_name: next_fid_value}) + real_best_id = best_id - 1 # NOTE: Remove the +1 we added to all ids + best_config[self._fidelity_name] = fid_value - # Lastly, we need to generate config id for it. 
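For reference, the `<config index>_<budget level>` id convention relied on both here and in the new `ask()` can be sketched with a made-up helper (`continuation_ids` does not exist in the codebase; it only illustrates the naming scheme):

def continuation_ids(config_index: int, budget_level: int) -> tuple[str, str | None]:
    # id of the trial to evaluate, plus the id of the trial it continues from (if any)
    trial_id = f"{config_index}_{budget_level}"
    previous_id = f"{config_index}_{budget_level - 1}" if budget_level > 0 else None
    return trial_id, previous_id

print(continuation_ids(3, 5))   # ('3_5', '3_4') -> continue config 3 one step further
print(continuation_ids(7, 0))   # ('7_0', None)  -> a freshly sampled configuration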
- budget_level = int(np.ceil((next_fid_value - self.min_budget) / self.step_size)) - if _config_id in observed_configs.seen_config_ids: - config_id = f"{_config_id}_{budget_level}" - previous_config_id = f"{_config_id}_{budget_value - 1}" - else: - config_id = f"{observed_configs.next_config_id()}_{budget_level}" + config_id = f"{real_best_id}_{budget_ix}" + previous_config_id = f"{real_best_id}_{budget_ix - 1}" - return SampledConfig( - config=config.hp_values(), id=config_id, previous_config_id=previous_config_id - ), optimizer_state + return SampledConfig(config_id, best_config, previous_config_id) diff --git a/neps/optimizers/multi_fidelity/utils.py b/neps/optimizers/multi_fidelity/utils.py index 8e7b4910..b9d6e174 100644 --- a/neps/optimizers/multi_fidelity/utils.py +++ b/neps/optimizers/multi_fidelity/utils.py @@ -355,47 +355,55 @@ def token_ids(self) -> np.ndarray: return self.df.index.values @classmethod - def from_trials(cls, trials: Mapping[str, Trial]) -> Self: - observed_configs = MFObservedData( + def from_trials( + cls, + trials: Mapping[str, Trial], + *, + # TODO: We should store dicts, not the SearchSpace object... + # Once done, we can remove this + space: SearchSpace, + on_error: Literal["ignore"] | float = "ignore", + ) -> Self: + observed_configs = cls( columns=["config", "perf", "learning_curves"], index_names=["config_id", "budget_id"], ) - def _data(trial: Trial) -> Any: - # Considered pending - if report is None: + records: list[dict[str, Any]] = [] + for trial_id, trial in trials.items(): + _config_id, _budget_id = trial_id.split("_") + + if trial.report is None: loss = np.nan lc = [np.nan] - else: - loss = report.loss if report.loss is not None else "error" - lc = ( - report.learning_curve - if report.learning_curve is not None - else "error" - ) - - return [trial.config, loss, lc] - - # previous optimization run exists and needs to be loaded - def index_data_split( - config_id: str, trial: Trial - ) -> tuple[tuple[int, int], list]: - _config_id, _budget_id = config_id.split("_") - index = int(_config_id), int(_budget_id) - return index, _data(trial) - - if len(trials) > 0: - index_row = [ - tuple(index_data_split(trial_id, trial)) - for trial_id, trial in trials.items() - ] - indices, rows = zip(*index_row, strict=True) - observed_configs.add_data(data=list(rows), index=list(indices)) - - # an aesthetic choice more than a functional choice - observed_configs.df = observed_configs.df.sort_index( - level=self.observed_configs.df.index.names, inplace=True - ) + elif trial.report.loss is None: + assert trial.report.err is not None + if on_error == "ignore": + return None + + loss = on_error + lc = [on_error] + elif trial.report.loss is not None: + loss = trial.report.loss + assert trial.report.learning_curve is not None + lc = trial.report.learning_curve + + records.append( + { + "config_id": int(_config_id), + "budget_id": int(_budget_id), + # NOTE: Behavoiour around data in this requires that the dataframe stores + # `SearchSpace` objects and not dictionaries + "config": space.from_dict(trial.config), + "perf": loss, + "learning_curves": lc, + } + ) + + observed_configs.df = pd.DataFrame.from_records( + records, + index=["config_id", "budget_id"], + ).sort_index() return observed_configs diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 83c40e68..62c81ed8 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -22,11 +22,14 @@ TruncatedNormal, ) from neps.sampling.samplers import Sampler, WeightedSampler +from neps.search_spaces 
import CategoricalParameter from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain if TYPE_CHECKING: from torch.distributions import Distribution + from neps.search_spaces import FloatParameter, IntegerParameter + class Prior(Sampler, Protocol): """A protocol for priors over search spaces. @@ -103,7 +106,50 @@ def uniform(cls, ncols: int) -> UniformPrior: Args: ncols: The number of columns in the tensor to sample. """ - return UniformPrior(ncols=ncols) + return UniformPrior(ndims=ncols) + + @classmethod + def from_parameters( + cls, + parameters: dict[str, CategoricalParameter | FloatParameter | IntegerParameter], + ) -> Prior: + """Please refer to [`make_centered()`][neps.priors.Prior.make_centered] + for more details. This is a shortcut method. + """ + # TODO: This needs to be moved to the search space class, however + # to not break the current prior based APIs used elsewhere, we can + # just manually create this here. + # We use confidence here where `0` means no confidence and `1` means + # absolute confidence. This gets translated in to std's and weights + # accordingly in a `CenteredPrior` + _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} + + domains: dict[str, Domain] = {} + centers: dict[str, tuple[Any, float]] = {} + categoricals: set[str] = set() + for name, hp in parameters.items(): + domains[name] = hp.domain # type: ignore + + if isinstance(hp, CategoricalParameter): + categoricals.add(name) + + if hp.default is None: + continue + + confidence_str = hp.default_confidence_choice + confidence_score = _mapping[confidence_str] + center = ( + hp._default_index if isinstance(hp, CategoricalParameter) else hp.default + ) + + centers[name] = (center, confidence_score) + + # Uses truncnorms for numerical and weighted choices categoricals + return Prior.make_centered( + domains=domains, + centers=centers, + categoricals=categoricals, + ) @classmethod def make_centered( @@ -356,9 +402,14 @@ class UniformPrior(Prior): Uses a UnitUniform under the hood before converting to the value domain. """ - ncols: int + ndims: int """The number of columns in the tensor to sample from.""" + @property + @override + def ncols(self) -> int: + return self.ndims + @override def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: # NOTE: We just assume everything is in bounds... @@ -378,9 +429,9 @@ def sample( raise NotImplementedError("Seeding is not yet implemented.") _n = ( - torch.Size((n, self.ncols)) + torch.Size((n, self.ndims)) if isinstance(n, int) - else torch.Size((*n, self.ncols)) + else torch.Size((*n, self.ndims)) ) samples = torch.rand(_n, device=device, dtype=torch.float64) return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index 43758094..64105534 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -9,7 +9,7 @@ from collections.abc import Sequence from dataclasses import dataclass, field from functools import reduce -from typing import Protocol +from typing import TYPE_CHECKING, Protocol from typing_extensions import override import torch @@ -17,6 +17,9 @@ from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain +if TYPE_CHECKING: + from neps.sampling.priors import UniformPrior + class Sampler(Protocol): """A protocol for sampling tensors and vonerting them to a given domain.""" @@ -53,18 +56,31 @@ def sample( ... 
@classmethod - def sobol(cls, ndim: int, *, scramble: bool = True, seed: int | None = None) -> Sobol: + def sobol(cls, ndim: int, *, scramble: bool = True) -> Sobol: """Create a Sobol sampler. Args: ndim: The number of columns to sample. scramble: Whether to scramble the Sobol sequence. - seed: The seed for the Sobol sequence. Returns: A Sobol sampler. """ - return Sobol(ndim=ndim, scramble=scramble, seed=seed) + return Sobol(ndim=ndim, scramble=scramble) + + @classmethod + def uniform(cls, ndim: int) -> UniformPrior: + """Create a uniform sampler. + + Args: + ndim: The number of columns to sample. + + Returns: + A uniform sampler. + """ + from neps.sampling.priors import UniformPrior + + return UniformPrior(ndims=ndim) # Technically this could be a prior with a uniform distribution @@ -75,9 +91,6 @@ class Sobol(Sampler): ndim: int """The number of dimensions to sample for.""" - seed: int | None = None - """The seed for the Sobol sequence.""" - scramble: bool = True """Whether to scramble the Sobol sequence.""" @@ -113,7 +126,7 @@ def sample( sobol = torch.quasirandom.SobolEngine( dimension=self.ndim, scramble=self.scramble, - seed=self.seed, + seed=seed, ) out = torch.empty(_n, self.ncols, dtype=torch.float64, device=device) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index c1a10196..a5151aab 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -179,6 +179,15 @@ def int( bins=bins, ) + def next_value(self, x: Tensor) -> Tensor: + """Get the next value for a tensor of values.""" + if self.cardinality is None: + raise ValueError("Domain is non-finite, cannot get next value.") + cardinality_domain = Domain.indices(self.cardinality) + current_step = cardinality_domain.cast(x, frm=self) + bounded_next_step = (current_step + 1).clamp_max(self.cardinality - 1) + return self.cast(bounded_next_step, frm=cardinality_domain) + @classmethod def indices(cls, n: int) -> Domain[int]: """Create a domain for a range of indices. @@ -348,7 +357,7 @@ def translate( raise ValueError( "The number of domains in `to` must match the number of tensors" " if provided as a list." - f" Expected {ndims} from last dimension of {x.shape}, got {len(to)}." + f" Expected {ndims} from last dimension of {x.shape=}, got {len(to)}." ) out = torch.empty_like(x) @@ -357,5 +366,39 @@ def translate( return out + def cast_one(self, x: float | int, frm: Domain) -> float | int: + """Cast a single value from the domain `frm` to this domain. + + Args: + x: Value in the `frm` domain to cast to this domain. + frm: The domain to cast from. + + Returns: + Value cast to this domain. + """ + return self.cast(torch.tensor(x), frm=frm).item() + + def from_unit_one(self, x: float) -> float | int: + """Transform a single value from the unit interval [0, 1] to this domain. + + Args: + x: A value in the unit interval [0, 1] to convert. + + Returns: + Value lifted into this domain. + """ + return self.from_unit(torch.tensor(x)).item() + + def to_unit_one(self, x: float | int) -> float: + """Transform a single value from this domain to the unit interval [0, 1]. + + Args: + x: Value in this domain to convert. + + Returns: + Value normalized to the unit interval [0, 1]. 
+ """ + return self.to_unit(torch.tensor(x)).item() + UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 5f68aff9..eef1b25b 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -102,16 +102,56 @@ def decode(self, x: torch.Tensor) -> list[Any]: return [self.choices[int(i)] for i in torch.round(x).tolist()] +@dataclass +class CategoricalToUnitNorm(TensorTransformer): + choices: Sequence[Any] + + domain: Domain = field(init=False) + _integer_transformer: CategoricalToIntegerTransformer = field(init=False) + + def __post_init__(self): + self._integer_transformer = CategoricalToIntegerTransformer(self.choices) + + @override + def encode( + self, + x: Sequence[Any], + *, + out: torch.Tensor | None = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + integers = self._integer_transformer.encode( + x, + dtype=dtype if dtype is not None else torch.float64, + device=device, + out=out, + ) + if out is not None: + return integers.div_(len(self.choices) - 1) + + return integers / (len(self.choices) - 1) + + @override + def decode(self, x: torch.Tensor) -> list[Any]: + x = torch.round(x * (len(self.choices) - 1)).type(torch.int64) + return self._integer_transformer.decode(x) + + # TODO: Maybe add a shift argument, could be useful to have `0` as midpoint # and `-0.5` as lower bound with `0.5` as upper bound. @dataclass class MinMaxNormalizer(TensorTransformer, Generic[V]): original_domain: Domain[V] + bins: int | None = None domain: Domain[float] = field(init=False) def __post_init__(self): - self.domain = UNIT_FLOAT_DOMAIN + if self.bins is None: + self.domain = UNIT_FLOAT_DOMAIN + else: + self.domain = Domain.float(0.0, 1.0, bins=self.bins) @override def encode( @@ -128,7 +168,7 @@ def encode( else: dtype = torch.float64 if dtype is None else dtype - values = torch.tensor(list(x), dtype=dtype, device=device) + values = torch.tensor(x, dtype=dtype, device=device) values = self.domain.cast(values, frm=self.original_domain) if out is None: return values @@ -251,7 +291,10 @@ def encode( return buffer def pack( - self, x: Sequence[Mapping[str, Any]], *, device: torch.device | None = None + self, + x: Sequence[Mapping[str, Any]], + *, + device: torch.device | None = None, ) -> TensorPack: return TensorPack(self.encode(x, device=device), self) @@ -269,15 +312,27 @@ def unpack(self, x: torch.Tensor) -> list[dict[str, Any]]: ] @classmethod - def default(cls, parameters: Mapping[str, Parameter]) -> TensorEncoder: + def default( + cls, + parameters: Mapping[str, Parameter], + *, + custom_transformers: dict[str, TensorTransformer] | None = None, + ) -> TensorEncoder: + custom = custom_transformers or {} sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} for name, hp in sorted_params: - if isinstance(hp, FloatParameter | IntegerParameter): - transformers[name] = MinMaxNormalizer(hp.domain) - else: - assert isinstance(hp, CategoricalParameter) - transformers[name] = CategoricalToIntegerTransformer(hp.choices) + if name in custom: + transformers[name] = custom[name] + continue + + match hp: + case FloatParameter() | IntegerParameter(): + transformers[name] = MinMaxNormalizer(hp.domain) + case CategoricalParameter(): + transformers[name] = CategoricalToIntegerTransformer(hp.choices) + case _: + raise ValueError(f"Unsupported parameter type: {type(hp)}") return TensorEncoder(transformers) diff --git a/neps/state/neps_state.py 
b/neps/state/neps_state.py index dd7d9279..056df9b4 100644 --- a/neps/state/neps_state.py +++ b/neps/state/neps_state.py @@ -111,12 +111,18 @@ def sample_trial( # NOTE: We don't want optimizers mutating this before serialization budget = opt_state.budget.clone() if opt_state.budget is not None else None - sampled_config, new_opt_state = optimizer.ask( + sampled_config_maybe_new_opt_state = optimizer.ask( trials=trials, budget_info=budget, optimizer_state=opt_state.shared_state, ) + if isinstance(sampled_config_maybe_new_opt_state, tuple): + sampled_config, new_opt_state = sampled_config_maybe_new_opt_state + else: + sampled_config = sampled_config_maybe_new_opt_state + new_opt_state = opt_state.shared_state + if sampled_config.previous_config_id is not None: previous_trial = trials.get(sampled_config.previous_config_id) if previous_trial is None: diff --git a/neps/state/trial.py b/neps/state/trial.py index 0360300c..05d2e129 100644 --- a/neps/state/trial.py +++ b/neps/state/trial.py @@ -132,7 +132,7 @@ class Trial: MetaData: ClassVar = MetaData NotReportedYetError: ClassVar = NotReportedYetError - config: dict[str, Any] + config: Mapping[str, Any] metadata: MetaData state: State report: Report | None From e9318785662de3dd1702837b6efa40867f6c09b3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 09:42:17 +0200 Subject: [PATCH 43/63] fix(ifbo): bin count for budget index --- neps/optimizers/multi_fidelity/ifbo.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 066a0f99..42838a52 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -118,15 +118,11 @@ def __init__( # TODO: We want it to be evenly divided by step size, so we need # to add something to the minimum fidelity to ensure this. - - # NOTE: The PFN model expects fidelities to be normalized between 0 and 1, - # hence, we make sure to do min-max normalization but include the number of bins. - # Also, it expects it specifically in column 1 so we can't include it with the configs maybe_bins = math.ceil((self._max_budget - self._min_budget) / self.step_size) + 1 match pipeline_space.fidelity: case IntegerParameter(): assert pipeline_space.fidelity.domain.cardinality is not None - bins = max(maybe_bins, pipeline_space.fidelity.domain.cardinality) + bins = min(maybe_bins, pipeline_space.fidelity.domain.cardinality) case FloatParameter(): bins = maybe_bins case _: From becd324eac9b0f985c7d5ab989a37711bae44a21 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 09:43:36 +0200 Subject: [PATCH 44/63] optim: Use torch operation for validation --- neps/optimizers/multi_fidelity/ifbo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 42838a52..5ff2e03d 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -208,7 +208,7 @@ def ask( device=self.device, dtype=FTPFN_DTYPE, ) - if not all(0 <= y <= 1.0 for y in minimize_ys): + if minimize_ys.max() > 1 or minimize_ys.min() < 0: raise RuntimeError( "ifBO requires that all loss values reported lie in the interval [0, 1]" " but recieved loss value outside of that range!" 
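One way to read the bin-count fix above: an integer fidelity should never get more budget levels than it has distinct values, so the step-derived count is capped with `min`. A small self-contained sketch, with made-up numbers chosen so the two quantities differ:

import math

min_budget, max_budget, step_size = 1, 9, 0.5     # made-up integer fidelity 1..9
cardinality = max_budget - min_budget + 1         # 9 distinct integer values

maybe_bins = math.ceil((max_budget - min_budget) / step_size) + 1   # 17 grid points
bins = min(maybe_bins, cardinality)               # capped at 9; the previous `max` would give 17

print(maybe_bins, bins)                           # 17 9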
From 5149034252bcbff3221636972b36168b9e9bc8dc Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 13:24:01 +0200 Subject: [PATCH 45/63] refactor: Cleanup --- .../acquisition_functions/mf_pi.py | 208 ------------------ .../freeze_thaw_sampler.py | 146 ------------ neps/optimizers/multi_fidelity/ifbo.py | 167 +++++++------- neps/optimizers/multi_fidelity/utils.py | 53 ----- 4 files changed, 90 insertions(+), 484 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py delete mode 100644 neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py b/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py deleted file mode 100644 index 75c7f1e3..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/mf_pi.py +++ /dev/null @@ -1,208 +0,0 @@ -# type: ignore -from __future__ import annotations - -from collections.abc import Iterable -from typing import TYPE_CHECKING, Any - -import numpy as np -import torch - -from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, -) -from neps.optimizers.multi_fidelity.utils import ( - MFObservedData, - get_freeze_thaw_normalized_step, - get_tokenized_data, -) -from neps.optimizers.utils import map_real_hyperparameters_from_tabular_ids - -if TYPE_CHECKING: - import pandas as pd - - from neps.search_spaces.search_space import SearchSpace - - -class MFPI(BaseAcquisition): - def __init__( - self, - pipeline_space: SearchSpace, - surrogate_model_name: str | None = None, - ): - super().__init__() - self.pipeline_space = pipeline_space - self.surrogate_model_name = surrogate_model_name - self.surrogate_model = None - self.observations = None - self.b_step = None - - def set_state( - self, - pipeline_space: SearchSpace, - surrogate_model: Any, - observations: MFObservedData, - b_step: int | float, - **kwargs, - ): - # overload to select incumbent differently through observations - self.pipeline_space = pipeline_space - self.surrogate_model = surrogate_model - self.observations = observations - self.b_step = b_step - - def preprocess(self, x: pd.Series) -> tuple[pd.Series, torch.Tensor]: - """Prepares the configurations for appropriate EI calculation. - - Takes a set of points and computes the budget and incumbent for each point, as - required by the multi-fidelity Expected Improvement acquisition function. 
- """ - raise NotImplementedError - - def eval(self, x: pd.Series, asscalar: bool = False) -> tuple[np.ndarray, pd.Series]: - # deepcopy - # _x = pd.Series([deepcopy(x.loc[idx]) for idx in x.index.values], index=x.index) - if self.surrogate_model_name == "ftpfn": - # preprocesses configs to have the appropriate fidelity values for acquisition - _x, inc_list = self.preprocess(x.copy()) - _x_tok = get_tokenized_data(_x.values, ignore_fidelity=True) - # padding IDs - _idx = torch.Tensor(_x.index.values + 1) - idx_mask = np.where(_idx > max(self.observations.seen_config_ids))[0] - _idx[idx_mask] = 0 - # normalizing steps - _steps = torch.Tensor( - [ - get_freeze_thaw_normalized_step( - _conf.fidelity.value, - self.pipeline_space.fidelity.lower, - self.pipeline_space.fidelity.upper, - self.b_step, - ) - for _conf in _x - ] - ) - _x_tok = torch.hstack( - ((_idx).reshape(-1, 1), _steps.reshape(-1, 1), torch.Tensor(_x_tok)) - ) - pi = self.eval_pfn_pi(_x_tok, inc_list) - else: - raise ValueError( - f"Unrecognized surrogate model name: {self.surrogate_model_name}" - ) - if pi.is_cuda: - pi = pi.cpu() - if len(_x) > 1 and asscalar: - return pi.detach().numpy(), _x - return pi.detach().numpy().item(), _x - - def eval_pfn_pi( - self, x: Iterable, inc_list: Iterable - ) -> np.ndarray | torch.Tensor | float: - """PFN-PI modified to preprocess samples and accept list of incumbents.""" - pi = self.surrogate_model.get_pi(x.to(self.surrogate_model.device), inc_list) - if len(pi.shape) == 2: - pi = pi.flatten() - return pi - - -class MFPI_Random(MFPI): - BUDGET = 1000 - - def __init__( - self, - pipeline_space: SearchSpace, - horizon: str = "random", - threshold: str = "random", - surrogate_model_name: str | None = None, - ): - super().__init__(pipeline_space, surrogate_model_name) - self.horizon = horizon - self.threshold = threshold - - def set_state( - self, - pipeline_space: SearchSpace, - surrogate_model: Any, - observations: MFObservedData, - b_step: int | float, - seed: int = 42, - ) -> None: - # set RNG - self.rng = np.random.RandomState(seed=seed) - - # TODO: wut is this? - for _i in range(len(observations.completed_runs)): - self.rng.uniform(-4, -1) - self.rng.randint(1, 51) - - return super().set_state(pipeline_space, surrogate_model, observations, b_step) - - def sample_horizon(self, steps_passed): - if self.horizon == "random": - shortest = self.pipeline_space.fidelity.lower - longest = min(self.pipeline_space.fidelity.upper, self.BUDGET - steps_passed) - return self.rng.randint(shortest, longest + 1) - if self.horizon == "max": - return min(self.pipeline_space.fidelity.upper, self.BUDGET - steps_passed) - return int(self.horizon) - - def sample_performance_threshold(self, f_inc): - if self.threshold == "random": - lu = 10 ** self.rng.uniform(-4, -1) # % of gap closed - else: - lu = float(self.threshold) - return f_inc * (1 - lu) - - def preprocess(self, x: pd.Series) -> tuple[pd.Series, torch.Tensor]: - """Prepares the configurations for appropriate EI calculation. - - Takes a set of points and computes the budget and incumbent for each point, as - required by the multi-fidelity acquisition function. 
- """ - if self.pipeline_space.has_tabular: - # preprocess tabular space differently - # expected input: IDs pertaining to the tabular data - x = map_real_hyperparameters_from_tabular_ids(x, self.pipeline_space) - - indices_to_drop = [] - inc_list = [] - - steps_passed = len(self.observations.completed_runs) - - # Like EI-AtMax, use the global incumbent as a basis for the EI threshold - inc_value = min(self.observations.get_best_performance_for_each_budget()) - - # Extension: Add a random min improvement threshold to encourage high risk high gain - t_value = self.sample_performance_threshold(inc_value) - inc_value = t_value - - # Like MFEI: set fidelities to query using horizon as self.b_step - # Extension: Unlike DyHPO, we sample the horizon randomly over the full range - horizon = self.sample_horizon(steps_passed) - - for i, config in x.items(): - if i <= max(self.observations.seen_config_ids): - if np.equal(config.fidelity.value, config.fidelity.upper): - # this training run has ended, drop it from future selection - indices_to_drop.append(i) - else: - # a candidate partial training run to continue - config.update_hp_values( - { - config.fidelity_name: min( - config.fidelity.value + horizon, config.fidelity.upper - ) # if horizon exceeds max, query at max - } - ) - inc_list.append(inc_value) - else: - # a candidate new training run that we would need to start - config.update_hp_values({config.fidelity_name: horizon}) - inc_list.append(inc_value) - - # Drop unused configs - x = x.drop(labels=indices_to_drop) - - assert len(inc_list) == len(x) - - return x, torch.Tensor(inc_list) diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py deleted file mode 100644 index cae5bee8..00000000 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/freeze_thaw_sampler.py +++ /dev/null @@ -1,146 +0,0 @@ -from __future__ import annotations - -import warnings -from collections.abc import Callable -from copy import deepcopy -from typing import TYPE_CHECKING - -import numpy as np -import pandas as pd - -from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, -) - -if TYPE_CHECKING: - from neps.optimizers.multi_fidelity.utils import MFObservedData - from neps.search_spaces.search_space import SearchSpace - -SAMPLES_TO_DRAW = ( - 100 # number of random samples to draw for optimizing acquisition function -) - - -class FreezeThawSampler(AcquisitionSampler): - def __init__(self, samples_to_draw: int | None = None, **kwargs): - super().__init__(**kwargs) - self.observations = None - self.b_step = None - self.n = None - self.pipeline_space = None - # args to manage tabular spaces/grid - self.is_tabular = False # flag is set by `set_state()` - self.sample_full_table = None - self.samples_to_draw = ( - samples_to_draw if samples_to_draw is not None else SAMPLES_TO_DRAW - ) - self.set_sample_full_tabular(True) # sets flag that samples full table - - def set_sample_full_tabular(self, flag: bool = False): - if self.is_tabular: - self.sample_full_table = flag - - def _sample_new( - self, - index_from: int, - n: int | None = None, - ignore_fidelity: bool = False, - ) -> pd.Series: - n = n if n is not None else self.samples_to_draw - assert self.pipeline_space is not None - new_configs = [ - self.pipeline_space.sample( - patience=self.patience, user_priors=False, ignore_fidelity=ignore_fidelity - ) - for _ in range(n) - ] - - return 
pd.Series( - new_configs, index=range(index_from, index_from + len(new_configs)) - ) - - def sample( - self, - acquisition_function: Callable | None = None, - n: int | None = None, - set_new_sample_fidelity: int | float | None = None, - ) -> pd.Series: - """Samples a new set and returns the total set of observed + new configs.""" - assert self.observations is not None - assert self.pipeline_space is not None - - partial_configs = self.observations.get_partial_configs_at_max_seen() - - _n = n if n is not None else self.samples_to_draw - if self.is_tabular: - assert self.pipeline_space.custom_grid_table is not None - # handles tabular data such that the entire unseen set of configs from the - # table is considered to be the new set of candidates - _partial_ids = {conf["id"].value for conf in partial_configs} - _all_ids = set(self.pipeline_space.custom_grid_table.keys()) - - # accounting for unseen configs only, samples remaining table if flag is set - max_n = len(_all_ids) + 1 if self.sample_full_table else _n - _n = min(max_n, len(_all_ids - _partial_ids)) - - _new_configs = np.random.choice( - list(_all_ids - _partial_ids), size=_n, replace=False - ) - placeholder_config = self.pipeline_space.sample( - patience=self.patience, user_priors=False, ignore_fidelity=False - ) - _configs = [placeholder_config.clone() for _id in _new_configs] - for _i, val in enumerate(_new_configs): - _configs[_i]["id"].set_value(val) - - new_configs = pd.Series( - _configs, - index=np.arange( - len(partial_configs), len(partial_configs) + len(_new_configs) - ), - ) - else: - # handles sampling new configurations for continuous spaces - new_configs = self._sample_new( - index_from=self.observations.next_config_id(), n=_n, ignore_fidelity=False - ) - # Continuous benchmarks need to deepcopy individual configs here, - # because in contrast to tabular benchmarks - # they are not reset in every sampling step - - # TODO: I do not know what the f p_config_ is meant to be so I don't know - # if we have a specific clone method or not... 
- partial_configs = pd.Series( - [deepcopy(p_config_) for idx, p_config_ in partial_configs.items()], - index=partial_configs.index, - ) - - # Updating fidelity values - new_fid = ( - set_new_sample_fidelity - if set_new_sample_fidelity is not None - else self.pipeline_space.fidelity.lower - ) - for config in new_configs: - config.update_hp_values({config.fidelity_name: new_fid}) - - return pd.concat([deepcopy(partial_configs), new_configs]) - - def set_state( - self, - pipeline_space: SearchSpace, - observations: MFObservedData, - b_step: int, - n: int | None = None, - ) -> None: - # overload to select incumbent differently through observations - self.pipeline_space = pipeline_space - self.observations = observations - self.b_step = b_step - self.n = n if n is not None else self.samples_to_draw - if ( - hasattr(self.pipeline_space, "custom_grid_table") - and self.pipeline_space.custom_grid_table is not None - ): - self.is_tabular = True - self.set_sample_full_tabular(True) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 5ff2e03d..e7caa38d 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -15,8 +15,29 @@ from neps.state.optimizer import BudgetInfo -def _select_trials(trials: Mapping[str, Trial]) -> Mapping[str, Trial]: - return { +# NOTE: Ifbo was trained using 32 bit +FTPFN_DTYPE = torch.float32 + + +def tokenize( + ids: torch.Tensor, + budgets: torch.Tensor, + configs: torch.Tensor, +) -> torch.Tensor: + return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) + + +def _encode_ftpfn( + trials: Mapping[str, Trial], + encoder: TensorEncoder, + space: SearchSpace, + budget_domain: Domain, + device: torch.device | None = None, + dtype: torch.dtype = FTPFN_DTYPE, +) -> tuple[torch.Tensor, torch.Tensor]: + # TODO: Currently we do not handle error cases, we can't use NaN as that + # is what we use for trials that have no loss yet, i.e. pending trials. + selected = { trial_id: trial for trial_id, trial in trials.items() if trial.state @@ -27,6 +48,45 @@ def _select_trials(trials: Mapping[str, Trial]) -> Mapping[str, Trial]: Trial.State.CORRUPTED, ) } + assert space.fidelity_name is not None + assert space.fidelity is not None + train_configs = encoder.encode([t.config for t in selected.values()], device=device) + ids = torch.tensor( + [int(config_id.split("_", maxsplit=1)[0]) for config_id in selected.keys()], + device=device, + dtype=torch.float64, + ) + ids = ids + train_fidelities = torch.tensor( + [t.config[space.fidelity_name] for t in selected.values()], + device=device, + dtype=torch.float64, + ) + train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) + X = tokenize( + ids=torch.tensor(ids, device=device), budgets=train_budgets, configs=train_configs + ).to(dtype) + + # TODO: Document that it's on the user to ensure these are already all bounded + # We could possibly include some bounded transform to assert this. + minimize_ys = torch.tensor( + [ + trial.report.loss + if trial.report is not None and trial.report.loss is not None + else np.nan + for trial in trials.values() + ], + device=device, + dtype=FTPFN_DTYPE, + ) + if minimize_ys.max() > 1 or minimize_ys.min() < 0: + raise RuntimeError( + "ifBO requires that all loss values reported lie in the interval [0, 1]" + " but recieved loss value outside of that range!" 
+ f"\n{minimize_ys}" + ) + maximize_ys = 1 - minimize_ys + return X, maximize_ys def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: @@ -46,18 +106,6 @@ def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: return sorted_x[ii] -# NOTE: Ifbo was trained using 32 bit -FTPFN_DTYPE = torch.float32 - - -def tokenize( - ids: torch.Tensor, - budgets: torch.Tensor, - configs: torch.Tensor, -) -> torch.Tensor: - return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) - - class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" @@ -71,6 +119,7 @@ def __init__( # arguments for model surrogate_model_args: dict | None = None, initial_design_size: int | None = None, + n_acquisition_new_configs: int = 1_000, device: torch.device | None = None, **kwargs: Any, # TODO: Remove this ): @@ -100,6 +149,7 @@ def __init__( self.surrogate_model_args = surrogate_model_args or {} self.device = device self.n_initial_design: int | None = initial_design_size + self.n_acquisition_new_configs = n_acquisition_new_configs self._min_budget: int | float = pipeline_space.fidelity.lower self._max_budget: int | float = pipeline_space.fidelity.upper @@ -151,12 +201,7 @@ def ask( if seed is not None: raise NotImplementedError("Seed is not yet implemented for IFBO") - trials = _select_trials(trials) - - ids = [ - int(trial.metadata.id.split("_", maxsplit=1)[0]) for trial in trials.values() - ] - n_unique_ids = len(set(ids)) + ids = [int(config_id.split("_", maxsplit=1)[0]) for config_id in trials.keys()] new_id = max(ids) + 1 if len(ids) > 0 else 0 # If we havn't passed the intial design phase @@ -173,9 +218,8 @@ def ask( ), ) - if n_unique_ids < len(self._initial_design): - config = self._initial_design[n_unique_ids] - return SampledConfig(id=f"{new_id}_0", config=config) + if new_id < len(self._initial_design): + return SampledConfig(id=f"{new_id}_0", config=self._initial_design[new_id]) # Otherwise, we proceed to surrogate phase ftpfn = FTPFNSurrogate( @@ -183,42 +227,20 @@ def ask( version=self.surrogate_model_args.get("version", "0.0.1"), device=self.device, ) - - # NOTE: `0` is reserved in PFN, we add an additional +1 to all ids - train_ids = torch.tensor(ids, dtype=FTPFN_DTYPE, device=self.device) + 1 - train_configs = self._ftpfn_encoder.encode([t.config for t in trials.values()]) - - train_fidelities = [t.config[self._fidelity_name] for t in trials.values()] - train_budgets = self._budget_domain.cast( - torch.tensor(train_fidelities, device=self.device, dtype=FTPFN_DTYPE), - frm=self._fid_domain, - ) - x_train = tokenize(ids=train_ids, budgets=train_budgets, configs=train_configs) - x_train = x_train.to(FTPFN_DTYPE) - - # TODO: Document that it's on the user to ensure these are already all bounded - # We could possibly include some bounded transform to assert this. - minimize_ys = torch.tensor( - [ - trial.report.loss - if trial.report is not None and trial.report.loss is not None - else np.nan - for trial in trials.values() - ], + x_train, maximize_ys = _encode_ftpfn( + trials=trials, + encoder=self._ftpfn_encoder, + space=self.pipeline_space, + budget_domain=self._budget_domain, device=self.device, - dtype=FTPFN_DTYPE, ) - if minimize_ys.max() > 1 or minimize_ys.min() < 0: - raise RuntimeError( - "ifBO requires that all loss values reported lie in the interval [0, 1]" - " but recieved loss value outside of that range!" 
- f"\n{minimize_ys}" - ) - # Invert the ys - maximize_ys = 1 - minimize_ys + x_train[:, 1] = x_train[:, 1] + 1 # PFN uses `0` id for test configurations + + # Get the best performance so far maximize_best_y = maximize_ys.max().item() - is_pending = minimize_ys.isnan() + # Fantasize the result of pending trials + is_pending = maximize_ys.isnan() maximize_ys[is_pending] = ftpfn.get_mean_performance( train_x=x_train[~is_pending], train_y=maximize_ys[~is_pending], @@ -226,9 +248,6 @@ def ask( ) rng = np.random.RandomState(seed) - n_rand = 1_000 # TODO: parametrize - - # TODO: Could also sample from a prior... uniform = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) # We sample the horizon in terms of step numbers to take @@ -244,19 +263,16 @@ def ask( frm=self._budget_index_domain, ) - # Now let's create the random configurations - rand_configs = uniform.sample( - n=n_rand, - to=self._ftpfn_encoder.domains, - seed=None, # TODO - device=self.device, - ).to(FTPFN_DTYPE) - # We give them all the special 0 id, as well as set the budget accordinly acq_new = tokenize( - ids=torch.zeros(n_rand, dtype=FTPFN_DTYPE, device=self.device), - budgets=torch.zeros(n_rand, dtype=FTPFN_DTYPE, device=self.device), - configs=rand_configs, + ids=torch.zeros(self.n_acquisition_new_configs, device=self.device), + budgets=torch.zeros(self.n_acquisition_new_configs, device=self.device), + configs=uniform.sample( + n=self.n_acquisition_new_configs, + to=self._ftpfn_encoder.domains, + seed=None, # TODO + device=self.device, + ), ) # Construct all our samples for acqusition: @@ -284,15 +300,12 @@ def ask( # Now get the PI of these samples lu = 10 ** rng.uniform(-4, -1) f_inc = maximize_best_y * (1 - lu) + n_acq_samples = len(acq_samples) pi_new_samples = ftpfn.get_pi( - train_x=x_train.to(FTPFN_DTYPE), - train_y=maximize_ys.to(FTPFN_DTYPE), - test_x=acq_samples.to(FTPFN_DTYPE), - y_best=torch.full( - size=(len(acq_samples),), - fill_value=f_inc, - dtype=FTPFN_DTYPE, - ), + train_x=x_train, + train_y=maximize_ys, + test_x=acq_samples, + y_best=torch.full(size=(n_acq_samples,), fill_value=f_inc, dtype=FTPFN_DTYPE), ) best_ix = pi_new_samples.argmax() diff --git a/neps/optimizers/multi_fidelity/utils.py b/neps/optimizers/multi_fidelity/utils.py index b9d6e174..0158fbdf 100644 --- a/neps/optimizers/multi_fidelity/utils.py +++ b/neps/optimizers/multi_fidelity/utils.py @@ -4,7 +4,6 @@ from collections.abc import Sequence from copy import deepcopy from typing import TYPE_CHECKING, Any -from typing_extensions import Self import numpy as np import pandas as pd @@ -354,58 +353,6 @@ def get_max_observed_fidelity_level_per_config(self) -> pd.Series: def token_ids(self) -> np.ndarray: return self.df.index.values - @classmethod - def from_trials( - cls, - trials: Mapping[str, Trial], - *, - # TODO: We should store dicts, not the SearchSpace object... 
- # Once done, we can remove this - space: SearchSpace, - on_error: Literal["ignore"] | float = "ignore", - ) -> Self: - observed_configs = cls( - columns=["config", "perf", "learning_curves"], - index_names=["config_id", "budget_id"], - ) - - records: list[dict[str, Any]] = [] - for trial_id, trial in trials.items(): - _config_id, _budget_id = trial_id.split("_") - - if trial.report is None: - loss = np.nan - lc = [np.nan] - elif trial.report.loss is None: - assert trial.report.err is not None - if on_error == "ignore": - return None - - loss = on_error - lc = [on_error] - elif trial.report.loss is not None: - loss = trial.report.loss - assert trial.report.learning_curve is not None - lc = trial.report.learning_curve - - records.append( - { - "config_id": int(_config_id), - "budget_id": int(_budget_id), - # NOTE: Behavoiour around data in this requires that the dataframe stores - # `SearchSpace` objects and not dictionaries - "config": space.from_dict(trial.config), - "perf": loss, - "learning_curves": lc, - } - ) - - observed_configs.df = pd.DataFrame.from_records( - records, - index=["config_id", "budget_id"], - ).sort_index() - return observed_configs - if __name__ == "__main__": # TODO: Either delete these or convert them to tests (karibbov) From 307fc4508291c1fd283254f3085cf19f817bf30b Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 13:24:34 +0200 Subject: [PATCH 46/63] fix: Remove the removed MFPI_Random --- .../acquisition_functions/__init__.py | 7 ------- neps/optimizers/multi_fidelity/mf_bo.py | 1 - 2 files changed, 8 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py index a125997d..03d41f6a 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/__init__.py @@ -4,7 +4,6 @@ from neps.optimizers.bayesian_optimization.acquisition_functions.ei import ( ComprehensiveExpectedImprovement, ) -from neps.optimizers.bayesian_optimization.acquisition_functions.mf_pi import MFPI_Random from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) @@ -31,11 +30,6 @@ in_fill="posterior", augmented_ei=True, ), - "MFPI-random": partial( - MFPI_Random, - threshold="random", - horizon="random", - ), "UCB": partial( UpperConfidenceBound, maximize=False, @@ -47,5 +41,4 @@ "ComprehensiveExpectedImprovement", "UpperConfidenceBound", "DecayingPriorWeightedAcquisition", - "MFPI_Random", ] diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index 729e3718..e6205d00 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -5,7 +5,6 @@ import torch -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate from neps.optimizers.multi_fidelity.utils import ( MFObservedData, From d162d1f89e7c776283b905c623fec9438f940ce5 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 14:01:49 +0200 Subject: [PATCH 47/63] fix: Use budget_domain bounds where possible --- neps/optimizers/multi_fidelity/ifbo.py | 184 +++++++++++++++---------- 1 file changed, 108 insertions(+), 76 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index e7caa38d..7b74a8d3 100755 --- 
a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,4 +1,4 @@ -from typing import Any, Mapping +from typing import Any, Mapping, Literal import math import numpy as np @@ -7,6 +7,7 @@ from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate from neps.optimizers.intial_design import make_initial_design +from neps.sampling.priors import Prior from neps.sampling.samplers import Sampler from neps.search_spaces.domain import Domain from neps.search_spaces.encoding import CategoricalToUnitNorm, TensorEncoder @@ -19,7 +20,7 @@ FTPFN_DTYPE = torch.float32 -def tokenize( +def _tokenize( ids: torch.Tensor, budgets: torch.Tensor, configs: torch.Tensor, @@ -27,7 +28,7 @@ def tokenize( return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) -def _encode_ftpfn( +def _encode_for_ftpfn( trials: Mapping[str, Trial], encoder: TensorEncoder, space: SearchSpace, @@ -63,7 +64,7 @@ def _encode_ftpfn( dtype=torch.float64, ) train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) - X = tokenize( + X = _tokenize( ids=torch.tensor(ids, device=device), budgets=train_budgets, configs=train_configs ).to(dtype) @@ -106,6 +107,36 @@ def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: return sorted_x[ii] +def _acquire_pfn( + train_x: torch.Tensor, + train_y: torch.Tensor, + test_x: torch.Tensor, + ftpfn: FTPFNSurrogate, + y_to_beat: float, + how: Literal["pi", "ei", "ucb", "lcb"], +) -> torch.Tensor: + match how: + case "pi": + y_best = torch.full( + size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE + ) + return ftpfn.get_pi(train_x, train_y, test_x, y_best=y_best) + case "ei": + y_best = torch.full( + size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE + ) + return ftpfn.get_ei(train_x, train_y, test_x, y_best=y_best) + case "ucb": + y_best = torch.full( + size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE + ) + return ftpfn.get_ucb(train_x, train_y, test_x) + case "lcb": + return ftpfn.get_lcb(train_x, train_y, test_x) + case _: + raise ValueError(f"Unknown acquisition function {how}") + + class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" @@ -127,8 +158,7 @@ def __init__( Args: pipeline_space: Space in which to search - use_priors: Allows random samples to be generated from a default - Samples generated from a Gaussian centered around the default value + step_size: The size of the step to take in the fidelity domain. 
sampling_policy: The type of sampling procedure to use promotion_policy: The type of promotion procedure to use sample_default_first: Whether to sample the default configuration first @@ -154,17 +184,19 @@ def __init__( self._min_budget: int | float = pipeline_space.fidelity.lower self._max_budget: int | float = pipeline_space.fidelity.upper self._fidelity_name: str = pipeline_space.fidelity_name + self._initial_design: list[dict[str, Any]] | None = None + + params = {**self.pipeline_space.numerical, **self.pipeline_space.categoricals} + self._prior = Prior.from_parameters(params) if use_priors else None self._ftpfn_encoder: TensorEncoder = TensorEncoder.default( - { - **self.pipeline_space.numerical, - **self.pipeline_space.categoricals, - }, + params, + # FTPFN doesn't support categoricals and we were recomenned to just evenly distribute + # in the unit norm custom_transformers={ cat_name: CategoricalToUnitNorm(choices=cat.choices) for cat_name, cat in self.pipeline_space.categoricals.items() }, ) - self._initial_design: list[dict[str, Any]] | None = None # TODO: We want it to be evenly divided by step size, so we need # to add something to the minimum fidelity to ensure this. @@ -189,7 +221,7 @@ def __init__( # Domain from which we assign an index to each budget # Automatically takes care of rounding - self._budget_index_domain = Domain.indices(bins) + self._budget_ix_domain = Domain.indices(bins) def ask( self, @@ -210,7 +242,7 @@ def ask( space=self.pipeline_space, encoder=self._ftpfn_encoder, sample_default_first=self.sample_default_first, - sampler="sobol", + sampler="sobol" if self._prior is None else self._prior, seed=seed, sample_fidelity="min", sample_size=( @@ -227,17 +259,15 @@ def ask( version=self.surrogate_model_args.get("version", "0.0.1"), device=self.device, ) - x_train, maximize_ys = _encode_ftpfn( + x_train, maximize_ys = _encode_for_ftpfn( trials=trials, encoder=self._ftpfn_encoder, space=self.pipeline_space, budget_domain=self._budget_domain, device=self.device, ) - x_train[:, 1] = x_train[:, 1] + 1 # PFN uses `0` id for test configurations - - # Get the best performance so far - maximize_best_y = maximize_ys.max().item() + # PFN uses `0` id for test configurations, we remove this later + x_train[:, 1] = x_train[:, 1] + 1 # Fantasize the result of pending trials is_pending = maximize_ys.isnan() @@ -247,69 +277,74 @@ def ask( test_x=x_train[is_pending], ) + # We then sample a horizon, minimum one budget index increment and cast + # to the budget domain expected by the ftpfn model rng = np.random.RandomState(seed) - uniform = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) - - # We sample the horizon in terms of step numbers to take - lower_index = self._budget_index_domain.lower - upper_index = self._budget_index_domain.upper - # The plus 1 here is because we want to sample that at least one step - # should be taken. 
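A standalone sketch of the horizon sampling described above, assuming the budget-index domain is simply {0, ..., n_bins - 1} and the FTPFN budget domain is [1/max_budget, 1], with a plain linear rescale standing in for `Domain.cast_one` (all concrete numbers are made up):

import numpy as np

# Hypothetical setup: 20 fidelity steps, max_budget = 20, so the FTPFN budget
# domain is [1/20, 1] and the index domain is {0, ..., 19}.
n_bins, max_budget = 20, 20
idx_lo, idx_hi = 0, n_bins - 1
bud_lo, bud_hi = 1 / max_budget, 1.0

rng = np.random.RandomState(42)

# numpy's randint(low, high) excludes high; the +1 guarantees at least one step.
horizon_index = rng.randint(idx_lo, idx_hi) + 1

# Linear rescale from the index domain to the normalized budget domain
# (a simplification of what Domain.cast_one does in neps).
horizon = bud_lo + (horizon_index - idx_lo) / (idx_hi - idx_lo) * (bud_hi - bud_lo)
print(horizon_index, horizon)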
- horizon_index_increment = rng.randint(lower_index, upper_index) + 1 - - # We then normalize it to FTPFN normalized budget domain + lower_index = self._budget_ix_domain.lower + upper_index = self._budget_ix_domain.upper horizon = self._budget_domain.cast_one( - horizon_index_increment, - frm=self._budget_index_domain, + rng.randint(lower_index, upper_index) + 1, + frm=self._budget_ix_domain, ) - # We give them all the special 0 id, as well as set the budget accordinly - acq_new = tokenize( + # Now we sample some new configurations into the domain expected by the FTPFN + if self._prior is not None: + acq_sampler = self._prior + else: + acq_sampler = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) + + new_acq_configs = acq_sampler.sample( + self.n_acquisition_new_configs, + to=self._ftpfn_encoder.domains, + device=self.device, + seed=None, # TODO + ) + acq_new = _tokenize( ids=torch.zeros(self.n_acquisition_new_configs, device=self.device), - budgets=torch.zeros(self.n_acquisition_new_configs, device=self.device), - configs=uniform.sample( - n=self.n_acquisition_new_configs, - to=self._ftpfn_encoder.domains, - seed=None, # TODO + budgets=torch.full( + size=(self.n_acquisition_new_configs,), + fill_value=self._budget_domain.lower, device=self.device, ), + configs=new_acq_configs, ) # Construct all our samples for acqusition: # 1. Take all non-pending configs - acq_train = x_train[~is_pending].clone().detach() + acq_continue_existing = x_train[~is_pending].clone().detach() - # 2. We only want to include the configuration rows - # that are at their highest budget, - # i.e. don't include config_0_0 and config_0_1 - acq_train = _remove_duplicates(acq_train) + # 2. We only want to include the configuration at their highest + # budget evaluated, i.e. don't include config_0_0 if config_0_1 is highest + acq_continue_existing = _remove_duplicates(acq_continue_existing) - # 3. Sub select all that are at a partial budget i.e. can evaluate further - # Note, it's important to do this after the above - partial_eval_mask = acq_train[:, 1] < 1 - acq_train = acq_train[partial_eval_mask] + # 3. Sub select all that are not fully evaluated + acq_continue_existing = acq_continue_existing[acq_continue_existing[:, 1] < 1] # 4. Add in the new sampled configurations - acq_samples = torch.vstack([acq_train, acq_new]) + acq_samples = torch.vstack([acq_continue_existing, acq_new]) - # 5. Add on the horizon to the budget, and clamping to maximum - # Note that we hold onto the intermediate unclamped budget for later + # 5. Add on the horizon to the budget unclamped_budgets = acq_samples[:, 1] + horizon - acq_samples[:, 1] = torch.clamp(unclamped_budgets, max=1) - # Now get the PI of these samples + # 6. 
Clamp to the maximum of the budget domain + acq_samples[:, 1] = torch.clamp(unclamped_budgets, max=self._budget_domain.upper) + + # Now get the PI of these samples according to MFPI_Random + maximize_best_y = maximize_ys.max().item() lu = 10 ** rng.uniform(-4, -1) f_inc = maximize_best_y * (1 - lu) - n_acq_samples = len(acq_samples) - pi_new_samples = ftpfn.get_pi( + + acq_scores = _acquire_pfn( train_x=x_train, - train_y=maximize_ys, + train_y=maximize_ys[~is_pending], test_x=acq_samples, - y_best=torch.full(size=(n_acq_samples,), fill_value=f_inc, dtype=FTPFN_DTYPE), + ftpfn=ftpfn, + y_to_beat=f_inc, + how="pi", ) - best_ix = pi_new_samples.argmax() # Extract out the row which had the best PI + best_ix = acq_scores.argmax() best_id = int(acq_samples[best_ix, 0].round().item()) best_vector = acq_samples[best_ix, 2:].unsqueeze(0) best_config = self._ftpfn_encoder.unpack(best_vector)[0] @@ -321,26 +356,23 @@ def ask( previous_config_id = None return SampledConfig(config_id, best_config, previous_config_id) - else: - # To calculate the next step to take in fidelity space, we remove the horizon - previous_budget_of_acquired_config = unclamped_budgets[best_ix] - horizon - - # Then we transform this: - # 1. Back to budget_index space - # 2. Increment it by one - # 3. Transform back to fidelity space - budget_ix = self._budget_index_domain.cast_one( - float(previous_budget_of_acquired_config), frm=self._budget_domain - ) - budget_ix += 1 - fid_value = self._fid_domain.cast_one( - budget_ix, frm=self._budget_index_domain - ) + # To get to the next fidelity value to provide, + # 1. Get the budget before we added the horizon + budget = float(unclamped_budgets[best_ix] - horizon) - real_best_id = best_id - 1 # NOTE: Remove the +1 we added to all ids - best_config[self._fidelity_name] = fid_value + # 2. Cast to budget index domain + budget_ix = self._budget_ix_domain.cast_one(budget, frm=self._budget_domain) - config_id = f"{real_best_id}_{budget_ix}" - previous_config_id = f"{real_best_id}_{budget_ix - 1}" + # 3. Increment it to the next budget index + budget_ix += 1 - return SampledConfig(config_id, best_config, previous_config_id) + # 4. And finally convert back into the fidelity domain + fid_value = self._fid_domain.cast_one(budget_ix, frm=self._budget_ix_domain) + + real_best_id = best_id - 1 # NOTE: Remove the +1 we added to all ids earlier + best_config[self._fidelity_name] = fid_value + + config_id = f"{real_best_id}_{budget_ix}" + previous_config_id = f"{real_best_id}_{budget_ix - 1}" + + return SampledConfig(config_id, best_config, previous_config_id) From 7e2a048b6b27a9667664c03d6753d6b5811f498f Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 14:24:19 +0200 Subject: [PATCH 48/63] fix: Increment lower bound of fidelity space to make divisble --- neps/optimizers/multi_fidelity/ifbo.py | 99 ++++++++++++++++++-------- 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 7b74a8d3..2d041df1 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -20,6 +20,59 @@ FTPFN_DTYPE = torch.float32 +def _adjust_pipeline_space_to_match_stepsize( + pipeline_space: SearchSpace, + step_size: int | float, +) -> tuple[SearchSpace, int]: + """Adjust the pipeline space to be evenly divisible by the step size. + + This is done by incrementing the lower bound of the fidelity domain to the + that enables this. 
+ + Args: + pipeline_space: The pipeline space to adjust + step_size: The size of the step to take in the fidelity domain. + + Returns: + The adjusted pipeline space and the number of bins it can be divided into + """ + fidelity = pipeline_space.fidelity + fidelity_name = pipeline_space.fidelity_name + assert fidelity_name is not None + assert isinstance(fidelity, FloatParameter | IntegerParameter) + if fidelity.log: + raise NotImplementedError("Log fidelity not yet supported") + + # Can't use mod since it's quite innacurate for floats + # Use the fact that we can always write x = n*k + r + # where k = stepsize and x = (fid_upper - fid_lower) + # x = n*k + r + # n = x // k + # r = x - n*k + x = fidelity.upper - fidelity.lower + n = int(x // step_size) + + if n <= 0: + raise ValueError( + f"Step size ({step_size}) is too large for the fidelity domain {fidelity}." + "Considering lowering this parameter to ifBO." + ) + + r = x - n * step_size + new_lower = fidelity.lower + r + new_fid = fidelity.__class__( + lower=new_lower, + upper=fidelity.upper, + log=fidelity.log, + default=fidelity.default, + default_confidence=fidelity.default_confidence_choice, + ) + return ( + SearchSpace(**{**pipeline_space.hyperparameters, fidelity_name: new_fid}), + n, + ) + + def _tokenize( ids: torch.Tensor, budgets: torch.Tensor, @@ -143,7 +196,7 @@ class IFBO(BaseOptimizer): def __init__( self, pipeline_space: SearchSpace, - step_size: int = 1, + step_size: int | float = 1, use_priors: bool = False, sample_default_first: bool = False, sample_default_at_target: bool = False, @@ -168,25 +221,30 @@ def __init__( device: Device to use for the model """ - assert pipeline_space.fidelity is not None - assert isinstance(pipeline_space.fidelity_name, str) + # TODO: I'm not sure how this might effect tables, whose lowest fidelity + # might be below to possibly increased lower bound. 
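A hypothetical worked example of the x = n*k + r adjustment performed by `_adjust_pipeline_space_to_match_stepsize` (the bounds are invented for illustration): with a fidelity range of [1, 20] and step_size = 3 we get x = 19, n = 6 and r = 1, so the lower bound is raised to 2 and the remaining range of 18 divides into exactly 6 steps of size 3.

lower, upper, step_size = 1, 20, 3           # hypothetical fidelity bounds
x = upper - lower                            # 19
n = int(x // step_size)                      # 6 full steps fit
r = x - n * step_size                        # 1 left over
new_lower = lower + r                        # 2
assert (upper - new_lower) == n * step_size  # 18 == 6 * 3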
+ space, fid_bins = _adjust_pipeline_space_to_match_stepsize( + pipeline_space, step_size + ) + assert space.fidelity is not None + assert isinstance(space.fidelity_name, str) - super().__init__(pipeline_space=pipeline_space) + super().__init__(pipeline_space=space) self.step_size = step_size self.use_priors = use_priors self.sample_default_first = sample_default_first self.sample_default_at_target = sample_default_at_target - self.surrogate_model_args = surrogate_model_args or {} self.device = device - self.n_initial_design: int | None = initial_design_size + self.n_initial_design = initial_design_size self.n_acquisition_new_configs = n_acquisition_new_configs + self.surrogate_model_args = surrogate_model_args or {} - self._min_budget: int | float = pipeline_space.fidelity.lower - self._max_budget: int | float = pipeline_space.fidelity.upper - self._fidelity_name: str = pipeline_space.fidelity_name + self._min_budget: int | float = space.fidelity.lower + self._max_budget: int | float = space.fidelity.upper + self._fidelity_name: str = space.fidelity_name self._initial_design: list[dict[str, Any]] | None = None - params = {**self.pipeline_space.numerical, **self.pipeline_space.categoricals} + params = {**space.numerical, **space.categoricals} self._prior = Prior.from_parameters(params) if use_priors else None self._ftpfn_encoder: TensorEncoder = TensorEncoder.default( params, @@ -194,34 +252,19 @@ def __init__( # in the unit norm custom_transformers={ cat_name: CategoricalToUnitNorm(choices=cat.choices) - for cat_name, cat in self.pipeline_space.categoricals.items() + for cat_name, cat in space.categoricals.items() }, ) - # TODO: We want it to be evenly divided by step size, so we need - # to add something to the minimum fidelity to ensure this. - maybe_bins = math.ceil((self._max_budget - self._min_budget) / self.step_size) + 1 - match pipeline_space.fidelity: - case IntegerParameter(): - assert pipeline_space.fidelity.domain.cardinality is not None - bins = min(maybe_bins, pipeline_space.fidelity.domain.cardinality) - case FloatParameter(): - bins = maybe_bins - case _: - raise NotImplementedError( - f"Fidelity type {type(pipeline_space.fidelity)} not supported" - ) - # Domain of fidelity values, i.e. what is given in the configs that we # give to the user to evaluate at. - self._fid_domain = pipeline_space.fidelity.domain + self._fid_domain = space.fidelity.domain # Domain in which we should pass budgets to ifbo model self._budget_domain = Domain.float(1 / self._max_budget, 1) # Domain from which we assign an index to each budget - # Automatically takes care of rounding - self._budget_ix_domain = Domain.indices(bins) + self._budget_ix_domain = Domain.indices(fid_bins) def ask( self, From 995bf9c7fdfe594303f6863788c25d914080da3f Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 14:36:07 +0200 Subject: [PATCH 49/63] doc(ifbo): Document how encoding works --- neps/optimizers/multi_fidelity/ifbo.py | 44 +++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 2d041df1..c4ac5a0c 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -89,6 +89,43 @@ def _encode_for_ftpfn( device: torch.device | None = None, dtype: torch.dtype = FTPFN_DTYPE, ) -> tuple[torch.Tensor, torch.Tensor]: + """Encode the trials into a format that the FTPFN model can understand. + + !!! 
warning "loss values reported" + + The `ys` are a single dimension but consist of the losses inverted to scores. + As result, we have to assert that the loss values provided in the trials are + in the range [0, 1]. + + !!! note "X layout" + + The layout of the X is: + + ``` + | config_id | budget (normalized from fidelity) | hp_1 | hp_2 | ... | hp_n | + ``` + + Here the `budget` is normalized to the range [0, 1] while the hp parameters + are encoded according to the provided encoder, which should map the parameter + values from the original domain to some domain in [0, 1]. + + !!! warning "Pending and Error trials" + + We currently do not handle error cases, **and they are ignored**. + For trials which do not have a loss reported yet, they are considered pending + and will have `torch.nan` as their score inside the returned y values. + + Args: + trials: The trials to encode + encoder: The encoder to use + space: The search space + budget_domain: The domain to use for the budgets of the FTPFN + device: The device to use + dtype: The dtype to use + + Returns: + The encoded trials and their corresponding **scores** + """ # TODO: Currently we do not handle error cases, we can't use NaN as that # is what we use for trials that have no loss yet, i.e. pending trials. selected = { @@ -110,7 +147,7 @@ def _encode_for_ftpfn( device=device, dtype=torch.float64, ) - ids = ids + ids = ids + 1 # We add one to all ids to make room for the test configurations train_fidelities = torch.tensor( [t.config[space.fidelity_name] for t in selected.values()], device=device, @@ -143,7 +180,7 @@ def _encode_for_ftpfn( return X, maximize_ys -def _remove_duplicates(x: torch.Tensor) -> torch.Tensor: +def _keep_highest_budget_evaluation(x: torch.Tensor) -> torch.Tensor: # Does a lexsort, same as if we sorted by (config_id, budget), where # theyre are sorted according to increasing config_id and then increasing budget. # x[i2] -> sorted by config id and budget @@ -358,7 +395,7 @@ def ask( # 2. We only want to include the configuration at their highest # budget evaluated, i.e. don't include config_0_0 if config_0_1 is highest - acq_continue_existing = _remove_duplicates(acq_continue_existing) + acq_continue_existing = _keep_highest_budget_evaluation(acq_continue_existing) # 3. 
Sub select all that are not fully evaluated acq_continue_existing = acq_continue_existing[acq_continue_existing[:, 1] < 1] @@ -417,5 +454,4 @@ def ask( config_id = f"{real_best_id}_{budget_ix}" previous_config_id = f"{real_best_id}_{budget_ix - 1}" - return SampledConfig(config_id, best_config, previous_config_id) From 9b1d050089b66ad8ed79c2e619065782c1351f4e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 15:19:55 +0200 Subject: [PATCH 50/63] fix(ifbo): Example running again --- .../acquisition_samplers/__init__.py | 2 - neps/optimizers/default_searchers/ifbo.yaml | 14 ++--- neps/optimizers/multi_fidelity/ifbo.py | 53 +++++++++++-------- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_samplers/__init__.py b/neps/optimizers/bayesian_optimization/acquisition_samplers/__init__.py index e3b12572..7f53780c 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_samplers/__init__.py +++ b/neps/optimizers/bayesian_optimization/acquisition_samplers/__init__.py @@ -1,5 +1,4 @@ from .evolution_sampler import EvolutionSampler -from .freeze_thaw_sampler import FreezeThawSampler from .mutation_sampler import MutationSampler from .random_sampler import RandomSampler @@ -7,5 +6,4 @@ "random": RandomSampler, "mutation": MutationSampler, "evolution": EvolutionSampler, - "freeze-thaw": FreezeThawSampler, } diff --git a/neps/optimizers/default_searchers/ifbo.yaml b/neps/optimizers/default_searchers/ifbo.yaml index 76522922..dda4ee8c 100644 --- a/neps/optimizers/default_searchers/ifbo.yaml +++ b/neps/optimizers/default_searchers/ifbo.yaml @@ -1,9 +1,11 @@ strategy: ifbo -surrogate_model: ftpfn surrogate_model_args: version: "0.0.1" -acquisition: MFPI-random -acquisition_sampler: freeze-thaw -acquisition_sampler_args: - samples_to_draw: 250 -model_policy: PFNSurrogate \ No newline at end of file + target_path: null # Defaults to current_working_directory/.model +step_size: 1 # Step size to use for partial evaluations +use_priors: false # Whether to use priors set through `default` and `default_confidence` +sample_default_first: false # Whether to sample the default configuration first +sample_default_at_target: false # Whether to evaluate the default at the maximum fidelity or not +initial_design_size: "ndim" # How many initial samples to try before using the model +n_acquisition_new_configs: 1_000 # Number samples of new configs to include during acqusition +device: null # Device to load the model on with torch diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index c4ac5a0c..cf5f27f5 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,6 +1,5 @@ from typing import Any, Mapping, Literal -import math import numpy as np import torch @@ -18,6 +17,8 @@ # NOTE: Ifbo was trained using 32 bit FTPFN_DTYPE = torch.float32 +ID_COL = 0 +BUDGET_COL = 1 def _adjust_pipeline_space_to_match_stepsize( @@ -65,6 +66,7 @@ def _adjust_pipeline_space_to_match_stepsize( upper=fidelity.upper, log=fidelity.log, default=fidelity.default, + is_fidelity=True, default_confidence=fidelity.default_confidence_choice, ) return ( @@ -78,7 +80,9 @@ def _tokenize( budgets: torch.Tensor, configs: torch.Tensor, ) -> torch.Tensor: - return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1) + return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1).to( + FTPFN_DTYPE + ) def _encode_for_ftpfn( @@ -147,7 +151,6 @@ def 
_encode_for_ftpfn( device=device, dtype=torch.float64, ) - ids = ids + 1 # We add one to all ids to make room for the test configurations train_fidelities = torch.tensor( [t.config[space.fidelity_name] for t in selected.values()], device=device, @@ -155,7 +158,9 @@ def _encode_for_ftpfn( ) train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) X = _tokenize( - ids=torch.tensor(ids, device=device), budgets=train_budgets, configs=train_configs + ids=torch.tensor(ids, device=device), + budgets=train_budgets, + configs=train_configs, ).to(dtype) # TODO: Document that it's on the user to ensure these are already all bounded @@ -184,12 +189,12 @@ def _keep_highest_budget_evaluation(x: torch.Tensor) -> torch.Tensor: # Does a lexsort, same as if we sorted by (config_id, budget), where # theyre are sorted according to increasing config_id and then increasing budget. # x[i2] -> sorted by config id and budget - i1 = torch.argsort(x[:, 1]) - i2 = i1[torch.argsort(x[i1][:, 0], stable=True)] + i1 = torch.argsort(x[:, BUDGET_COL]) + i2 = i1[torch.argsort(x[i1][:, ID_COL], stable=True)] sorted_x = x[i2] # Now that it's sorted, we essentially want to count the occurence of each id into counts - _, counts = torch.unique_consecutive(sorted_x[:, 0], return_counts=True) + _, counts = torch.unique_consecutive(sorted_x[:, ID_COL], return_counts=True) # Now we can use these counts to get to the last occurence of each id # The -1 is because we want to index from 0 but sum starts at 1. @@ -237,12 +242,14 @@ def __init__( use_priors: bool = False, sample_default_first: bool = False, sample_default_at_target: bool = False, - # arguments for model surrogate_model_args: dict | None = None, - initial_design_size: int | None = None, + initial_design_size: int | Literal["ndim"] = "ndim", n_acquisition_new_configs: int = 1_000, device: torch.device | None = None, - **kwargs: Any, # TODO: Remove this + budget: int | float | None = None, # TODO: Remove + loss_value_on_error: float | None = None, # TODO: Remove + cost_value_on_error: float | None = None, # TODO: Remove + ignore_errors: bool = False, # TODO: Remove ): """Initialise. @@ -272,7 +279,7 @@ def __init__( self.sample_default_first = sample_default_first self.sample_default_at_target = sample_default_at_target self.device = device - self.n_initial_design = initial_design_size + self.n_initial_design: int | Literal["ndim"] = initial_design_size self.n_acquisition_new_configs = n_acquisition_new_configs self.surrogate_model_args = surrogate_model_args or {} @@ -325,9 +332,7 @@ def ask( sampler="sobol" if self._prior is None else self._prior, seed=seed, sample_fidelity="min", - sample_size=( - "ndim" if self.n_initial_design is None else self.n_initial_design - ), + sample_size=self.n_initial_design, ) if new_id < len(self._initial_design): @@ -347,7 +352,7 @@ def ask( device=self.device, ) # PFN uses `0` id for test configurations, we remove this later - x_train[:, 1] = x_train[:, 1] + 1 + x_train[:, ID_COL] = x_train[:, ID_COL] + 1 # Fantasize the result of pending trials is_pending = maximize_ys.isnan() @@ -391,23 +396,27 @@ def ask( # Construct all our samples for acqusition: # 1. Take all non-pending configs - acq_continue_existing = x_train[~is_pending].clone().detach() + acq_existing = x_train[~is_pending].clone().detach() # 2. We only want to include the configuration at their highest # budget evaluated, i.e. 
don't include config_0_0 if config_0_1 is highest - acq_continue_existing = _keep_highest_budget_evaluation(acq_continue_existing) + acq_existing = _keep_highest_budget_evaluation(acq_existing) # 3. Sub select all that are not fully evaluated - acq_continue_existing = acq_continue_existing[acq_continue_existing[:, 1] < 1] + acq_existing = acq_existing[ + acq_existing[:, BUDGET_COL] < self._budget_domain.upper + ] # 4. Add in the new sampled configurations - acq_samples = torch.vstack([acq_continue_existing, acq_new]) + acq_samples = torch.vstack([acq_existing, acq_new]) # 5. Add on the horizon to the budget - unclamped_budgets = acq_samples[:, 1] + horizon + unclamped_budgets = acq_samples[:, BUDGET_COL] + horizon # 6. Clamp to the maximum of the budget domain - acq_samples[:, 1] = torch.clamp(unclamped_budgets, max=self._budget_domain.upper) + acq_samples[:, BUDGET_COL] = torch.clamp( + unclamped_budgets, max=self._budget_domain.upper + ) # Now get the PI of these samples according to MFPI_Random maximize_best_y = maximize_ys.max().item() @@ -425,7 +434,7 @@ def ask( # Extract out the row which had the best PI best_ix = acq_scores.argmax() - best_id = int(acq_samples[best_ix, 0].round().item()) + best_id = int(acq_samples[best_ix, ID_COL].round().item()) best_vector = acq_samples[best_ix, 2:].unsqueeze(0) best_config = self._ftpfn_encoder.unpack(best_vector)[0] From da34471614323748a5e32ed22f8e5d38f2a3d4da Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 15:29:19 +0200 Subject: [PATCH 51/63] refactor: Remove PFNSurrogate --- neps/optimizers/multi_fidelity/mf_bo.py | 99 ------------------------- 1 file changed, 99 deletions(-) diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index e6205d00..8cb31ceb 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -15,7 +15,6 @@ calc_total_resources_spent, update_fidelity, ) -from neps.utils.common import instance_from_map class MFBOBase: @@ -182,101 +181,3 @@ def sample_new_config( ignore_fidelity=True, ) return config - - -class PFNSurrogate: - """Special class to deal with PFN surrogate model and freeze-thaw acquisition.""" - - def __init__( - self, - pipeline_space: SearchSpace, - surrogate_model: str = "ftpfn", - surrogate_model_args: dict | None = None, - step_size: int = 1, - ): - self.train_x = None - self.train_y = None - self.observed_configs: MFObservedData | None = None - self.pipeline_space = pipeline_space - self.surrogate_model_name = surrogate_model - self.surrogate_model_args = ( - surrogate_model_args if surrogate_model_args is not None else {} - ) - - # TODO: Lift this into the responsility of the caller of this function. 
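Returning to `_keep_highest_budget_evaluation` defined earlier in this patch series: a small self-contained sketch of the lexsort-then-take-last trick it relies on, with made-up (config_id, budget) rows:

import torch

# Rows are (config_id, budget); we want one row per id, at its highest budget.
x = torch.tensor([
    [1.0, 0.25],
    [1.0, 0.50],
    [2.0, 0.25],
    [2.0, 0.75],
    [3.0, 0.25],
])
ID_COL, BUDGET_COL = 0, 1

# Lexsort: sort by budget first, then stably by id, so budgets ascend within each id.
i1 = torch.argsort(x[:, BUDGET_COL])
i2 = i1[torch.argsort(x[i1][:, ID_COL], stable=True)]
sorted_x = x[i2]

# Count consecutive occurrences of each id; the cumulative sum minus one indexes
# the last (highest-budget) row of each id group.
_, counts = torch.unique_consecutive(sorted_x[:, ID_COL], return_counts=True)
keep = torch.cumsum(counts, dim=0) - 1
print(sorted_x[keep])   # rows (1, 0.50), (2, 0.75), (3, 0.25)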
- self.surrogate_model = FTPFNSurrogate(**surrogate_model_args) - self.step_size = step_size - - def update_model(self) -> None: - # tokenize the observations - idxs, steps, configs, performance = get_training_data_for_freeze_thaw( - self.observed_configs.df.loc[self.observed_configs.completed_runs_index], - self.observed_configs.config_col, - self.observed_configs.perf_col, - self.pipeline_space, - step_size=self.step_size, - maximize=True, # inverts performance since NePS minimizes - ) - df_idxs = torch.Tensor(idxs) - df_x = torch.Tensor(get_tokenized_data(configs)) - df_steps = torch.Tensor(steps) - train_x = torch.hstack( - [ - df_idxs.reshape(df_steps.shape[0], 1), - df_steps.reshape(df_steps.shape[0], 1), - df_x, - ] - ) - train_y = torch.Tensor(performance) - - # fit the model, on only completed runs - self._fit(train_x, train_y) - - # fantasize pending evaluations - if self.observed_configs.pending_condition.any(): - # tokenize the pending observations - _idxs, _steps, _configs, _ = get_training_data_for_freeze_thaw( - self.observed_configs.df.loc[self.observed_configs.pending_runs_index], - self.observed_configs.config_col, - self.observed_configs.perf_col, - self.pipeline_space, - step_size=self.step_size, - maximize=True, # inverts performance since NePS minimizes - ) - _df_x = torch.Tensor(get_tokenized_data(_configs)) - _df_idxs = torch.Tensor(_idxs) - _df_steps = torch.Tensor(_steps) - _test_x = torch.hstack( - [ - _df_idxs.reshape(_df_idxs.shape[0], 1), - _df_steps.reshape(_df_steps.shape[0], 1), - _df_x, - ] - ) - _performances = self._predict(_test_x) # returns maximizing metric - # update the training data - train_x = torch.vstack([train_x, _test_x]) - train_y = torch.hstack([train_y, _performances]) - # refit the model, on completed runs + fantasized pending runs - self._fit(train_x, train_y) - - def _fit(self, train_x: torch.Tensor, train_y: torch.Tensor) -> None: - # no training required,, only preprocessing the training data as context during inference - assert self.surrogate_model is not None, "Surrogate model not set!" - self.surrogate_model.train_x = train_x - self.surrogate_model.train_y = train_y - - def _predict(self, test_x: torch.Tensor) -> torch.Tensor: - assert ( - self.surrogate_model.train_x is not None - and self.surrogate_model.train_y is not None - ), "Model not trained yet!" - return self.surrogate_model.get_mean_performance(test_x) - - def set_state( - self, - pipeline_space, - surrogate_model_args, - **kwargs, # pylint: disable=unused-argument - ): - self.pipeline_space = pipeline_space From 268dfb6feb37393f80d32eebd4574824d57191be Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 15:36:40 +0200 Subject: [PATCH 52/63] fix(ifbo): handle all trials which contain model-able info --- neps/optimizers/multi_fidelity/ifbo.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index cf5f27f5..569e0ccf 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -130,18 +130,12 @@ def _encode_for_ftpfn( Returns: The encoded trials and their corresponding **scores** """ - # TODO: Currently we do not handle error cases, we can't use NaN as that - # is what we use for trials that have no loss yet, i.e. pending trials. 
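To make the `| config_id | budget | hp_1 | ... | hp_n |` row layout described in the `_encode_for_ftpfn` docstring concrete, here is a tiny hypothetical illustration, treating budget normalization as simply fidelity / max_budget for brevity:

import torch

# One trial: config id 3, fidelity 10 of a maximum of 100 (-> normalized budget 0.10),
# and two hyperparameters already encoded into [0, 1].
ids = torch.tensor([3.0])
budgets = torch.tensor([10 / 100])
configs = torch.tensor([[0.25, 0.70]])

row = torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1)
print(row)   # tensor([[3.0000, 0.1000, 0.2500, 0.7000]])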
+ # Select all trials which have something we can actually use for modelling + # The absence of a report signifies pending selected = { trial_id: trial for trial_id, trial in trials.items() - if trial.state - not in ( - Trial.State.FAILED, - Trial.State.CRASHED, - Trial.State.UNKNOWN, - Trial.State.CORRUPTED, - ) + if trial.report is None or trial.report.loss is not None } assert space.fidelity_name is not None assert space.fidelity is not None From 673530ea8f08ffdb8935f6999c1c591547f28607 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 25 Sep 2024 16:24:08 +0200 Subject: [PATCH 53/63] refactor: Remove unused --- .../bayesian_optimization/kernels/__init__.py | 25 - .../kernels/grakel_replace/__init__.py | 8 - .../kernels/grakel_replace/edge_histogram.py | 128 --- .../kernels/grakel_replace/utils.py | 58 -- .../grakel_replace/vertex_histogram.py | 447 ---------- .../grakel_replace/weisfeiler_lehman.py | 766 ------------------ .../bayesian_optimization/kernels/utils.py | 155 ---- .../kernels/weisfilerlehman.py | 121 --- .../bayesian_optimization/models/__init__.py | 9 +- .../bayesian_optimization/models/gp.py | 2 +- .../bayesian_optimization/optimizer.py | 32 +- .../bayesian_optimization.yaml | 18 +- neps/optimizers/default_searchers/pibo.yaml | 19 +- neps/optimizers/multi_fidelity/mf_bo.py | 7 - .../multi_fidelity/sampling_policy.py | 165 ---- neps/search_spaces/architecture/api.py | 2 +- neps_examples/basic_usage/architecture.py | 5 +- .../architecture_and_hyperparameters.py | 5 + ...rs_for_architecture_and_hyperparameters.py | 2 +- .../experimental/hierarchical_architecture.py | 5 + tests/test_examples.py | 7 + .../testing_scripts/default_neps.py | 34 - 22 files changed, 46 insertions(+), 1974 deletions(-) delete mode 100644 neps/optimizers/bayesian_optimization/kernels/__init__.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/utils.py delete mode 100644 neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py diff --git a/neps/optimizers/bayesian_optimization/kernels/__init__.py b/neps/optimizers/bayesian_optimization/kernels/__init__.py deleted file mode 100644 index 7c7018d0..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from collections.abc import Callable -from functools import partial - -from .vectorial_kernels import HammingKernel, Matern32Kernel, Matern52Kernel, RBFKernel -from .weisfilerlehman import WeisfilerLehman - -StationaryKernelMapping: dict[str, Callable] = { - "m52": Matern52Kernel, - "m32": Matern32Kernel, - "rbf": RBFKernel, - "hm": HammingKernel, -} - -GraphKernelMapping: dict[str, Callable] = { - "wl": partial( - WeisfilerLehman, - h=2, - oa=False, - ), - "vh": partial( - WeisfilerLehman, - h=0, - oa=False, - ), -} diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py deleted file mode 100644 index ac1c60ad..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/__init__.py +++ 
/dev/null @@ -1,8 +0,0 @@ -from neps.optimizers.bayesian_optimization.kernels.grakel_replace.vertex_histogram import ( - VertexHistogram, -) -from neps.optimizers.bayesian_optimization.kernels.grakel_replace.weisfeiler_lehman import ( - WeisfeilerLehman, -) - -__all__ = ["VertexHistogram", "WeisfeilerLehman"] diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py deleted file mode 100644 index 12a83a19..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/edge_histogram.py +++ /dev/null @@ -1,128 +0,0 @@ -"""The Edge Histogram kernel as defined in :cite:`sugiyama2015halting`.""" - -from __future__ import annotations - -from collections import Counter -from collections.abc import Iterable -from warnings import warn - -from grakel.graph import Graph -from numpy import zeros -from scipy.sparse import csr_matrix - -from .vertex_histogram import VertexHistogram - - -class EdgeHistogram(VertexHistogram): - """Edge Histogram kernel as found in :cite:`sugiyama2015halting`. - - Parameters - ---------- - sparse : bool, or 'auto', default='auto' - Defines if the data will be stored in a sparse format. - Sparse format is slower, but less memory consuming and in some cases the only solution. - If 'auto', uses a sparse matrix when the number of zeros is more than the half of the matrix size. - In all cases if the dense matrix doesn't fit system memory, I sparse approach will be tried. - - Attributes: - ---------- - None. - - """ - - def parse_input(self, X: Iterable, **kwargs): - """Parse and check the given input for EH kernel. - - Parameters - ---------- - X : iterable - For the input to pass the test, we must have: - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). - - Returns: - ------- - out : np.array, shape=(len(X), n_labels) - A np array for frequency (cols) histograms for all Graphs (rows). 
- - """ - if not isinstance(X, Iterable): - raise TypeError("input must be an iterable\n") - rows, cols, data = [], [], [] - if self._method_calling in [1, 2]: - labels = {} - self._labels = labels - elif self._method_calling == 3: - labels = dict(self._labels) - ni = 0 - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 3]: - if len(x) == 0: - warn("Ignoring empty element on index: " + str(i)) - continue - # Our element is an iterable of at least 2 elements - L = x[2] - elif isinstance(x, Graph): - # get labels in any existing format - L = x.get_labels(purpose="any", label_type="edge") - else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) - - if L is None: - raise ValueError("Invalid graph entry at location " + str(i) + "!") - # construct the data input for the numpy array - for label, frequency in Counter(L.values()).items(): - # for the row that corresponds to that graph - rows.append(ni) - - # and to the value that this label is indexed - col_idx = labels.get(label, None) - if col_idx is None: - # if not indexed, add the new index (the next) - col_idx = len(labels) - labels[label] = col_idx - - # designate the certain column information - cols.append(col_idx) - - # as well as the frequency value to data - data.append(frequency) - ni += 1 - - # Initialise the feature matrix - if self._method_calling in [1, 2]: - if self.sparse == "auto": - self.sparse_ = len(cols) / float(ni * len(labels)) <= 0.5 - else: - self.sparse_ = bool(self.sparse) - - if self.sparse_: - features = csr_matrix( - (data, (rows, cols)), shape=(ni, len(labels)), copy=False - ) - else: - # Initialise the feature matrix - try: - features = zeros(shape=(ni, len(labels))) - features[rows, cols] = data - except MemoryError: - warn("memory-error: switching to sparse") - self.sparse_, features = ( - True, - csr_matrix((data, (rows, cols)), shape=(ni, len(labels)), copy=False), - ) - - if ni == 0: - raise ValueError("parsed input is empty") - return features diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py deleted file mode 100644 index fe8f8d06..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/utils.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -import torch - - -def calculate_kernel_matrix_as_tensor( - X, Y=None, oa=False, se_kernel=None, normalize=True -) -> torch.Tensor: - """Same as calculate kernel matrix, but in pytorch framework and uses autodiff to compute the gradient of - the kernel function with respect to the feature vector. - - This function is taken out of the class to facilitate derivative computation. - - One difference is that to prevent the un-differentiable point at the min operation if optimal assignment - kernel is used, we replace the hard-min with a soft-min differentiable approximation that uses the x-norm - approximation. - - Parameters - ---------- - X, Y: the feature vectors (X: train, Y: test). When Y is not supplied, the kernel matrix is computed with - respect to itself. - - oa: bool: whether the optimal assignment kernel should be used. - - se_kernel: Defines any successive embedding kernel to be applied over the inner produce of X and Y. 
If none, - a simple - - normalize: bool: Whether to normalize the GP covariance matrix to the range of [0, 1]. Default is True. - - Returns: - ------- - K: pytorch tensor, shape = [n_targets, n_inputs] - dK_dY: pytorch tensor, of the same shape of K. The derivative of the value of the kernel function with - respect to each of the X. If Y is None, the derivative is instead taken at the *training point* (i.e. X). - """ - if Y is None: - K = se_kernel.forward(X, X) if se_kernel is not None else X @ X.t() - if normalize: - K_diag = torch.sqrt(torch.diag(K)) - K_diag_outer = torch.outer(K_diag, K_diag) - return K / K_diag_outer - else: - assert Y.shape[1] == X.shape[1], ( - "got Y shape " + str(Y.shape[1]) + " but X shape " + str(X.shape[1]) - ) - K = se_kernel.forward(X, Y) if se_kernel is not None else Y @ X.t() - if normalize: - Kxx = calculate_kernel_matrix_as_tensor( - X, X, oa=oa, se_kernel=se_kernel, normalize=False - ) - Kyy = calculate_kernel_matrix_as_tensor( - Y, Y, oa=oa, se_kernel=se_kernel, normalize=False - ) - K_diag_outer = torch.outer( - torch.sqrt(torch.diag(Kyy)), torch.sqrt(torch.diag(Kxx)) - ) - return K / K_diag_outer - return K diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py deleted file mode 100644 index a3a31bdf..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/vertex_histogram.py +++ /dev/null @@ -1,447 +0,0 @@ -"""The vertex kernel as defined in :cite:`sugiyama2015halting`.""" - -from __future__ import annotations - -import logging -from collections import Counter -from collections.abc import Iterable -from typing import TYPE_CHECKING -from warnings import warn - -import numpy as np -import torch -from grakel.graph import Graph -from grakel.kernels import Kernel -from numpy import array, einsum, squeeze, zeros -from scipy.sparse import csr_matrix -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted - -if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( - NumericKernel, - ) - - -class VertexHistogram(Kernel): - """Vertex Histogram kernel as found in :cite:`sugiyama2015halting`. - - Parameters - ---------- - sparse : bool, or 'auto', default='auto' - Defines if the data will be stored in a sparse format. - Sparse format is slower, but less memory consuming and in some cases the only solution. - If 'auto', uses a sparse matrix when the number of zeros is more than the half of the matrix size. - In all cases if the dense matrix doesn't fit system memory, I sparse approach will be tried. - - oa: bool: default=True - Defines whether optimal assignment variant of the kernel should be used. - - se_kernel: default=None - The standard vectorial kernel to be used for successive embedding (i.e. after the transformation from graph - to the vector embedding, whether to use an additional kernel to compute the vector similarity. - - se_kernel_params: dict, default=None - Any parameters to be passed to the se_kernel - - mahalanobis_precision: np.array: - If supplied, the Malahanobis distance with the precision matrix as supplied will be computed in the dot - product, instead of the vanilla dot product. - - Attributes: - ---------- - None. 
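The normalization in `calculate_kernel_matrix_as_tensor` above divides the Gram matrix by the outer product of the square roots of its diagonal, so every diagonal entry becomes exactly 1 (and, for the non-negative histogram features used here, every entry lands in [0, 1]). A minimal sketch with made-up feature vectors:

import torch

X = torch.tensor([[1.0, 0.0], [1.0, 1.0], [0.0, 2.0]])  # made-up feature vectors

K = X @ X.t()                          # raw linear-kernel Gram matrix
K_diag = torch.sqrt(torch.diag(K))     # per-row feature norms
K_norm = K / torch.outer(K_diag, K_diag)

# Each entry is now <x_i, x_j> / (|x_i| * |x_j|), so the diagonal is exactly 1.
print(K_norm)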
- - """ - - def __init__( - self, - n_jobs=None, - normalize=False, - sparse="auto", - oa=False, - mahalanobis_precision=None, - se_kernel: NumericKernel | None = None, - requires_ordered_features: bool = False, - as_tensor: bool = True, - ): - """Initialise a vertex histogram kernel. - - require_ordered_features: bool - Whether the ordering of the features in the feature matrix matters. - If True, the features will be parsed in the same order as the WL - node label. - - Note that if called directly (not from Weisfiler Lehman kernel), turning - this option on could break the code, as the label in general is non-int. - - """ - super().__init__(n_jobs=n_jobs, normalize=normalize) - self.as_tensor = as_tensor - if self.as_tensor: - self.sparse = False - else: - self.sparse = sparse - - self.oa = oa - self.se_kernel = se_kernel - self._initialized.update({"sparse": True}) - self.mahalanobis_precision = mahalanobis_precision - self.require_ordered_features = requires_ordered_features - - self._X_diag = None - self.X_tensor = None - self.Y_tensor = None - - self._labels = None - self.sparse_ = None - self._method_calling = None - self._Y = None - self._is_transformed = None - self.X = None - - def initialize(self): - """Initialize all transformer arguments, needing initialization.""" - if not self._initialized["n_jobs"]: - if self.n_jobs is not None: - warn("no implemented parallelization for VertexHistogram") - self._initialized["n_jobs"] = True - if not self._initialized["sparse"]: - if self.sparse not in ["auto", False, True]: - raise TypeError("sparse could be False, True or auto") - self._initialized["sparse"] = True - - def parse_input(self, X, label_start_idx=0, label_end_idx=None): - """Parse and check the given input for VH kernel. - - Parameters - ---------- - X : iterable - For the input to pass the test, we must have: - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). - - - - Returns: - ------- - out : np.array, shape=(len(X), n_labels) - A np.array for frequency (cols) histograms for all Graphs (rows). - - """ - if self.require_ordered_features: - if label_start_idx is None or label_end_idx is None: - raise ValueError( - "When requires_ordered_features flag is True, you must supply the start and end" - "indices of the feature matrix to have consistent feature dimensions!" - ) - assert ( - label_end_idx > label_start_idx - ), "End index must be larger than the start index!" 
- - if not isinstance(X, Iterable): - raise TypeError("input must be an iterable\n") - rows, cols, data = [], [], [] - if self._method_calling in [0, 1, 2]: - labels = {} - self._labels = labels - elif self._method_calling == 3: - labels = dict(self._labels) - ni = 0 - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warn("Ignoring empty element on index: " + str(i)) - continue - # Our element is an iterable of at least 2 elements - L = x[1] - elif isinstance(x, Graph): - # get labels in any existing format - L = x.get_labels(purpose="any") - else: - raise TypeError( - "each element of X must be either a " - "graph object or a list with at least " - "a graph like object and node labels " - "dict \n" - ) - - # construct the data input for the numpy array - for label, frequency in Counter(L.values()).items(): - # for the row that corresponds to that graph - rows.append(ni) - - # and to the value that this label is indexed - if self.require_ordered_features: - try: - col_idx = int(label) - label_start_idx # Offset - except ValueError: - logging.error( - "Failed to convert label to a valid integer. Check whether all labels are" - "numeric, and whether you called this kernel directly instead of from the" - "Weisfiler-Lehman kernel. Falling back to the default unordered feature" - "matrix." - ) - self.require_ordered_features = False - if not self.require_ordered_features: - col_idx = labels.get(label, None) - if col_idx is None: - # if not indexed, add the new index (the next) - col_idx = len(labels) - labels[label] = col_idx - - # designate the certain column information - cols.append(col_idx) - - # as well as the frequency value to data - data.append(frequency) - ni += 1 - - if self.require_ordered_features: - label_length = max(label_end_idx - label_start_idx, *cols) + 1 - else: - label_length = len(labels) - - if self._method_calling in [0, 1, 2]: - if self.sparse == "auto": - self.sparse_ = len(cols) / float(ni * label_length) <= 0.5 - else: - self.sparse_ = bool(self.sparse) - - if self.sparse_: - features = csr_matrix( - (data, (rows, cols)), shape=(ni, label_length), copy=False - ) - else: - # Initialise the feature matrix - try: - features = zeros(shape=(ni, label_length)) - features[rows, cols] = data - - except MemoryError: - warn("memory-error: switching to sparse") - self.sparse_, features = ( - True, - csr_matrix( - (data, (rows, cols)), shape=(ni, label_length), copy=False - ), - ) - - if ni == 0: - raise ValueError("parsed input is empty") - return features - - def _calculate_kernel_matrix(self, Y=None): - """Calculate the kernel matrix given a target_graph and a kernel. - - Each a matrix is calculated between all elements of Y on the rows and - all elements of X on the columns. - - Parameters - ---------- - Y : np.array, default=None - The array between samples and features. - - Returns: - ------- - K : numpy array, shape = [n_targets, n_inputs] - The kernel matrix: a calculation between all pairs of graphs - between targets and inputs. If Y is None targets and inputs - are the taken from self.X. Otherwise Y corresponds to targets - and self.X to inputs. 
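To illustrate what `parse_input` produces and how the optimal-assignment branch of `_calculate_kernel_matrix` consumes it, here is a hypothetical two-graph example: the features are node-label counts, the plain kernel is their dot product, and the OA variant is the histogram intersection sum(min(., .)):

import numpy as np
from collections import Counter

# Hypothetical node labels of two small graphs.
graphs = [
    {0: "A", 1: "A", 2: "B"},
    {0: "A", 1: "B", 2: "B", 3: "C"},
]

# Map each distinct label to a column, then fill per-graph frequency histograms.
histograms = [Counter(g.values()) for g in graphs]
columns = {label: j for j, label in enumerate(sorted({l for h in histograms for l in h}))}

features = np.zeros((len(graphs), len(columns)))
for i, hist in enumerate(histograms):
    for label, freq in hist.items():
        features[i, columns[label]] = freq
# features == [[2, 1, 0],
#              [1, 2, 1]]   with columns A, B, C

linear_k = features @ features.T                      # plain dot-product kernel
oa_k_12 = np.minimum(features[0], features[1]).sum()  # histogram intersection: 1 + 1 + 0 = 2
print(linear_k, oa_k_12)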
- - """ - if Y is None: - if self.oa: - K = np.zeros((self.X.shape[0], self.X.shape[0])) - for i in range(self.X.shape[0]): - for j in range(i, self.X.shape[0]): - K[i, j] = np.sum(np.minimum(self.X[i, :], self.X[j, :])) - K[j, i] = K[i, j] - elif self.se_kernel is not None: - K = self.se_kernel._forward(self.X, self.X) - else: - K = self.X @ self.X.T - elif self.oa: - K = np.zeros((Y.shape[0], self.X.shape[0])) - for i in range(Y.shape[0]): - for j in range(self.X.shape[0]): - K[i, j] = np.sum(np.minimum(self.X[j, :], Y[i, : self.X.shape[1]])) - elif self.se_kernel is not None: - K = self.se_kernel._forward(self.X, Y) - else: - K = Y[:, : self.X.shape[1]] @ self.X.T - - if self.sparse_: - return K.toarray() - return K - - def diagonal(self, use_tensor=False): - """Calculate the kernel matrix diagonal of the fitted data. - - Parameters - ---------- - None. - - Returns: - ------- - X_diag : np.array - The diagonal of the kernel matrix, of the fitted. This consists - of each element calculated with itself. - - use_tensor: bool: - The flag to use whether return tensor instead of numpy array. All other operations are the same - - """ - # Check is fit had been called - check_is_fitted(self, ["X", "sparse_"]) - try: - check_is_fitted(self, ["_X_diag"]) - except NotFittedError: - # Calculate diagonal of X - if use_tensor: - self._X_diag = torch.einsum("ij,ij->i", [self.X_tensor, self.X_tensor]) - elif self.sparse_: - self._X_diag = squeeze(array(self.X.multiply(self.X).sum(axis=1))) - else: - self._X_diag = einsum("ij,ij->i", self.X, self.X) - try: - check_is_fitted(self, ["_Y"]) - if use_tensor: - Y_diag = torch.einsum("ij, ij->i", [self.Y_tensor, self.Y_tensor]) - return self._X_diag, Y_diag - if self.sparse_: - Y_diag = squeeze(array(self._Y.multiply(self._Y).sum(axis=1))) - else: - Y_diag = einsum("ij,ij->i", self._Y, self._Y) - return self._X_diag, Y_diag - except NotFittedError: - return self._X_diag - - def transform(self, X, return_embedding_only=False, **kwargs): - """Calculate the kernel matrix, between given and fitted dataset. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). If None the kernel matrix is calculated upon fit data. - The test samples. - - return_embedding_only: bool - Whether returns the vector embedding of the kernel only (without actually - computing the kernel function). This is used when computing the derivative - of the kernel w.r.t. 
the test points/ - - Returns: - ------- - K : numpy array, shape = [n_targets, n_input_graphs] - corresponding to the kernel matrix, a calculation between - all pairs of graphs between target an features - - """ - self._method_calling = 3 - # Check is fit had been called - check_is_fitted(self, ["X"]) - - # Input validation and parsing - if X is None: - raise ValueError("`transform` input cannot be None") - Y = self.parse_input(X, **kwargs) - if return_embedding_only: - return Y - - self._Y = Y - self._is_transformed = True - - # Transform - calculate kernel matrix - km = self._calculate_kernel_matrix(Y) - # Self transform must appear before the diagonal call on normilization - if self.normalize: - X_diag, Y_diag = self.diagonal() - km /= np.sqrt(np.outer(Y_diag, X_diag)) - if self.as_tensor: - km = torch.tensor(km) - return km - - def fit_transform(self, X, **kwargs): - """Fit and transform, on the same dataset. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). If None the kernel matrix is calculated upon fit data. - The test samples. - - y : None - There is no need of a target in a transformer, yet the pipeline API - requires this parameter. - - Returns: - ------- - K : numpy array, shape = [n_targets, n_input_graphs] - corresponding to the kernel matrix, a calculation between - all pairs of graphs between target an features - - """ - self._method_calling = 2 - self.fit(X, **kwargs) - - # Transform - calculate kernel matrix - km = self._calculate_kernel_matrix() - - self._X_diag = np.diagonal(km) - if self.normalize: - km = km / np.sqrt(np.outer(self._X_diag, self._X_diag)) - if self.as_tensor: - km = torch.tensor(km) - return km - - def fit(self, X, y=None, **kwargs): - """Fit a dataset, for a transformer. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). The train samples. - - y : None - There is no need of a target in a transformer, yet the pipeline API - requires this parameter. - - Returns: - ------- - self : object - Returns self. 
- - """ - self._is_transformed = False - self._method_calling = 1 - - # Parameter initialization - self.initialize() - - # Input validation and parsing - if X is None: - raise ValueError("`fit` input cannot be None") - self.X = self.parse_input(X, **kwargs) - - # Return the transformer - return self diff --git a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py b/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py deleted file mode 100644 index be35c02a..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/grakel_replace/weisfeiler_lehman.py +++ /dev/null @@ -1,766 +0,0 @@ -"""The weisfeiler lehman kernel :cite:`shervashidze2011weisfeiler`.""" - -from __future__ import annotations - -import collections -import collections.abc -import logging -import warnings -from ast import literal_eval -from collections import OrderedDict -from collections.abc import Iterable -from copy import deepcopy - -import numpy as np -import torch -from grakel.graph import Graph -from grakel.kernels import Kernel -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted - -from .vertex_histogram import VertexHistogram - - -class WeisfeilerLehman(Kernel): - """Compute the Weisfeiler Lehman Kernel. - - See :cite:`shervashidze2011weisfeiler`. - - Parameters - ---------- - h : int, default=5 - The number of iterations. - - base_graph_kernel : `grakel.kernel_operators.Kernel` or tuple, default=None - If tuple it must consist of a valid kernel object and a - dictionary of parameters. General parameters concerning - normalization, concurrency, .. will be ignored, and the - ones of given on `__init__` will be passed in case it is needed. - Default `base_graph_kernel` is `VertexHistogram`. - - node_weights: iterable - If not None, the nodes will be assigned different weights according - to this vector. Must be a dictionary with the following format: - {'node_name1': weight1, 'node_name2': weight2 ... } - Must be of the same length as the number of different node attributes - - Attributes: - ---------- - X : dict - Holds a dictionary of fitted subkernel modules for all levels. - - _nx : number - Holds the number of inputs. - - _h : int - Holds the number, of iterations. - - _base_graph_kernel : function - A void function that initializes a base kernel object. - - _inv_labels : dict - An inverse dictionary, used for relabeling on each iteration. - - """ - - _graph_format = "dictionary" - - def __init__( - self, - n_jobs=None, - normalize: bool = False, - h: int = 5, - base_graph_kernel=VertexHistogram, - node_weights=None, - layer_weights=None, - as_tensor: bool = True, - ): - """Initialise a `weisfeiler_lehman` kernel.""" - super().__init__(n_jobs=n_jobs, normalize=normalize) - - self.h = h - self.base_graph_kernel = base_graph_kernel - self._initialized.update( - {"h": False, "base_graph_kernel": False, "layer_weights": False} - ) - self._base_graph_kernel = None - self.weights = None - self.node_weights = node_weights - self.as_tensor = as_tensor - self.layer_weights = layer_weights # The weights of each layer. 
If None, each WL iteration has same weight - self.feature_dims = [ - 0, - ] # Record the dimensions of the vectors of each WL iteration - self._params = None - self._h = None - self._nx = None - self._inv_labels = None - self._inv_label_node_attr = None - self._label_node_attr = None - self._feature_weight = None - self._method_calling = None - self._is_transformed = None - self.X = None - self._X_diag = None - - self.X_fit = {} - self.K_precomputed = {} - self.base_graph_kernel_precomputed = {} - - def initialize(self): - """Initialize all transformer arguments, needing initialization.""" - super().initialize() - if not self._initialized["base_graph_kernel"]: - base_graph_kernel = self.base_graph_kernel - if base_graph_kernel is None: - base_graph_kernel, params = VertexHistogram, {} - # TODO: make sure we're always passing like this - elif type(base_graph_kernel) is type and issubclass( # pylint: disable=C0123 - base_graph_kernel, Kernel - ): - params = {} - else: - try: - base_graph_kernel, params = base_graph_kernel - except Exception as _error: - raise TypeError( - "Base kernel was not formulated in " - "the correct way. " - "Check documentation." - ) from _error - - if not ( - type(base_graph_kernel) is type # pylint: disable=C0123 - and issubclass(base_graph_kernel, Kernel) - ): - raise TypeError( - "The first argument must be a valid " - "grakel.kernel.kernel Object" - ) - if not isinstance(params, dict): - raise ValueError( - "If the second argument of base " - "kernel exists, it must be a diction" - "ary between parameters names and " - "values" - ) - params.pop("normalize", None) - - params["normalize"] = False - params["n_jobs"] = None - self._base_graph_kernel = base_graph_kernel - self._params = params - self._initialized["base_graph_kernel"] = True - - if not self._initialized["h"]: - if not isinstance(self.h, int) or self.h < 0: - raise TypeError( - "'h' must be a non-negative integer. Got h:" + str(self.h) - ) - self._h = self.h + 1 - self._initialized["h"] = True - - if self.layer_weights is None or self.layer_weights.shape[0] != self._h: - self.layer_weights = np.ones((self._h,)) - if self.as_tensor and not isinstance(self.layer_weights, torch.Tensor): - self.layer_weights = torch.tensor(self.layer_weights) - - self._initialized["h"] = True - self._initialized["layer_weights"] = True - - def change_se_kernel(self, se_kernel): - if self._base_graph_kernel is None: - self.initialize() - self._params["se_kernel"] = se_kernel - logging.info("Base kernel changed") - - def parse_input( - self, X: Iterable, return_embedding_only: bool = False, gp_fit: bool = True - ): - """Parse input for weisfeiler lehman. - - Parameters - ---------- - X : iterable - For the input to pass the test, we must have: - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that correspond to the given - graph format). A valid input also consists of graph type objects. - - return_embedding_only: bool - Whether to return the embedding of the graphs only, instead of computing the kernel all - the way to the end. - - gp_fit: bool - If False use precomputed vals for first N values, else compute them and save them - - Returns: - ------- - base_graph_kernel : object - Returns base_graph_kernel. - - if requires_grad is enabled and we call fit_transform or transform, an additional torch tensor - K_grad is returned as well. 
- - """ - if self._method_calling not in [1, 2]: - raise ValueError( - "method call must be called either from fit " + "or fit-transform" - ) - if hasattr(self, "_X_diag"): - # Clean _X_diag value - delattr(self, "_X_diag") - - # skip kernel computation if we have already computed the corresponding kernel - if self._h in self.K_precomputed and self.X_fit[self._h] == X: - K = self.K_precomputed[self._h] - base_graph_kernel = self.base_graph_kernel_precomputed[self._h] - else: - # Input validation and parsing - if not isinstance(X, collections.abc.Iterable): - raise TypeError("input must be an iterable\n") - nx = 0 - Gs_ed, L, distinct_values, extras = {}, {}, set(), {} - for idx, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and (len(x) == 0 or len(x) >= 2): - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(idx)) - continue - if len(x) > 2: - extra = () - if len(x) > 3: - extra = tuple(x[3:]) - x = Graph(x[0], x[1], x[2], graph_format=self._graph_format) - extra = ( - x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, - ), - *extra, - ) - else: - x = Graph(x[0], x[1], {}, graph_format=self._graph_format) - extra = () - - elif isinstance(x, Graph): - x.desired_format(self._graph_format) - el = x.get_labels( - purpose=self._graph_format, - label_type="edge", - return_none=True, - ) - extra = () if el is None else (el,) - - else: - raise TypeError( - "each element of X must be either a " - + "graph object or a list with at least " - + "a graph like object and node labels " - + "dict \n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") - extras[nx] = extra - distinct_values |= set(L[nx].values()) - nx += 1 - if nx == 0: - raise ValueError("parsed input is empty") - - # Save the number of "fitted" graphs. 
- self._nx = nx - WL_labels_inverse = OrderedDict() - - # assign a number to each label - label_count = 0 - for dv in sorted(distinct_values): - WL_labels_inverse[dv] = label_count - label_count += 1 - - # Initalize an inverse dictionary of labels for all iterations - self._inv_labels = ( - OrderedDict() - ) # Inverse dictionary of labels, in term of the *previous layer* - self._inv_labels[0] = deepcopy(WL_labels_inverse) - self.feature_dims.append( - len(WL_labels_inverse) - ) # Update the zeroth iteration feature dim - - self._inv_label_node_attr = ( - OrderedDict() - ) # Inverse dictionary of labels, in term of the *node attribute* - self._label_node_attr = ( - OrderedDict() - ) # Same as above, but with key and value inverted - self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label( - WL_labels_inverse, 0 - ) - - if self.node_weights is not None: - self._feature_weight = OrderedDict() - # Ensure the order is the same - self._feature_weight[0] = self._compute_feature_weight( - self.node_weights, 0, WL_labels_inverse - )[1] - else: - self._feature_weight = None - - def generate_graphs(label_count: int, WL_labels_inverse): - new_graphs = [] - for j in range(self._nx): - new_labels = {} - for k in L[j]: - new_labels[k] = WL_labels_inverse[L[j][k]] - L[j] = new_labels - # add new labels - new_graphs.append((Gs_ed[j], new_labels) + extras[j]) - yield new_graphs - - for i in range(1, self._h): - label_set, WL_labels_inverse, L_temp = set(), {}, {} - for j in range(nx): - # Find unique labels and sort - # them for both graphs - # Keep for each node the temporary - L_temp[j] = {} - for v in Gs_ed[j]: - credential = ( - str(L[j][v]) - + "," - + str(sorted(L[j][n] for n in Gs_ed[j][v])) - ) - L_temp[j][v] = credential - label_set.add(credential) - - label_list = sorted(label_set) - for dv in label_list: - WL_labels_inverse[dv] = label_count - label_count += 1 - - # Recalculate labels - new_graphs = [] - for j in range(nx): - new_labels = {} - for k in L_temp[j]: - new_labels[k] = WL_labels_inverse[L_temp[j][k]] - L[j] = new_labels - # relabel - new_graphs.append((Gs_ed[j], new_labels) + extras[j]) - self._inv_labels[i] = WL_labels_inverse - # Compute the translated inverse node label - ( - self._label_node_attr[i], - self._inv_label_node_attr[i], - ) = self.translate_label( - WL_labels_inverse, i, self._label_node_attr[i - 1] - ) - self.feature_dims.append( - self.feature_dims[-1] + len(self._label_node_attr[i]) - ) - # Compute the feature weight of the current layer - if self.node_weights is not None: - self._feature_weight[i] = self._compute_feature_weight( - self.node_weights, i, self._inv_label_node_attr[i] - )[1] - # assert len(self._feature_weight[i] == len(WL_labels_inverse)) - yield new_graphs - - # Initialise the base graph kernel. 
- base_graph_kernel = {} - - K = [] - for i, g in enumerate(generate_graphs(label_count, WL_labels_inverse)): - param = self._params - if self._feature_weight is not None: - param.update({"mahalanobis_precision": self._feature_weight[i]}) - base_graph_kernel.update({i: self._base_graph_kernel(**param)}) - if return_embedding_only: - K.append( - base_graph_kernel[i].parse_input( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - ) - elif self._method_calling == 1: - base_graph_kernel[i].fit( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - else: - K.append( - self.layer_weights[i] - * base_graph_kernel[i].fit_transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - ) - - if gp_fit: - self.X_fit[self._h] = X - self.K_precomputed[self._h] = K - self.base_graph_kernel_precomputed[self._h] = base_graph_kernel - - if return_embedding_only: - return K - if self._method_calling == 1: - return base_graph_kernel - if self._method_calling == 2: - if self.as_tensor: - K = torch.stack(K, dim=0).sum(dim=0) - return K, base_graph_kernel - return np.sum(K, axis=0), base_graph_kernel - return None - - def fit_transform(self, X: Iterable, y=None, gp_fit: bool = True): # pylint: disable=unused-argument - """Fit and transform, on the same dataset. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). If None the kernel matrix is calculated upon fit data. - The test samples. - - y : Object, default=None - Ignored argument, added for the pipeline. - - Returns: - ------- - K : numpy array, shape = [n_targets, n_input_graphs] - corresponding to the kernel matrix, a calculation between - all pairs of graphs between target an features - - """ - self._method_calling = 2 - self._is_transformed = False - self.initialize() - self.feature_dims = [ - 0, - ] # Flush the feature dimensions - if X is None: - raise ValueError("transform input cannot be None") - km, self.X = self.parse_input(X, gp_fit=gp_fit) - - return km - - def transform(self, X: Iterable, return_embedding_only: bool = True): - """Calculate the kernel matrix, between given and fitted dataset. - - Parameters - ---------- - X : iterable - Each element must be an iterable with at most three features and at - least one. The first that is obligatory is a valid graph structure - (adjacency matrix or edge_dictionary) while the second is - node_labels and the third edge_labels (that fitting the given graph - format). If None the kernel matrix is calculated upon fit data. - The test samples. - - return_embedding_only: bool - Whether to return the embedding of the graphs only, instead of computing the kernel all - the way to the end. 
- - Returns: - ------- - K : numpy array, shape = [n_targets, n_input_graphs] - corresponding to the kernel matrix, a calculation between - all pairs of graphs between target an features - - """ - self._method_calling = 3 - # Check is fit had been called - check_is_fitted(self, ["X", "_nx", "_inv_labels"]) - - # Input validation and parsing - if X is None: - raise ValueError("transform input cannot be None") - if not isinstance(X, collections.abc.Iterable): - raise ValueError("input must be an iterable\n") - nx = 0 - distinct_values = set() - Gs_ed, L = {}, {} - for i, x in enumerate(iter(X)): - is_iter = isinstance(x, collections.abc.Iterable) - if is_iter: - x = list(x) - if is_iter and len(x) in [0, 2, 3]: - if len(x) == 0: - warnings.warn("Ignoring empty element on index: " + str(i)) - continue - - if len(x) in [2, 3]: - x = Graph(x[0], x[1], {}, self._graph_format) - elif isinstance(x, Graph): - x.desired_format("dictionary") - else: - raise ValueError( - "each element of X must have at " - + "least one and at most 3 elements\n" - ) - Gs_ed[nx] = x.get_edge_dictionary() - L[nx] = x.get_labels(purpose="dictionary") - - # Hold all the distinct values - distinct_values |= {v for v in L[nx].values() if v not in self._inv_labels[0]} - nx += 1 - if nx == 0: - raise ValueError("parsed input is empty") - - nl = len(self._inv_labels[0]) - WL_labels_inverse = { - dv: idx for (idx, dv) in enumerate(sorted(distinct_values), nl) - } - WL_labels_inverse = OrderedDict(WL_labels_inverse) - - def generate_graphs_transform(WL_labels_inverse, nl): - # calculate the kernel matrix for the 0 iteration - new_graphs = [] - for j in range(nx): - new_labels = {} - for k, v in L[j].items(): - if v in self._inv_labels[0]: - new_labels[k] = self._inv_labels[0][v] - else: - new_labels[k] = WL_labels_inverse[v] - L[j] = new_labels - # produce the new graphs - new_graphs.append([Gs_ed[j], new_labels]) - yield new_graphs - - for i in range(1, self._h): - new_graphs = [] - L_temp, label_set = {}, set() - nl += len(self._inv_labels[i]) - for j in range(nx): - # Find unique labels and sort them for both graphs - # Keep for each node the temporary - L_temp[j] = {} - for v in Gs_ed[j]: - credential = ( - str(L[j][v]) + "," + str(sorted(L[j][n] for n in Gs_ed[j][v])) - ) - L_temp[j][v] = credential - if credential not in self._inv_labels[i]: - label_set.add(credential) - - # Calculate the new label_set - WL_labels_inverse = {} - if len(label_set) > 0: - for dv in sorted(label_set): - idx = len(WL_labels_inverse) + nl - WL_labels_inverse[dv] = idx - - # Recalculate labels - new_graphs = [] - for j in range(nx): - new_labels = {} - for k, v in L_temp[j].items(): - if v in self._inv_labels[i]: - new_labels[k] = self._inv_labels[i][v] - else: - new_labels[k] = WL_labels_inverse[v] - L[j] = new_labels - # Create the new graphs with the new labels. 
- new_graphs.append([Gs_ed[j], new_labels]) - yield new_graphs - - if return_embedding_only: - K = [] - for i, g in enumerate(generate_graphs_transform(WL_labels_inverse, nl)): - K.append( - self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - return_embedding_only=True, - ) - ) - return K - - # Calculate the kernel matrix without parallelization - if self.as_tensor: - summand = [ - self.layer_weights[i] - * self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - for i, g in enumerate(generate_graphs_transform(WL_labels_inverse, nl)) - ] - K = torch.stack(summand, dim=0).sum(dim=0) - else: - K = np.sum( - ( - self.layer_weights[i] - * self.X[i].transform( - g, - label_start_idx=self.feature_dims[i], - label_end_idx=self.feature_dims[i + 1], - ) - for (i, g) in enumerate( - generate_graphs_transform(WL_labels_inverse, nl) - ) - ), - axis=0, - ) - - self._is_transformed = True - if self.normalize: - X_diag, Y_diag = self.diagonal() - if self.as_tensor: - div_ = torch.sqrt(torch.outer(Y_diag, X_diag)) - K /= div_ - else: - old_settings = np.seterr(divide="ignore") - K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag)))) - np.seterr(**old_settings) - - return K - - def diagonal(self): - """Calculate the kernel matrix diagonal for fitted data. - - A funtion called on transform on a seperate dataset to apply - normalization on the exterior. - - Parameters - ---------- - None. - - Returns: - ------- - X_diag : np.array - The diagonal of the kernel matrix, of the fitted data. - This consists of kernel calculation for each element with itself. - - Y_diag : np.array - The diagonal of the kernel matrix, of the transformed data. - This consists of kernel calculation for each element with itself. - - """ - # Check if fit had been called - check_is_fitted(self, ["X"]) - try: - check_is_fitted(self, ["_X_diag"]) - if self._is_transformed: - Y_diag = self.X[0].diagonal()[1] - for i in range(1, self._h): - Y_diag += self.X[i].diagonal()[1] - except NotFittedError: - # Calculate diagonal of X - if self._is_transformed: - X_diag, Y_diag = self.X[0].diagonal() - # X_diag is considered a mutable and should not affect the kernel matrix itself. - X_diag.flags.writeable = True - for i in range(1, self._h): - x, y = self.X[i].diagonal() - X_diag += x - Y_diag += y - self._X_diag = X_diag - - # case sub kernel is only fitted - X_diag = self.X[0].diagonal() - # X_diag is considered a mutable and should not affect the kernel matrix itself. - X_diag.flags.writeable = True - for i in range(1, self._n_iter): - x = self.X[i].diagonal() - X_diag += x - self._X_diag = X_diag - - if self.as_tensor: - self._X_diag = torch.tensor(self._X_diag) - if Y_diag is not None: - Y_diag = torch.tensor(Y_diag) - if self._is_transformed: - return self._X_diag, Y_diag - return self._X_diag - - @staticmethod - def translate_label(curr_layer: dict, h: int, prev_layer: dict | None = None): - """Translate the label to be in terms of the node attributes - curr_layer: the WL_label_inverse object. A dictionary with element of the format of - {pattern: encoding}. 
- - Return: - label_in_node_attr: in terms of {encoding: pattern}, but pattern is always in term of the node attribute - inv_label_in_node_attr: in terms of {pattern: encoding} - - """ - if h == 0: - return {v: str(k) for k, v in curr_layer.items()}, curr_layer - assert prev_layer is not None - label_in_node_attr, inv_label_in_node_attr = OrderedDict(), OrderedDict() - for pattern, encoding in curr_layer.items(): - # current pattern is in terms of the encoding previous layer. Find the pattern from the prev_layer - root, leaf = literal_eval(pattern) - root_ = prev_layer[root] - leaf_ = [prev_layer[i] for i in leaf] - label_in_node_attr.update({encoding: "~".join([root_, *leaf_])}) - inv_label_in_node_attr.update({"~".join([root_, *leaf_]): encoding}) - return label_in_node_attr, inv_label_in_node_attr - - @staticmethod - def _compute_feature_weight( - node_weight: OrderedDict, h: int, inv_label_node_attr: OrderedDict - ): - """Compute the feature weight, based on the average weight of the constituent node attributes. - - Return: - feature_weights: a dictionary with h layers, each of which is a dictionary of the format of - {tuple1: weight1; tuplr2, weight2 ...} where tuplex is the tuple representation of the learned graph feature. - - feature_weight_flattened: same as above, but in a flattened np format. - """ - feature_weights = OrderedDict() - feature_weights_flattened = [] - if h == 0: - feature_weight = OrderedDict( - {k: (node_weight[k]) ** 2 for k in inv_label_node_attr} - ) - feature_weights_flattened = np.array(list(feature_weight.values())).flatten() - else: - for k, _ in inv_label_node_attr.items(): - # k is the pattern, v is the encoding - k_sep = k.split("~") - average_weight = np.mean([(node_weight[i]) ** 2 for i in k_sep]) - feature_weights.update({k: average_weight}) - feature_weights_flattened.append(average_weight) - feature_weights_flattened = np.array(feature_weights_flattened).flatten() - assert len(feature_weights_flattened) == len(inv_label_node_attr) - return feature_weights, feature_weights_flattened - - def dK_dX(self, X_test: None): - """Do additional forward and backward pass, compute the kernel derivative wrt the testing location. - If no test locations are provided, the derivatives are evaluated at the training points. - - Returns. - ------- - - """ - - -def efit(obj, data): - """Fit an object on data.""" - obj.fit(data) - - -def efit_transform(obj, data): - """Fit-Transform an object on data.""" - return obj.fit_transform(data) - - -def etransform(obj, data): - """Transform an object on data.""" - return obj.transform(data) diff --git a/neps/optimizers/bayesian_optimization/kernels/utils.py b/neps/optimizers/bayesian_optimization/kernels/utils.py deleted file mode 100644 index 6d94a25d..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/utils.py +++ /dev/null @@ -1,155 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import networkx as nx -import numpy as np - -if TYPE_CHECKING: - from neps.search_spaces.search_space import SearchSpace - - -def transform_to_undirected(gr: list): - """Transform a list of directed graphs by undirected graphs.""" - undirected_gr = [] - for g in gr: - if not isinstance(g, nx.Graph): - continue - if isinstance(g, nx.DiGraph): - undirected_gr.append(g.to_undirected()) - else: - undirected_gr.append(g) - return undirected_gr - - -def extract_configs(configs: list[SearchSpace]) -> tuple[list, list]: - """Extracts graph & HPs from configs objects. 
- - Args: - configs (list): Object holding graph and/or HPs - - Returns: - Tuple[list, list]: list of graphs, list of HPs - """ - config_hps = [conf.get_normalized_hp_categories() for conf in configs] - graphs = [hps["graphs"] for hps in config_hps] - - # Don't call np.array on structured objects - # https://github.com/numpy/numpy/issues/24546#issuecomment-1693913119 - # _nested_graphs = np.array(graphs, dtype=object) - # if _nested_graphs.ndim == 3 - # graphs = _nested_graphs[:, :, 0].reshape(-1).tolist() - # Long hand way of doing the above - # I guess this is just flattening... - if ( - len(graphs) > 0 - and isinstance(graphs[0], list) - and len(graphs[0]) > 0 - and isinstance(graphs[0][0], list) - ): - graphs = [_list for list_of_list in graphs for _list in list_of_list] - - return graphs, config_hps - - -def graph_metrics(graph, metric=None, directed=True): - G = graph if directed else graph.to_undirected() - - # global metrics - if metric == "avg_path_length": - avg_path_length = nx.average_shortest_path_length(G) - metric_score = avg_path_length - - elif metric == "density": - density = nx.density(G) - metric_score = density - - else: - raise NotImplementedError - - return metric_score - - -def extract_configs_hierarchy( - configs: list, d_graph_features: int, hierarchy_consider=None -) -> tuple[list, list]: - """Extracts graph & graph features from configs objects - Args: - configs (list): Object holding graph and/or graph features - d_graph_features (int): Number of global graph features used; if d_graph_features=0, indicate not using global graph features - hierarchy_consider (list or None): Specify graphs at which earlier hierarchical levels to be considered - Returns: - Tuple[list, list]: list of graphs, list of HPs. - """ - N = len(configs) - - config_hps = [conf.get_normalized_hp_categories() for conf in configs] - combined_graphs = [hps["graphs"] for hps in config_hps] - if N > 0 and hierarchy_consider is not None and combined_graphs[0]: - # graphs = list( - # map( - # list, - # zip( - # *[ - # [g[0][0]] - # + [g[0][1][hierarchy_id] for hierarchy_id in hierarchy_consider] - # for g in combined_graphs - # ] - # ), - # ) - # ) - graphs = list( - map( - list, - zip( - *[ - [g[0][0]] - + [ - g[0][1][hierarchy_id] - if hierarchy_id in g[0][1] - else g[0][1][max(g[0][1].keys())] - for hierarchy_id in hierarchy_consider - ] - for g in combined_graphs - ], - strict=False, - ), - ) - ) - ### full graph, 0th hierarchy (high-level, smallest), 1st hierarchy, 2nd hierarchy, 3rd hierarchy, ... - ### graph gets bigger of hierarchies - ### list shape: (1+4) x N - - # modify the node attribute labels on earlier hierarchy graphs e.g. - # note the node feature for graph in earlier hierarchical level should be more coarse - # e.g. 
{'op_name': '(Cell diamond (OPS id) (OPS avg_pool) (OPS id) (OPS avg_pool))'} -> {'op_name': 'Cell diamond '} - for hg_list in graphs[1:]: - for G in hg_list: - original_node_labels = nx.get_node_attributes(G, "op_name") - new_node_labels = { - k: v.split("(")[1] - for k, v in original_node_labels.items() - if "(" in v and ")" in v - } - nx.set_node_attributes(G, new_node_labels, name="op_name") - else: - # graphs = [g[0][0] for g in combined_graphs] - graphs = combined_graphs - - if N > 0 and d_graph_features > 0: - # graph_features = [c['metafeature'] for c in configs] - # these feature values are normalised between 0 and 1 - # the two graph features used are 'avg_path_length', 'density' - graph_features = [ - [ - graph_metrics(g[0][0], metric="avg_path_length"), - graph_metrics(g[0][0], metric="density"), - ] - for g in combined_graphs - ] - graph_features_array = np.vstack(graph_features) # shape n_archs x 2 (nx(2+d_hp)) - else: - # if not using global graph features of the final architectures, set them to None - graph_features_array = [None] * N - - return graphs, graph_features_array diff --git a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py b/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py deleted file mode 100644 index 44e8b8e1..00000000 --- a/neps/optimizers/bayesian_optimization/kernels/weisfilerlehman.py +++ /dev/null @@ -1,121 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING -from typing_extensions import Self - -import numpy as np -import numpy.typing as npt -import torch -from torch import nn - -from neps.optimizers.bayesian_optimization.kernels.grakel_replace import ( - VertexHistogram, - WeisfeilerLehman as _WL, -) -from neps.optimizers.bayesian_optimization.kernels.kernel import Kernel - -if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.kernels.vectorial_kernels import ( - NumericKernel, - ) - -GRID_WL_LENGTHSCALES = torch.tensor([np.e**i for i in range(-2, 3)]) -GRID_WL_SUBTREE_CANDIDATES = (1, 2, 3, 4, 5) - - -def normal_prior(param: torch.Tensor, mean: float, std: float) -> torch.Tensor: - return -0.5 * torch.sum(((param - mean) / std) ** 2) - torch.sum( - torch.log(std * torch.sqrt(2 * torch.tensor(np.pi))) - ) - - -def kernel_hp_prior(params: dict[str, nn.Parameter]) -> torch.Tensor: - return normal_prior(params["layer_weights"], mean=0, std=1) - - -class WeisfilerLehman(Kernel[npt.NDArray[np.object_]]): - """Weisfiler Lehman kernel using grakel functions.""" - - def __init__( - self, - *, - h: int = 0, - se_kernel: NumericKernel | None = None, - layer_weights: torch.Tensor | None = None, - oa: bool = False, - node_label: str = "op_name", - ): - """Initializes the Weisfeiler-Lehman kernel. - - Args: - h: The number of Weisfeiler-Lehman iterations - se_kernel: defines a stationary vector kernel to be used for - successive embedding (i.e. the kernel function on which the - vector embedding inner products are computed). - If None, uses the default linear kernel - layer_weights: The weights for each layer of the Weisfeiler-Lehman kernel. - If None, uses uniform 1s - oa: whether the optimal assignment variant of the Weisfiler-Lehman - kernel should be used - node_label: the node_label defining the key node attribute. - """ - super().__init__(hyperparameter_prior=kernel_hp_prior) - if se_kernel is not None and oa: - raise ValueError( - "Only one or none of se (successive embedding) and oa (optimal assignment) may be true!" 
- ) - - self.h = h - self.se_kernel = se_kernel - self.layer_weights = ( - layer_weights if layer_weights is not None else torch.ones(h + 1) - ) - self.oa = oa - self.node_label = node_label - if node_label != "op_name": - raise NotImplementedError("Only 'op_name' is supported for node_label") - - self.wl_kernel_: _WL | None = None - - def as_optimizable(self) -> Self: - return self.clone_with(layer_weights=nn.Parameter(self.layer_weights)) - - def fit_transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: - self.wl_kernel_ = _WL( - h=self.h, - base_graph_kernel=( # type: ignore - VertexHistogram, - { - "sparse": False, - "se_kernel": self.se_kernel, - "oa": self.oa, - "requires_ordered_features": True, - }, - ), - layer_weights=self.layer_weights / self.layer_weights.sum(), - normalize=True, - ) - - K = self.wl_kernel_.fit_transform(iter(gr)) - return torch.as_tensor(K, dtype=torch.float64) - - def transform(self, gr: npt.NDArray[np.object_]) -> torch.Tensor: - assert self.wl_kernel_ is not None - - K = self.wl_kernel_.transform(iter(gr)) - return torch.as_tensor(K, dtype=torch.float64) - - def forward( - self, - x: npt.NDArray[np.object_], - x2: npt.NDArray[np.object_] | None = None, - ) -> torch.Tensor: - if x2 is None: - K = self.fit_transform(x) - self.wl_kernel_ = None - return K - - self.fit_transform(x) - K = self.transform(x2) - self.wl_kernel_ = None - return K diff --git a/neps/optimizers/bayesian_optimization/models/__init__.py b/neps/optimizers/bayesian_optimization/models/__init__.py index 49ac7258..034049a3 100755 --- a/neps/optimizers/bayesian_optimization/models/__init__.py +++ b/neps/optimizers/bayesian_optimization/models/__init__.py @@ -1,9 +1,4 @@ from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate +from neps.optimizers.bayesian_optimization.models.gp import make_default_single_obj_gp -# TODO: Need the GP back here -# * What actually uses the GP -SurrogateModelMapping = { - "ftpfn": FTPFNSurrogate, -} - -__all__ = ["FTPFNSurrogate", "SurrogateModelMapping"] +__all__ = ["FTPFNSurrogate", "make_default_single_obj_gp"] diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index b3f5c2b2..6cafb1f7 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -50,7 +50,7 @@ def default_categorical_kernel( ) -def default_single_obj_gp( +def make_default_single_obj_gp( x: TensorPack, y: torch.Tensor, *, diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 11eca577..ad167a8a 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -19,7 +19,7 @@ pibo_acquisition, ) from neps.optimizers.bayesian_optimization.models.gp import ( - default_single_obj_gp, + make_default_single_obj_gp, optimize_acq, ) from neps.optimizers.intial_design import make_initial_design @@ -28,10 +28,7 @@ from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: - from neps.search_spaces import ( - SearchSpace, - ) - from neps.search_spaces.domain import Domain + from neps.search_spaces import SearchSpace from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter from neps.state import BudgetInfo, Trial @@ -144,7 +141,11 @@ def __init__( # noqa: D417 device: torch.device | None = None, encoder: 
TensorEncoder | None = None, seed: int | None = None, - treat_fidelity_as_hyperparameters: bool = False, + budget: Any | None = None, # TODO: remove + surrogate_model: Any | None = None, # TODO: remove + loss_value_on_error: Any | None = None, # TODO: remove + cost_value_on_error: Any | None = None, # TODO: remove + ignore_errors: Any | None = None, # TODO: remove ): """Initialise the BO loop. @@ -167,9 +168,6 @@ def __init__( # noqa: D417 device: Device to use for the optimization. encoder: Encoder to use for encoding the configurations. If None, it will will use the default encoder. - treat_fidelity_as_hyperparameters: Whether to treat fidelities as - hyperparameters. If left as False, fidelities will be ignored - and configurations will always be sampled at the maximum fidelity. Raises: ValueError: if initial_design_size < 1 @@ -183,12 +181,8 @@ def __init__( # noqa: D417 **pipeline_space.numerical, **pipeline_space.categoricals, } - if treat_fidelity_as_hyperparameters: - params.update(pipeline_space.fidelities) - self.encoder = TensorEncoder.default(params) if encoder is None else encoder self.prior = Prior.from_parameters(params) if use_priors is True else None - self.treat_fidelity_as_hyperparameters = treat_fidelity_as_hyperparameters self.seed = seed self.use_cost = use_cost self.device = device @@ -222,9 +216,7 @@ def ask( sample_size=( "ndim" if self.n_initial_design is None else self.n_initial_design ), - sample_fidelity=( - "max" if not self.treat_fidelity_as_hyperparameters else True - ), + sample_fidelity="max", ) if n_trials_sampled < len(self.initial_design_): @@ -233,10 +225,10 @@ def ask( # Now we actually do the BO loop, start by encoding the data # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... - x_configs: list[dict[str, Any]] = [] + x_configs: list[Mapping[str, Any]] = [] ys: list[float] = [] costs: list[float] = [] - pending: list[dict[str, Any]] = [] + pending: list[Mapping[str, Any]] = [] for trial in trials.values(): if trial.state.pending(): pending.append(trial.config) @@ -260,7 +252,7 @@ def ask( y = _missing_y_strategy(y) # Now fit our model - y_model = default_single_obj_gp( + y_model = make_default_single_obj_gp( x, y, # TODO: We should consider applying some heurisitc to see if this should @@ -318,7 +310,7 @@ def ask( cost = torch.tensor(costs, dtype=torch.float64, device=self.device) cost_z_score = _missing_cost_strategy(cost) - cost_model = default_single_obj_gp( + cost_model = make_default_single_obj_gp( x, cost_z_score, y_transform=ChainedOutcomeTransform( diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml index 9b5a3f37..c3525cc4 100644 --- a/neps/optimizers/default_searchers/bayesian_optimization.yaml +++ b/neps/optimizers/default_searchers/bayesian_optimization.yaml @@ -1,16 +1,6 @@ strategy: bayesian_optimization # Arguments that can be modified by the user -surrogate_model: gp -acquisition: EI # or {"LogEI", "AEI"} -log_prior_weighted: false -acquisition_sampler: mutation # or {"random", "evolution"} -random_interleave_prob: 0.0 -disable_priors: true -sample_default_first: false - -# Other arguments: -# surrogate_model_args: None # type: dict -# optimal_assignment: false # type: bool -# domain_se_kernel: None # type: str -# graph_kernels: None # type: list -# hp_kernels: None # type: list +initial_design_size: null # Defaults to depending on number or hyperparameters +use_cost: false # Whether to factor in cost when selecting new 
configurations +sample_default_first: # Whether to sample the default configuration first +device: null # Device to load the gaussian process model on with torch diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml index 0dc7a7db..36bff8b2 100644 --- a/neps/optimizers/default_searchers/pibo.yaml +++ b/neps/optimizers/default_searchers/pibo.yaml @@ -1,17 +1,6 @@ strategy: pibo # Arguments that can be modified by the user -surrogate_model: gp -acquisition: EI # or {"LogEI", "AEI"} -log_prior_weighted: false -acquisition_sampler: mutation # or {"random", "evolution"} -random_interleave_prob: 0.0 -disable_priors: false -prior_confidence: medium # or {"low", "high"} -sample_default_first: false - -# Other arguments: -# surrogate_model_args: None # type: dict -# optimal_assignment: false # type: bool -# domain_se_kernel: None # type: str -# graph_kernels: None # type: list -# hp_kernels: None # type: list +initial_design_size: null # Defaults to depending on number or hyperparameters +use_cost: false # Whether to factor in cost when selecting new configurations +sample_default_first: # Whether to sample the default configuration first +device: null # Device to load the gaussian process model on with torch diff --git a/neps/optimizers/multi_fidelity/mf_bo.py b/neps/optimizers/multi_fidelity/mf_bo.py index 8cb31ceb..2a092da1 100755 --- a/neps/optimizers/multi_fidelity/mf_bo.py +++ b/neps/optimizers/multi_fidelity/mf_bo.py @@ -3,14 +3,7 @@ from copy import deepcopy -import torch -from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate -from neps.optimizers.multi_fidelity.utils import ( - MFObservedData, - get_tokenized_data, - get_training_data_for_freeze_thaw, -) from neps.optimizers.multi_fidelity_prior.utils import ( calc_total_resources_spent, update_fidelity, diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index 9208e4c3..58d75387 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -16,7 +16,6 @@ from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) -from neps.optimizers.bayesian_optimization.models import SurrogateModelMapping from neps.optimizers.multi_fidelity_prior.utils import ( compute_config_dist, custom_crossover, @@ -397,167 +396,3 @@ def sample( # TODO: can generalize s.t. 
sampler works for all types, currently, # random sampler in NePS does not do what is required here # return self.acquisition_sampler.sample(self.acquisition) - - -class BaseDynamicModelPolicy(SamplingPolicy): - def __init__( - self, - pipeline_space: SearchSpace, - observed_configs: Any = None, - surrogate_model: str | Any = "gp", - domain_se_kernel: str | None = None, - hp_kernels: list | None = None, - graph_kernels: list | None = None, - surrogate_model_args: dict | None = None, - acquisition: str | BaseAcquisition = "EI", - use_priors: bool = False, - log_prior_weighted: bool = False, - acquisition_sampler: str | AcquisitionSampler = "random", - patience: int = 100, - logger=None, - ): - super().__init__(pipeline_space=pipeline_space, logger=logger) - - surrogate_model_args = surrogate_model_args or {} - - graph_kernels, hp_kernels = get_default_kernels( - pipeline_space=pipeline_space, - domain_se_kernel=domain_se_kernel, - graph_kernels=graph_kernels, - hp_kernels=hp_kernels, - optimal_assignment=False, - ) - if "graph_kernels" not in surrogate_model_args: - surrogate_model_args["graph_kernels"] = graph_kernels - if "hp_kernels" not in surrogate_model_args: - surrogate_model_args["hp_kernels"] = hp_kernels - if not surrogate_model_args["hp_kernels"]: - raise ValueError("No kernels are provided!") - if "vectorial_features" not in surrogate_model_args: - surrogate_model_args["vectorial_features"] = ( - pipeline_space.get_vectorial_dim() - ) - - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, - ) - - self.acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - ) - - if use_priors and pipeline_space.has_prior: - self.acquisition = DecayingPriorWeightedAcquisition( - self.acquisition, log=log_prior_weighted - ) - - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": patience, "pipeline_space": pipeline_space}, - ) - - self.sampling_args: dict = {} - - self.observed_configs = observed_configs - - def _fantasize_pending(self, train_x, train_y, pending_x): - if len(pending_x) == 0: - return train_x, train_y - # fit model on finished evaluations - self.surrogate_model.fit(train_x, train_y) - # hallucinating: predict for the pending evaluations - _y, _ = self.surrogate_model.predict(pending_x) - _y = _y.detach().numpy().tolist() - # appending to training data - train_x.extend(pending_x) - train_y.extend(_y) - return train_x, train_y - - def update_model(self, train_x=None, train_y=None, pending_x=None, decay_t=None): - if train_x is None: - train_x = [] - if train_y is None: - train_y = [] - if pending_x is None: - pending_x = [] - - if decay_t is None: - decay_t = len(train_x) - train_x, train_y = self._fantasize_pending(train_x, train_y, pending_x) - self.surrogate_model.fit(train_x, train_y) - self.acquisition.set_state(self.surrogate_model, decay_t=decay_t) - self.acquisition_sampler.set_state(x=train_x, y=train_y) - - @abstractmethod - def sample(self, *args, **kwargs) -> tuple[int, SearchSpace]: - pass - - -class RandomPromotionDynamicPolicy(BaseDynamicModelPolicy): - def __init__(self, *args, **kwargs): - self.num_train_configs = 0 - - super().__init__(*args, **kwargs) - - def _fantasize_pending(self, *args, **kwargs): - pending_configs = [] - - # Select configs that are neither pending nor resulted in error - completed_configs = 
self.observed_configs.completed_runs.copy(deep=True) - - # Get the config, performance values for the maximum budget runs that are completed - max_budget_samples = completed_configs.sort_index().groupby(level=0).last() - max_budget_configs = max_budget_samples[ - self.observed_configs.config_col - ].to_list() - max_budget_perf = max_budget_samples[self.observed_configs.perf_col].to_list() - - pending_condition = self.observed_configs.pending_condition - if pending_condition.any(): - pending_configs = ( - self.observed_configs.df[pending_condition] - .loc[(), self.observed_configs.config_col] - .unique() - .to_list() - ) - return super()._fantasize_pending( - max_budget_configs, max_budget_perf, pending_configs - ) - - def sample(self, rand_promotion_prob=0.5, seed=777, is_promotion=False, **kwargs): - promoted = False - # np.random.seed(seed) - if np.random.random_sample() < rand_promotion_prob: - config_id = ( - self.observed_configs.df[~self.observed_configs.error_condition] - .sample(1) - .index[0][0] - ) - max_budget_id = self.observed_configs.df.loc[(config_id,)].index[-1] - config = self.observed_configs.df.loc[ - (config_id, max_budget_id), self.observed_configs.config_col - ] - promoted = True - - else: - config_id = len(self.observed_configs.df.index.levels[0]) - config = self.acquisition_sampler.sample(self.acquisition) - - if is_promotion and promoted: - return config_id - if is_promotion: - return None - return config - - # def sample(self, **kwargs): - # return self._sample(is_promotion=False, **kwargs) - # - # def retrieve_promotions(self, **kwargs): - # return self._sample(is_promotion=True, **kwargs) diff --git a/neps/search_spaces/architecture/api.py b/neps/search_spaces/architecture/api.py index de19a9ef..ba73f1ca 100644 --- a/neps/search_spaces/architecture/api.py +++ b/neps/search_spaces/architecture/api.py @@ -1,4 +1,4 @@ - +from __future__ import annotations import inspect from typing import TYPE_CHECKING, Callable diff --git a/neps_examples/basic_usage/architecture.py b/neps_examples/basic_usage/architecture.py index 5d43efe7..cc73029a 100644 --- a/neps_examples/basic_usage/architecture.py +++ b/neps_examples/basic_usage/architecture.py @@ -1,4 +1,7 @@ - +raise NotImplementedError( + "Support for graphs was temporarily removed, if you'd like to use a version" + " of NePS that supports graphs, please use version v0.12.2" +) import logging diff --git a/neps_examples/basic_usage/architecture_and_hyperparameters.py b/neps_examples/basic_usage/architecture_and_hyperparameters.py index e0b63fe4..c83f3eac 100644 --- a/neps_examples/basic_usage/architecture_and_hyperparameters.py +++ b/neps_examples/basic_usage/architecture_and_hyperparameters.py @@ -1,3 +1,8 @@ +raise NotImplementedError( + "Support for graphs was temporarily removed, if you'd like to use a version" + " of NePS that supports graphs, please use version v0.12.2" +) + import logging from torch import nn diff --git a/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py b/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py index 77ce9e9f..5aa3e523 100644 --- a/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py +++ b/neps_examples/experimental/expert_priors_for_architecture_and_hyperparameters.py @@ -130,5 +130,5 @@ def run_pipeline(some_architecture, some_float, some_integer, some_cat): pipeline_space=pipeline_space, root_directory="results/user_priors_with_graphs", max_evaluations_total=15, - log_prior_weighted=True, + 
use_priors=True, ) diff --git a/neps_examples/experimental/hierarchical_architecture.py b/neps_examples/experimental/hierarchical_architecture.py index 55ed9144..20a912d0 100644 --- a/neps_examples/experimental/hierarchical_architecture.py +++ b/neps_examples/experimental/hierarchical_architecture.py @@ -1,3 +1,8 @@ +raise NotImplementedError( + "Support for graphs was temporarily removed, if you'd like to use a version" + " of NePS that supports graphs, please use version v0.12.2" +) + import logging from torch import nn diff --git a/tests/test_examples.py b/tests/test_examples.py index abdd10c5..5575eb4d 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -38,6 +38,13 @@ def test_core_examples(example): # Run hyperparameters example to have something to analyse runpy.run_path(str(core_examples_scripts[0]), run_name="__main__") + if example.name in ( + "architecture.py", + "hierarchical_architecture.py", + "expert_priors_for_architecture_and_hyperparameters.py", + ): + pytest.xfail("Architecture were removed temporarily") + runpy.run_path(str(example), run_name="__main__") diff --git a/tests/test_neps_api/testing_scripts/default_neps.py b/tests/test_neps_api/testing_scripts/default_neps.py index 370c6255..c6e1ac12 100644 --- a/tests/test_neps_api/testing_scripts/default_neps.py +++ b/tests/test_neps_api/testing_scripts/default_neps.py @@ -1,7 +1,6 @@ import logging import neps -from neps.optimizers.bayesian_optimization.kernels import GraphKernelMapping pipeline_space_fidelity_priors = dict( val1=neps.FloatParameter(lower=-10, upper=10, default=1), @@ -46,39 +45,6 @@ def run_pipeline(val1, val2): eta=3, ) -# Case 2: Choosing Bayesian optimization - -early_hierarchies_considered = "0_1_2_3" -hierarchy_considered = [int(hl) for hl in early_hierarchies_considered.split("_")] -graph_kernels = ["wl"] * (len(hierarchy_considered) + 1) -wl_h = [2, 1] + [2] * (len(hierarchy_considered) - 1) -graph_kernels = [ - GraphKernelMapping[kernel]( - h=wl_h[j], - oa=False, - se_kernel=None, - ) - for j, kernel in enumerate(graph_kernels) -] -surrogate_model = -surrogate_model_args = { - "graph_kernels": graph_kernels, - "hp_kernels": [], - "verbose": False, - "hierarchy_consider": hierarchy_considered, - "d_graph_features": 0, - "vectorial_features": None, -} -neps.run( - run_pipeline=run_pipeline, - pipeline_space=pipeline_space_not_fidelity, - root_directory="bo_user_decided", - max_evaluations_total=1, - searcher="bayesian_optimization", - surrogate_model=surrogate_model, - surrogate_model_args=surrogate_model_args, -) - # Testing neps decision tree on deciding the searcher and rejecting the # additional arguments. 
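With the graph-kernel stack removed, the `bayesian_optimization` and `pibo` searchers only expose the options kept in the YAML defaults above (`initial_design_size`, `use_cost`, `sample_default_first`, `device`). Below is a minimal sketch of driving the simplified searcher through `neps.run`, assuming those YAML keys can also be passed as keyword overrides; the exact accepted keyword names, and the toy objective body, are assumptions for illustration and are not confirmed by this patch.

import logging

import neps

pipeline_space = dict(
    val1=neps.FloatParameter(lower=-10, upper=10),
    val2=neps.IntegerParameter(lower=1, upper=5),
)


def run_pipeline(val1, val2):
    # Toy objective for the sketch; any float loss is fine here.
    return val1 * val2


logging.basicConfig(level=logging.INFO)

# Hypothetical overrides mirroring bayesian_optimization.yaml; the real
# keyword names accepted by neps.run may differ from these YAML keys.
neps.run(
    run_pipeline=run_pipeline,
    pipeline_space=pipeline_space,
    root_directory="results/bo_simplified",
    max_evaluations_total=15,
    searcher="bayesian_optimization",
    initial_design_size=None,    # default: derived from the number of hyperparameters
    use_cost=False,              # whether to factor cost into acquisition
    sample_default_first=False,  # whether to evaluate the default configuration first
    device=None,                 # torch device to load the Gaussian process on
)
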
From 178ae68d302187de003b299f89468135d6f8c9e3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 30 Sep 2024 16:15:14 +0200 Subject: [PATCH 54/63] refactor(ifbo): Mostly clean --- .../acquisition_functions/cost_cooling.py | 2 - .../acquisition_functions/pibo.py | 4 +- .../weighted_acquisition.py | 15 +- .../bayesian_optimization/models/ftpfn.py | 272 +++++++++++++++++- .../bayesian_optimization/models/gp.py | 248 ++++++++++++++-- .../bayesian_optimization/optimizer.py | 249 ++++------------ neps/optimizers/intial_design.py | 13 +- neps/optimizers/multi_fidelity/ifbo.py | 260 ++++++++--------- .../multi_fidelity/sampling_policy.py | 129 +++++---- neps/sampling/priors.py | 21 +- neps/sampling/samplers.py | 104 ++++++- neps/search_spaces/domain.py | 54 ++-- neps/search_spaces/encoding.py | 121 +------- 13 files changed, 897 insertions(+), 595 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py index a32baebe..cea2aebd 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/cost_cooling.py @@ -42,7 +42,6 @@ def cost_cooled_acq( acq_fn: AcquisitionFunction, model: GPyTorchModel, used_budget_percentage: float, - X_pending: torch.Tensor | None = None, ) -> WeightedAcquisition: assert 0 <= used_budget_percentage <= 1 return WeightedAcquisition( @@ -52,5 +51,4 @@ def cost_cooled_acq( cost_model=model, alpha=1 - used_budget_percentage, ), - X_pending=X_pending, ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py index db3120e7..c61d9d56 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py @@ -40,7 +40,7 @@ def apply_pibo_acquisition_weight( prior_exponent: float, ): if acq._log: - weighted_log_probs = prior.log_prob(X, frm=x_domain) * prior_exponent + weighted_log_probs = prior.log_prob(X, frm=x_domain) + prior_exponent return acq_values + weighted_log_probs weighted_probs = prior.prob(X, frm=x_domain).pow(prior_exponent) @@ -52,7 +52,6 @@ def pibo_acquisition( prior: Prior, prior_exponent: float, x_domain: Domain | list[Domain], - X_pending: Tensor | None = None, ) -> WeightedAcquisition: return WeightedAcquisition( acq=acq_fn, @@ -62,5 +61,4 @@ def pibo_acquisition( x_domain=x_domain, prior_exponent=prior_exponent, ), - X_pending=X_pending, ) diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py index f589298b..fd23d331 100644 --- a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py +++ b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py @@ -97,7 +97,6 @@ def __init__( self, acq: A, apply_weight: Callable[[Tensor, Tensor, A], Tensor], - X_pending: Tensor | None = None, ) -> None: """Initialize the weighted acquisition function. @@ -109,15 +108,19 @@ def __init__( Please see the module docstring for more information on the dimensions and how to handle them. - X_pending: `n x d` Tensor with `n` `d`-dim design points that have - been submitted for evaluation but have not yet been evaluated. 
""" super().__init__(model=acq.model) # NOTE: We remove the X_pending from the base acquisition function as we will get # it in our own forward with `@concatenate_pending_points` and pass that forward. - # This avoids possible duplicates - acq.set_X_pending(None) - self.set_X_pending(X_pending) + # This avoids possible duplicates. Also important to explicitly set it to None + # even if it does not exist as otherwise the attribute does not exists -_- + if (X_pending := getattr(acq, "X_pending", None)) is not None: + acq.set_X_pending(None) + self.set_X_pending(X_pending) + else: + acq.set_X_pending(None) + self.set_X_pending(None) + self.apply_weight = apply_weight self.acq = acq self._log = acq._log diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 2041396f..6c6464a6 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -1,11 +1,40 @@ from __future__ import annotations +from collections.abc import Callable, Mapping +from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, Literal import torch from ifbo import FTPFN +from neps.sampling.samplers import Sampler +from neps.search_spaces.domain import Domain +from neps.search_spaces.encoding import CategoricalToUnitNorm, ConfigEncoder +from neps.search_spaces.search_space import SearchSpace +from neps.state.trial import Trial + + +def _keep_highest_budget_evaluation( + x: torch.Tensor, + id_col: int = 0, + budget_col: int = 1, +) -> torch.Tensor: + # Does a lexsort, same as if we sorted by (config_id, budget), where + # theyre are sorted according to increasing config_id and then increasing budget. + # x[i2] -> sorted by config id and budget + i1 = torch.argsort(x[:, budget_col]) + i2 = i1[torch.argsort(x[i1][:, id_col], stable=True)] + sorted_x = x[i2] + + # Now that it's sorted, we essentially want to count the occurence of each id into counts + _, counts = torch.unique_consecutive(sorted_x[:, id_col], return_counts=True) + + # Now we can use these counts to get to the last occurence of each id + # The -1 is because we want to index from 0 but sum starts at 1. + ii = counts.cumsum(0) - 1 + return sorted_x[ii] + def _download_workaround_for_ifbo_issue_10(path: Path | None, version: str) -> Path: # TODO: https://github.com/automl/ifBO/issues/10 @@ -68,6 +97,233 @@ def _cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor: raise ValueError(f"Shape not recognized: {x.shape}") +# NOTE: Ifbo was trained using 32 bit +FTPFN_DTYPE = torch.float32 + + +def encode_trials_for_ftpfn( + trials: Mapping[str, Trial], + space: SearchSpace, + budget_domain: Domain, + encoder: ConfigEncoder, + *, + device: torch.device | None = None, + dtype: torch.dtype = FTPFN_DTYPE, + error_value: float = 0.0, +) -> FTPFNData: + """Encode the trials into a format that the FTPFN model can understand. + + !!! warning "Pending trials" + + For trials which do not have a loss reported yet, they are considered pending + and will have `torch.nan` as their score inside the returned y values. + If using + [`acquire_next_from_ftpfn()`][neps.optimizers.bayesian_optimization.models.ftpfn.acquire_next_from_ftpfn], + the result of these configurations will be fantasized. + + !!! warning "Error values" + + The FTPFN model requires that all loss values lie in the interval [0, 1]. 
+ By default, using the value of `error_value=0.0`, we encode crashed configurations as + having an error value of 0. + + Args: + trials: The trials to encode + encoder: The encoder to use + space: The search space + budget_domain: The domain to use for the budgets of the FTPFN + device: The device to use + dtype: The dtype to use + + Returns: + The encoded trials and their corresponding **scores** + """ + # Select all trials which have something we can actually use for modelling + # The absence of a report signifies pending + selected = {trial_id: trial for trial_id, trial in trials.items()} + assert space.fidelity_name is not None + assert space.fidelity is not None + assert 0 <= error_value <= 1 + train_configs = encoder.encode([t.config for t in selected.values()], device=device) + ids = torch.tensor( + [int(config_id.split("_", maxsplit=1)[0]) for config_id in selected.keys()], + device=device, + dtype=dtype, + ) + # PFN uses `0` id for test configurations + ids = ids + 1 + + train_fidelities = torch.tensor( + [t.config[space.fidelity_name] for t in selected.values()], + device=device, + dtype=dtype, + ) + train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) + + # TODO: Document that it's on the user to ensure these are already all bounded + # We could possibly include some bounded transform to assert this. + minimize_ys = torch.tensor( + [ + torch.nan + if trial.report is None + else (error_value if trial.report.loss is None else trial.report.loss) + for trial in trials.values() + ], + device=device, + dtype=dtype, + ) + if minimize_ys.max() > 1 or minimize_ys.min() < 0: + raise RuntimeError( + "ifBO requires that all loss values reported lie in the interval [0, 1]" + " but recieved loss value outside of that range!" + f"\n{minimize_ys}" + ) + maximize_ys = 1 - minimize_ys + return FTPFNData( + ids=ids, + x=train_configs, + y=maximize_ys, + budgets=train_budgets, + pending_mask=minimize_ys.isnan(), + ) + + +@dataclass +class FTPFNData: + """Dataclass to hold the data for the FTPFN model. + + The layout of the data is as follows: + + * `ids`: The configuration ids. These will have +1 added to them as FTPFN uses `0` + for test configurations, but NePS starts ids at `0`. + * `x`: The encoded configurations, includes everything that was encoded by the encoder + passed to + [`encode_trials_for_ftpfn()`][neps.optimizers.bayesian_optimization.models.ftpfn.encode_trials_for_ftpfn] + * `y`: The scores of the configurations, these are inverted such they are to be maximized, where 1 is the maximum + score obtainable and 0 is the minimum. Any configuration which did not have a loss gets a score of `nan`. + * `budgets`: The budgets of the configurations, normalized to the range [0, 1]. + These are normalized such that the lower bound of the fidelity domain maps to `1/max_fid` + while the upper bound maps to `1`. + * `pending_mask`: A mask to indicate which configurations are pending, i.e. have not been evaluated yet. + If there are no pending configurations, this should be `None`. 
+ """ + + ids: torch.Tensor + x: torch.Tensor + y: torch.Tensor + budgets: torch.Tensor + pending_mask: torch.Tensor | None = None + + +def create_border_configs( + ndims: int, + *, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + max_samples: int = 2**9, +) -> torch.Tensor: + n_samples = 2**ndims + _arange = torch.arange(n_samples, device=device, dtype=torch.int32) + # 2**9 is only 512 samples, so we can afford to exhaustively generate them + # We likely won't have this many hyperparameters anywho + if n_samples <= max_samples: + configs = _arange + else: + # Otherwise, we take a random sample of the 2**n possible border configs + rand_uniq_indices = torch.randperm(n_samples, device=device)[:max_samples] + configs = _arange[rand_uniq_indices] + + # https://stackoverflow.com/a/63546308/5332072 + bit_masks = 2 ** _arange[ndims] + return configs.unsqueeze(1).bitwise_and(bit_masks).ne(0).to(dtype) + + +def acquire_next_from_ftpfn( + *, + ftpfn: FTPFNSurrogate, + data: FTPFNData, + encoder: ConfigEncoder, + budget_domain: Domain, + fidelity_domain: Domain, + seed: int | None = None, + acq_strategy: Callable[ + [torch.Tensor, torch.Tensor, torch.Tensor, FTPFNSurrogate], torch.Tensor + ], + dtype: torch.dtype | None = FTPFN_DTYPE, + extra_acq_samples: torch.Tensor | None = None, +) -> tuple[int | None, int | float | None, dict[str, Any]]: + X = torch.cat([data.ids.unsqueeze(1), data.budgets.unsqueeze(1), data.x], dim=1).to( + dtype + ) + ys = data.y.clone().detach() + + # In-fill pending with predicted performance + if data.pending_mask is not None: + not_pending = ~data.pending_mask + pending_ys = ftpfn.get_mean_performance( + train_x=X[not_pending], + train_y=ys[not_pending], + test_x=X[data.pending_mask], + ) + ys[data.pending_mask] = pending_ys + + # We also need to append existing configurations that are in training data, but bump up their + # budget by one step. + # 1. Exclude all configurations which are currently pending + acq_existing = X + if data.pending_mask is not None: + acq_existing = X[~data.pending_mask] + + # 2. Remove duplicate configurations from x train, keeping only the most recent eval + acq_existing = _keep_highest_budget_evaluation(acq_existing, id_col=0, budget_col=1) + + # 3. Remove configs that have been fully evaluated + acq_existing = acq_existing[acq_existing[:, 1] < budget_domain.upper] + + # 4. Include the extra acquisition samples + if extra_acq_samples is None: + samples = [acq_existing] + else: + _shape = (len(extra_acq_samples), 1) + acq_extra = torch.cat( + [ + torch.zeros(_shape, dtype=dtype, device=ftpfn.device), + torch.full(_shape, budget_domain.lower, dtype=dtype, device=ftpfn.device), + extra_acq_samples, + ], + dim=1, + ) + samples = [acq_existing, acq_extra] + + # 5. Now we can fuse them together + acq_samples = torch.cat(samples, dim=0).to(dtype=dtype) + + # We keep a copy of the original budgets incase they get modified + # so we can return the fidelity of the sample that had the best acquisition score + budgets_prior_to_acq = acq_samples[:, 1].clone().detach() + + # Now we offload acquisition to the caller + acq_scores = acq_strategy(X, ys, acq_samples, ftpfn) + + # Extract out the row which had the best PI + best_ix = acq_scores.argmax() + + best_id = int(acq_samples[best_ix, 0].round().item()) + if best_id == 0: # It was a new acq. 
sample + best_real_id = None + best_fid = None + else: # It was a sample to continue, decrement the 1 added earlier + best_real_id = best_id - 1 + best_fid = fidelity_domain.cast_one( + budgets_prior_to_acq[best_ix].item(), frm=budget_domain + ) + + best_vector = acq_samples[best_ix, 2:].unsqueeze(0) + best_config = encoder.decode(best_vector)[0] + + return best_real_id, best_fid, best_config + + _CACHED_FTPFN_MODEL: dict[tuple[str, str], FTPFN] = {} @@ -122,14 +378,10 @@ def get_pi( train_x: torch.Tensor, train_y: torch.Tensor, test_x: torch.Tensor, - # TODO: just calculate from train_y? - y_best: torch.Tensor, + y_best: torch.Tensor | float, ) -> torch.Tensor: logits = self._get_logits(train_x, train_y, test_x) - return self.ftpfn.model.criterion.pi( - logits.squeeze(), - best_f=(1 - y_best).unsqueeze(1), - ) + return self.ftpfn.model.criterion.pi(logits.squeeze(), best_f=y_best) @torch.no_grad() def get_ei( @@ -137,12 +389,10 @@ def get_ei( train_x: torch.Tensor, train_y: torch.Tensor, test_x: torch.Tensor, - y_best: torch.Tensor, + y_best: torch.Tensor | float, ) -> torch.Tensor: logits = self._get_logits(train_x, train_y, test_x) - return self.ftpfn.model.criterion.ei( - logits.squeeze(), best_f=(1 - y_best).unsqueeze(1) - ) + return self.ftpfn.model.criterion.ei(logits.squeeze(), best_f=y_best) @torch.no_grad() def get_lcb( diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 6cafb1f7..96d6b7e0 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -6,25 +6,31 @@ from collections.abc import Mapping from functools import reduce from typing import TYPE_CHECKING, Any, TypeVar +from dataclasses import dataclass +from botorch.fit import fit_gpytorch_mll +from gpytorch import ExactMarginalLogLikelihood import torch import gpytorch.constraints -from botorch.acquisition.analytic import SingleTaskGP -from botorch.models.gp_regression import ( - get_covar_module_with_dim_scaled_prior, -) +from botorch.models import SingleTaskGP +from botorch.models.gp_regression import Log, get_covar_module_with_dim_scaled_prior from botorch.models.gp_regression_mixed import CategoricalKernel, OutcomeTransform -from botorch.models.transforms.outcome import Standardize +from botorch.models.transforms.outcome import ChainedOutcomeTransform, Standardize from botorch.optim import optimize_acqf, optimize_acqf_mixed from gpytorch.kernels import ScaleKernel from botorch.optim import optimize_acqf, optimize_acqf_mixed from itertools import product -from neps.search_spaces.encoding import ( - CategoricalToIntegerTransformer, - TensorEncoder, - TensorPack, +from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( + cost_cooled_acq, +) +from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( + pibo_acquisition, ) +from neps.sampling.priors import Prior +from neps.search_spaces.encoding import CategoricalToIntegerTransformer, ConfigEncoder +from neps.search_spaces.search_space import SearchSpace +from neps.state.trial import Trial if TYPE_CHECKING: from botorch.acquisition import AcquisitionFunction @@ -35,6 +41,16 @@ T = TypeVar("T") +@dataclass +class GPEncodedData: + """Tensor data of finished configurations.""" + + x: torch.Tensor + y: torch.Tensor + cost: torch.Tensor | None = None + x_pending: torch.Tensor | None = None + + def default_categorical_kernel( N: int, active_dims: tuple[int, ...] 
| None = None, @@ -51,8 +67,9 @@ def default_categorical_kernel( def make_default_single_obj_gp( - x: TensorPack, + x: torch.Tensor, y: torch.Tensor, + encoder: ConfigEncoder, *, y_transform: OutcomeTransform | None = None, ) -> SingleTaskGP: @@ -63,7 +80,6 @@ def make_default_single_obj_gp( if y_transform is None: y_transform = Standardize(m=1) - encoder = x.encoder numerics: list[int] = [] categoricals: list[int] = [] for hp_name, transformer in encoder.transformers.items(): @@ -74,12 +90,12 @@ def make_default_single_obj_gp( # Purely vectorial if len(categoricals) == 0: - return SingleTaskGP(train_X=x.tensor, train_Y=y, outcome_transform=y_transform) + return SingleTaskGP(train_X=x, train_Y=y, outcome_transform=y_transform) # Purely categorical if len(numerics) == 0: return SingleTaskGP( - train_X=x.tensor, + train_X=x, train_Y=y, covar_module=default_categorical_kernel(len(categoricals)), outcome_transform=y_transform, @@ -108,16 +124,13 @@ def make_default_single_obj_gp( kernel = numeric_kernel + cat_kernel return SingleTaskGP( - train_X=x.tensor, - train_Y=y, - covar_module=kernel, - outcome_transform=y_transform, + train_X=x, train_Y=y, covar_module=kernel, outcome_transform=y_transform ) def optimize_acq( acq_fn: AcquisitionFunction, - encoder: TensorEncoder, + encoder: ConfigEncoder, *, n_candidates_required: int = 1, num_restarts: int = 20, @@ -200,3 +213,202 @@ def optimize_acq( fixed_features_list=fixed_cats, **acq_options, ) + + +def encode_trials_for_gp( + trials: Mapping[str, Trial], + space: SearchSpace, + *, + encoder: ConfigEncoder | None = None, + device: torch.device | None = None, +) -> tuple[GPEncodedData, ConfigEncoder]: + train_configs: list[Mapping[str, Any]] = [] + train_losses: list[float] = [] + train_costs: list[float] = [] + pending_configs: list[Mapping[str, Any]] = [] + + if encoder is None: + encoder = ConfigEncoder.default({**space.numerical, **space.categoricals}) + + for trial in trials.values(): + if trial.report is None: + pending_configs.append(trial.config) + continue + + train_configs.append(trial.config) + + loss = trial.report.loss + train_losses.append(torch.nan if loss is None else loss) + + cost = trial.report.cost + train_costs.append(torch.nan if cost is None else cost) + + x_train = encoder.encode(train_configs, device=device) + y_train = torch.tensor(train_losses, dtype=torch.float64, device=device) + cost_train = torch.tensor(train_costs, dtype=torch.float64, device=device) + if len(pending_configs) > 0: + x_pending = encoder.encode(pending_configs, device=device) + else: + x_pending = None + + data = GPEncodedData(x=x_train, y=y_train, cost=cost_train, x_pending=x_pending) + return data, encoder + + +def fit_and_acquire_from_gp( + *, + gp: SingleTaskGP, + x_train: torch.Tensor, + y_train: torch.Tensor, + encoder: ConfigEncoder, + fantasize_pending: torch.Tensor | None = None, + acquisition: AcquisitionFunction, + prior: Prior | None = None, + pibo_exp_term: float | None = None, + cost_gp: SingleTaskGP | None = None, + costs: torch.Tensor | None = None, + cost_percentage_used: float | None = None, + costs_on_log_scale: bool = True, + seed: int | None = None, + n_candidates_required: int | None = None, + num_restarts: int = 20, + n_initial_start_points: int | None = None, + maximum_allowed_categorical_combinations: int = 30, + acq_options: Mapping[str, Any] | None = None, +) -> torch.Tensor: + """Acquire the next configuration to evaluate using a GP. 
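+
+    A rough sketch of the intended flow (illustrative only; it mirrors how
+    `BayesianOptimization.ask()` uses this function and is not a verbatim snippet):
+
+        data, encoder = encode_trials_for_gp(trials, space)
+        gp = make_default_single_obj_gp(x=data.x, y=data.y, encoder=encoder)
+        acq = qLogNoisyExpectedImprovement(
+            model=gp,
+            X_baseline=data.x,
+            X_pending=data.x_pending,
+            # botorch maximizes, while NePS losses are to be minimized
+            objective=LinearMCObjective(weights=torch.tensor([-1.0])),
+            prune_baseline=True,
+        )
+        candidates = fit_and_acquire_from_gp(
+            gp=gp, x_train=data.x, y_train=data.y, encoder=encoder, acquisition=acq
+        )
+        config = encoder.decode(candidates)[0]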
+ + Please see the following for: + + * Making a GP to pass in: + [`make_default_single_obj_gp`][neps.optimizers.bayesian_optimization.models.gp.make_default_single_obj_gp] + * Encoding configurations: + [`encode_trials_for_gp`][neps.optimizers.bayesian_optimization.models.gp.encode_trials_for_gp] + + Args: + gp: The GP model to use. + x_train: The encoded configurations that have already been evaluated. + y_train: The loss of the evaluated configurations. + + !!! note "NaNs" + + Any y values encoded with NaNs will automatically be filled with + the mean loss value. This is to ensure a smoother acquisition + function optimization landscape. While this is a poorer + approximation of the landscape, this does not matter as we are + aiming to ensure that the GP models good areas as being better + than any other area, regardless of whether they are average or garbage. + + encoder: The encoder used for encoding the configurations + fantasize_pending: The pending configurations to fantasize over. Please be aware + that there are more efficient strategies as some acquisition functions can + handle this explicitly. + acquisition: The acquisition function to use. + + A good default is `qLogNoisyExpectedImprovement` which can + handle pending configurations gracefully without fantasization. + + prior: The prior to use over configurations. If this is provided, the + acquisition function will be further weighted using the piBO acquisition. + pibo_exp_term: The exponential term for the piBO acquisition. If `None` is + provided, one will be estimated. + costs: The costs of evaluating the configurations. If this is provided, + then a secondary GP will be used to estimate the cost of a given + configuration and factor it into the weighting during the acquisition of a new + configuration. + cost_percentage_used: The percentage of the budget used so far. This is used to determine + the strength of the cost cooling. Should be between 0 and 1. + Must be provided if costs is provided. + costs_on_log_scale: Whether the costs are on a log scale. + seed: The seed to use. + n_candidates_required: The number of candidates to return. If left + as `None`, only the best candidate will be returned. Otherwise + a list of candidates will be returned. + num_restarts: The number of restarts to use during optimization. + n_initial_start_points: The number of initial start points to use during optimization. + maximum_allowed_categorical_combinations: The maximum number of categorical + combinations to allow. If the number of combinations exceeds this, an error + will be raised. + acq_options: Additional options to pass to the botorch `optimize_acqf` function. + + Returns: + The encoded next configuration(s) to evaluate. Use the encoder you provided + to decode the configuration. + """ + fit_gpytorch_mll(ExactMarginalLogLikelihood(likelihood=gp.likelihood, model=gp)) + + if fantasize_pending is not None: + y_train = torch.cat([y_train, gp.posterior(fantasize_pending).mean], dim=0) + x_train = torch.cat([x_train, fantasize_pending], dim=0) + + if prior: + if pibo_exp_term is None: + raise ValueError( + "If providing a prior, you must provide the `pibo_exp_term`." + ) + + acquisition = pibo_acquisition( + acquisition, + prior=prior, + prior_exponent=pibo_exp_term, + x_domain=encoder.domains, + ) + + if costs is not None: + if cost_percentage_used is None: + raise ValueError( + "If providing costs, you must provide `cost_percentage_used`."
+ ) + + # We simply ignore missing costs when training the cost GP. + missing_costs = torch.isnan(costs) + if missing_costs.all(): + raise ValueError( + "Must have at least some configurations reported with a cost if using costs" + " with a GP." + ) + + if missing_costs.any(): + not_missing_mask = ~missing_costs + x_train_cost = x_train[not_missing_mask] + y_train_cost = costs[not_missing_mask] + else: + x_train_cost = x_train + y_train_cost = costs + + if costs_on_log_scale: + transform = ChainedOutcomeTransform( + log=Log(), + standardize=Standardize(m=1), + ) + else: + transform = Standardize(m=1) + + cost_gp = make_default_single_obj_gp( + x_train_cost, + y_train_cost, + encoder=encoder, + y_transform=transform, + ) + fit_gpytorch_mll( + ExactMarginalLogLikelihood(likelihood=cost_gp.likelihood, model=cost_gp) + ) + acquisition = cost_cooled_acq( + acq_fn=acquisition, + model=cost_gp, + used_budget_percentage=cost_percentage_used, + ) + + _n = n_candidates_required if n_candidates_required is not None else 1 + + candidates, _scores = optimize_acq( + acquisition, + encoder, + n_candidates_required=_n, + num_restarts=num_restarts, + n_intial_start_points=n_initial_start_points, + acq_options=acq_options, + maximum_allowed_categorical_combinations=maximum_allowed_categorical_combinations, + ) + return candidates diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index ad167a8a..27411158 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -2,29 +2,21 @@ import math from collections.abc import Mapping -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any import torch from botorch.acquisition import LinearMCObjective from botorch.acquisition.logei import qLogNoisyExpectedImprovement -from botorch.fit import fit_gpytorch_mll -from botorch.models.transforms.outcome import ChainedOutcomeTransform, Log, Standardize -from gpytorch import ExactMarginalLogLikelihood from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.acquisition_functions.cost_cooling import ( - cost_cooled_acq, -) -from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( - pibo_acquisition, -) from neps.optimizers.bayesian_optimization.models.gp import ( + fit_and_acquire_from_gp, + encode_trials_for_gp, make_default_single_obj_gp, - optimize_acq, ) from neps.optimizers.intial_design import make_initial_design from neps.sampling import Prior -from neps.search_spaces.encoding import TensorEncoder +from neps.search_spaces.encoding import ConfigEncoder from neps.search_spaces.hyperparameters.categorical import CategoricalParameter if TYPE_CHECKING: @@ -34,62 +26,6 @@ from neps.state import BudgetInfo, Trial -def _missing_fill_strategy( - y: torch.Tensor, - strategy: Literal["mean", "worst", "3std", "nan"], - *, - lower_is_better: bool, -) -> torch.Tensor: - # Assumes minimization - if y.ndim != 1: - raise ValueError("Only supports single objective optimization for now!") - - match strategy: - case "nan": - return y - case "mean": - return torch.nan_to_num(y, nan=y.mean().item()) - case "worst": - worst = y.min() if lower_is_better else y.max() - return torch.nan_to_num(y, nan=worst.item()) - case "3std": - sign = 1 if lower_is_better else -1 - std = y.std() - return torch.nan_to_num(y, nan=y.mean().item() + sign * 3 * std.item()) - case _: - raise ValueError(f"Unknown strategy: 
{strategy}") - - -def _missing_y_strategy(y: torch.Tensor) -> torch.Tensor: - # TODO: Figure out what to do if there's no reported loss value. - # Some strategies: - # 1. Replace with NaN, in which case GPYtorch ignores it - # * Good if crash is random crash, in which case we do not wish to model - # a performance because of it. - # 2. Replace with worst value seen so far - # * Good if crash is systematic, in which case we wish to model it as - # basically, "don't go here" while remaining in the range of possible - # values for the GP. - # 3. Replace with mean - # * Same as above but keeps the optimization of the GP landscape - # smoother. Good if we have a mix of non-systematic and systematic - # crashed. Likely the safest option as GP will likely be unconfident in - # unsystematic crash cases, especially if it seems like a rare-event. - # Will also unlikely be a candidate region if systematic and we observe - # a few crashes there. However would take longer to learn of systematic - # crash regions. - return _missing_fill_strategy(y, strategy="mean", lower_is_better=True) - - -def _missing_cost_strategy(cost: torch.Tensor) -> torch.Tensor: - # TODO: Figure out what to do if there's no reported cost value - # Likely best to just fill in worst cost seen so far as this crash - # cost us a lot of time and we do not want to waste time on this - # region again. However if the crash was random, we might enter some - # issues. - return _missing_fill_strategy(cost, strategy="3std", lower_is_better=True) - - def _pibo_exp_term( n_sampled_already: int, ndims: int, @@ -120,13 +56,6 @@ def _pibo_exp_term( return math.exp(-n_bo_samples / ndims) -def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float: - if budget_info.max_cost_budget is not None: - return budget_info.used_cost_budget / budget_info.max_cost_budget - - raise ValueError("No cost budget provided!") - - class BayesianOptimization(BaseOptimizer): """Implements the basic BO loop.""" @@ -137,9 +66,10 @@ def __init__( # noqa: D417 initial_design_size: int | None = None, use_priors: bool = False, use_cost: bool = False, + cost_on_log_scale: bool = True, sample_default_first: bool = False, device: torch.device | None = None, - encoder: TensorEncoder | None = None, + encoder: ConfigEncoder | None = None, seed: int | None = None, budget: Any | None = None, # TODO: remove surrogate_model: Any | None = None, # TODO: remove @@ -163,6 +93,7 @@ def __init__( # noqa: D417 If using `cost`, cost must be provided in the reports of the trials. + cost_on_log_scale: Whether to use the log of the cost when using cost. sample_default_first: Whether to sample the default configuration first. seed: Seed to use for the random number generator of samplers. device: Device to use for the optimization. 
@@ -181,14 +112,16 @@ def __init__( # noqa: D417 **pipeline_space.numerical, **pipeline_space.categoricals, } - self.encoder = TensorEncoder.default(params) if encoder is None else encoder + self.encoder = encoder or ConfigEncoder.default(params) self.prior = Prior.from_parameters(params) if use_priors is True else None self.seed = seed self.use_cost = use_cost + self.use_priors = use_priors + self.cost_on_log_scale = cost_on_log_scale self.device = device self.sample_default_first = sample_default_first self.n_initial_design = initial_design_size - self.initial_design_: list[dict[str, Any]] | None = None + self.init_design: list[dict[str, Any]] | None = None def ask( self, @@ -202,13 +135,14 @@ def ask( "Seed is not yet implemented for BayesianOptimization" ) - n_trials_sampled = len(trials) - config_id = str(n_trials_sampled + 1) + n_sampled = len(trials) + config_id = str(n_sampled + 1) + space = self.pipeline_space # If we havn't passed the intial design phase - if self.initial_design_ is None: - self.initial_design_ = make_initial_design( - space=self.pipeline_space, + if self.init_design is None: + self.init_design = make_initial_design( + space=space, encoder=self.encoder, sample_default_first=self.sample_default_first, sampler=self.prior if self.prior is not None else "sobol", @@ -219,124 +153,53 @@ def ask( sample_fidelity="max", ) - if n_trials_sampled < len(self.initial_design_): - config = self.initial_design_[n_trials_sampled] - return SampledConfig(id=config_id, config=config) + if n_sampled < len(self.init_design): + return SampledConfig(id=config_id, config=self.init_design[n_sampled]) - # Now we actually do the BO loop, start by encoding the data - # TODO: Lift this into runtime, let the optimizer advertise the encoding wants... - x_configs: list[Mapping[str, Any]] = [] - ys: list[float] = [] - costs: list[float] = [] - pending: list[Mapping[str, Any]] = [] - for trial in trials.values(): - if trial.state.pending(): - pending.append(trial.config) - else: - assert trial.report is not None - x_configs.append(trial.config) - ys.append( - trial.report.loss if trial.report.loss is not None else torch.nan - ) - if self.use_cost: - cost_z_score = trial.report.cost - costs.append(cost_z_score if cost_z_score is not None else torch.nan) - - x = self.encoder.pack(x_configs, device=self.device) - maybe_x_pending_tensor = None - if len(pending) > 0: - x_pending = self.encoder.pack(pending, device=self.device) - maybe_x_pending_tensor = x_pending.tensor - - y = torch.tensor(ys, dtype=torch.float64, device=self.device) - y = _missing_y_strategy(y) - - # Now fit our model - y_model = make_default_single_obj_gp( - x, - y, - # TODO: We should consider applying some heurisitc to see if this should - # also include a log transform, similar as we do to cost if using `use_cost`. - y_transform=Standardize(m=1), + # Otherwise, we encode trials and setup to fit and acquire from a GP + data, encoder = encode_trials_for_gp( + trials, space, device=self.device, encoder=self.encoder ) - y_likelihood = y_model.likelihood - fit_gpytorch_mll( - ExactMarginalLogLikelihood(likelihood=y_likelihood, model=y_model) - ) - - # NOTE: We use: - # * q - allows accounting for pending points, normally used to get a batch - # of points. - # * log - More numerically stable - # * Noisy - In Deep-Learning, we shouldn't take f.min() incase it was a noise - # spike. This accounts for noise in objective. - # * ExpectedImprovement - Cause ya know, the default. 
- acq = qLogNoisyExpectedImprovement( - y_model, - X_baseline=x.tensor, - X_pending=maybe_x_pending_tensor, - # Unfortunatly, there's no option to indicate that we minimize - # the AcqFunction so we need to do some kind of transformation. - # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 - objective=LinearMCObjective(weights=torch.tensor([-1.0])), - ) + cost_percent = None + if self.use_cost: + if budget_info.max_cost_budget is None: + raise ValueError("Cost budget must be set if using cost") + cost_percent = budget_info.used_cost_budget / budget_info.max_cost_budget # If we should use the prior, weight the acquisition function by # the probability of it being sampled from the prior. + pibo_exp_term = None + prior = None if self.prior: pibo_exp_term = _pibo_exp_term( - n_trials_sampled, - self.encoder.ncols, - len(self.initial_design_), + n_sampled, encoder.ncols, len(self.init_design) ) + # If the exp term is insignificant, skip prior acq. weighting + prior = None if pibo_exp_term < 1e-4 else self.prior + + gp = make_default_single_obj_gp(x=data.x, y=data.y, encoder=encoder) + candidate = fit_and_acquire_from_gp( + gp=gp, + x_train=data.x, + y_train=data.y, + encoder=encoder, + acquisition=qLogNoisyExpectedImprovement( + model=gp, + X_baseline=data.x, + # Unfortunatly, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 + objective=LinearMCObjective(weights=torch.tensor([-1.0])), + X_pending=data.x_pending, + prune_baseline=True, + ), + prior=prior, + pibo_exp_term=pibo_exp_term, + costs=data.cost if self.use_cost else None, + cost_percentage_used=cost_percent, + costs_on_log_scale=self.cost_on_log_scale, + ) - # If the amount of weight derived from the pibo exponent becomes - # insignificant, we don't use it as it as it adds extra computational - # burden and introduces more chance of numerical instability. - significant_lower_bound = 1e-4 - if pibo_exp_term > significant_lower_bound: - acq = pibo_acquisition( - acq, - prior=self.prior, - prior_exponent=pibo_exp_term, - x_domain=self.encoder.domains, - X_pending=maybe_x_pending_tensor, - ) - - # If we should use cost, weight the acquisition function by the cost - # of the configurations. - if self.use_cost: - cost = torch.tensor(costs, dtype=torch.float64, device=self.device) - cost_z_score = _missing_cost_strategy(cost) - - cost_model = make_default_single_obj_gp( - x, - cost_z_score, - y_transform=ChainedOutcomeTransform( - # TODO: Maybe some way for a user to specify their cost - # is on a log scale? - log=Log(), - standardize=Standardize(m=1), - ), - ) - cost_likelihood = cost_model.likelihood - - # Optimize the cost model - fit_gpytorch_mll( - ExactMarginalLogLikelihood(likelihood=cost_likelihood, model=cost_model) - ) - acq = cost_cooled_acq( - acq_fn=acq, - model=cost_model, - used_budget_percentage=_cost_used_budget_percentage(budget_info), - X_pending=maybe_x_pending_tensor, - ) - - # Finally, optimize the acquisition function to get a configuration - candidates, _eis = optimize_acq(acq_fn=acq, encoder=self.encoder, acq_options={}) - - assert len(candidates) == 1, "Expected only one candidate!" 
- config = self.encoder.unpack(candidates)[0] - + config = encoder.decode(candidate)[0] return SampledConfig(id=config_id, config=config) diff --git a/neps/optimizers/intial_design.py b/neps/optimizers/intial_design.py index f2109f00..5993f68b 100644 --- a/neps/optimizers/intial_design.py +++ b/neps/optimizers/intial_design.py @@ -1,25 +1,24 @@ -from collections.abc import Sequence -from dataclasses import dataclass, field +from __future__ import annotations from typing import Literal, Any, Mapping from neps.sampling import Sampler from neps.sampling.priors import Prior -from neps.search_spaces.encoding import TensorEncoder +from neps.search_spaces.encoding import ConfigEncoder from neps.search_spaces.search_space import SearchSpace import torch def make_initial_design( space: SearchSpace, - encoder: TensorEncoder, + encoder: ConfigEncoder, sampler: Literal["sobol", "prior", "uniform"] | Sampler, sample_size: int | Literal["ndim"] | None = "ndim", sample_default_first: bool = True, sample_fidelity: ( Literal["min", "max", True] | int | float | dict[str, int | float] ) = True, - seed: int | None = None, + seed: torch.Generator | None = None, ) -> list[dict[str, Any]]: """Generate the initial design of the optimization process. @@ -50,7 +49,7 @@ def make_initial_design( When specified as a dictionary, the keys should be the names of the fidelity parameters and the values should be the target fidelities. If set to `True`, the configuration will have its fidelity randomly sampled. - seed: The seed to use for the random number generator of samplers. + seed: The seed to use for the random number generation. """ configs: list[dict[str, Any]] = [] @@ -124,7 +123,7 @@ def make_initial_design( seed=seed, ) uniq_x = torch.unique(encoded_configs, dim=0) - sample_configs = encoder.unpack(uniq_x[:sample_size]) + sample_configs = encoder.decode(uniq_x[:sample_size]) configs.extend([{**config, **fids} for config in sample_configs]) return configs diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 569e0ccf..59de10c1 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,15 +1,21 @@ +from functools import partial from typing import Any, Mapping, Literal import numpy as np import torch +import warnings from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig -from neps.optimizers.bayesian_optimization.models.ftpfn import FTPFNSurrogate +from neps.optimizers.bayesian_optimization.models.ftpfn import ( + FTPFNSurrogate, + acquire_next_from_ftpfn, + encode_trials_for_ftpfn, +) from neps.optimizers.intial_design import make_initial_design from neps.sampling.priors import Prior from neps.sampling.samplers import Sampler from neps.search_spaces.domain import Domain -from neps.search_spaces.encoding import CategoricalToUnitNorm, TensorEncoder +from neps.search_spaces.encoding import CategoricalToUnitNorm, ConfigEncoder from neps.search_spaces.search_space import FloatParameter, IntegerParameter, SearchSpace from neps.state.trial import Trial from neps.state.optimizer import BudgetInfo @@ -87,7 +93,7 @@ def _tokenize( def _encode_for_ftpfn( trials: Mapping[str, Trial], - encoder: TensorEncoder, + encoder: ConfigEncoder, space: SearchSpace, budget_domain: Domain, device: torch.device | None = None, @@ -196,36 +202,6 @@ def _keep_highest_budget_evaluation(x: torch.Tensor) -> torch.Tensor: return sorted_x[ii] -def _acquire_pfn( - train_x: torch.Tensor, - train_y: torch.Tensor, - test_x: torch.Tensor, - ftpfn: 
FTPFNSurrogate, - y_to_beat: float, - how: Literal["pi", "ei", "ucb", "lcb"], -) -> torch.Tensor: - match how: - case "pi": - y_best = torch.full( - size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE - ) - return ftpfn.get_pi(train_x, train_y, test_x, y_best=y_best) - case "ei": - y_best = torch.full( - size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE - ) - return ftpfn.get_ei(train_x, train_y, test_x, y_best=y_best) - case "ucb": - y_best = torch.full( - size=(len(test_x),), fill_value=y_to_beat, dtype=FTPFN_DTYPE - ) - return ftpfn.get_ucb(train_x, train_y, test_x) - case "lcb": - return ftpfn.get_lcb(train_x, train_y, test_x) - case _: - raise ValueError(f"Unknown acquisition function {how}") - - class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" @@ -284,7 +260,7 @@ def __init__( params = {**space.numerical, **space.categoricals} self._prior = Prior.from_parameters(params) if use_priors else None - self._ftpfn_encoder: TensorEncoder = TensorEncoder.default( + self._config_encoder: ConfigEncoder = ConfigEncoder.default( params, # FTPFN doesn't support categoricals and we were recomenned to just evenly distribute # in the unit norm @@ -293,6 +269,8 @@ def __init__( for cat_name, cat in space.categoricals.items() }, ) + self._border_sampler = Sampler.borders(len(params)) + self._cached_border_configs: torch.Tensor | None = None # Domain of fidelity values, i.e. what is given in the configs that we # give to the user to evaluate at. @@ -309,7 +287,7 @@ def ask( trials: Mapping[str, Trial], budget_info: BudgetInfo, optimizer_state: dict[str, Any], - seed: int | None = None, + seed: torch.Generator | None = None, ) -> SampledConfig: if seed is not None: raise NotImplementedError("Seed is not yet implemented for IFBO") @@ -321,7 +299,7 @@ def ask( if self._initial_design is None: self._initial_design = make_initial_design( space=self.pipeline_space, - encoder=self._ftpfn_encoder, + encoder=self._config_encoder, sample_default_first=self.sample_default_first, sampler="sobol" if self._prior is None else self._prior, seed=seed, @@ -333,128 +311,116 @@ def ask( return SampledConfig(id=f"{new_id}_0", config=self._initial_design[new_id]) # Otherwise, we proceed to surrogate phase - ftpfn = FTPFNSurrogate( - target_path=self.surrogate_model_args.get("target_path", None), - version=self.surrogate_model_args.get("version", "0.0.1"), - device=self.device, - ) - x_train, maximize_ys = _encode_for_ftpfn( + data = encode_trials_for_ftpfn( trials=trials, - encoder=self._ftpfn_encoder, space=self.pipeline_space, + encoder=self._config_encoder, budget_domain=self._budget_domain, device=self.device, ) - # PFN uses `0` id for test configurations, we remove this later - x_train[:, ID_COL] = x_train[:, ID_COL] + 1 - - # Fantasize the result of pending trials - is_pending = maximize_ys.isnan() - maximize_ys[is_pending] = ftpfn.get_mean_performance( - train_x=x_train[~is_pending], - train_y=maximize_ys[~is_pending], - test_x=x_train[is_pending], - ) - # We then sample a horizon, minimum one budget index increment and cast - # to the budget domain expected by the ftpfn model - rng = np.random.RandomState(seed) - lower_index = self._budget_ix_domain.lower - upper_index = self._budget_ix_domain.upper - horizon = self._budget_domain.cast_one( - rng.randint(lower_index, upper_index) + 1, - frm=self._budget_ix_domain, - ) + # TODO: Very little chance mfpi_random is best but for now it's stable + def _mfpi_random( + _X: torch.Tensor, + _y: 
torch.Tensor, + _acq_samples: torch.Tensor, + _ftpfn: FTPFNSurrogate, + how: Literal["pi", "ei"], + ) -> torch.Tensor: + rng = np.random.RandomState(None if seed is None else seed + len(trials)) + _low = self._budget_ix_domain.lower + _high = self._budget_ix_domain.upper + horizon_index = rng.randint(_low, _high) + 1 + horizon = self._budget_domain.cast_one( + horizon_index, frm=self._budget_ix_domain + ) + f_best = _y.max().item() + r = rng.uniform(-4, -1) + threshold = f_best + (10**r) * (1 - f_best) + + # NOTE: If converting f_inc to be seperate per acq sample, you + # need to add an extra batch dimension to y_best, i.e. (n, 1) + # Budget column is between 0 and 1, but we want to add the horizon + BUDGET_COL = 1 + _acq_samples[:, BUDGET_COL] += horizon + _acq_samples[:, BUDGET_COL] = torch.clamp( + _acq_samples[:, BUDGET_COL], max=self._budget_domain.upper + ) - # Now we sample some new configurations into the domain expected by the FTPFN - if self._prior is not None: - acq_sampler = self._prior - else: - acq_sampler = Sampler.uniform(ndim=self._ftpfn_encoder.ncols) + match how: + case "pi": + return _ftpfn.get_pi(_X, _y, _acq_samples, y_best=threshold) + case "ei": + return _ftpfn.get_ei(_X, _y, _acq_samples, y_best=threshold) + case _: + raise ValueError(f"Unknown acquisition strategy: {how=}") + + ndims = self._config_encoder.ncols - new_acq_configs = acq_sampler.sample( + # Sample some configurations at uniform for acq. + uniform_sampler = Sampler.uniform(ndim=ndims) + uniform_configs = uniform_sampler.sample( self.n_acquisition_new_configs, - to=self._ftpfn_encoder.domains, + to=self._config_encoder.domains, + seed=seed, device=self.device, - seed=None, # TODO - ) - acq_new = _tokenize( - ids=torch.zeros(self.n_acquisition_new_configs, device=self.device), - budgets=torch.full( - size=(self.n_acquisition_new_configs,), - fill_value=self._budget_domain.lower, - device=self.device, - ), - configs=new_acq_configs, + dtype=FTPFN_DTYPE, ) - # Construct all our samples for acqusition: - # 1. Take all non-pending configs - acq_existing = x_train[~is_pending].clone().detach() - - # 2. We only want to include the configuration at their highest - # budget evaluated, i.e. don't include config_0_0 if config_0_1 is highest - acq_existing = _keep_highest_budget_evaluation(acq_existing) - - # 3. Sub select all that are not fully evaluated - acq_existing = acq_existing[ - acq_existing[:, BUDGET_COL] < self._budget_domain.upper - ] - - # 4. Add in the new sampled configurations - acq_samples = torch.vstack([acq_existing, acq_new]) - - # 5. Add on the horizon to the budget - unclamped_budgets = acq_samples[:, BUDGET_COL] + horizon - - # 6. Clamp to the maximum of the budget domain - acq_samples[:, BUDGET_COL] = torch.clamp( - unclamped_budgets, max=self._budget_domain.upper - ) + # Also sample some border configurations for acq. 
+ # OPTIM: If we are below the amount possible, there is no randomness and we can cache them + border_sampler = Sampler.borders(ndim=ndims) + N_border = 2**9 # 512, if we go over, we subselect 512 border configs + if N_border <= border_sampler.n_possible: + if self._cached_border_configs is not None: + border_configs = self._cached_border_configs + else: + self._cached_border_configs = border_sampler.sample( + n=N_border, + to=self._config_encoder.domains, + seed=seed, + device=self.device, + dtype=FTPFN_DTYPE, + ) + border_configs = self._cached_border_configs + else: + border_configs = border_sampler.sample( + n=N_border, + to=self._config_encoder.domains, + seed=seed, + device=self.device, + dtype=FTPFN_DTYPE, + ) - # Now get the PI of these samples according to MFPI_Random - maximize_best_y = maximize_ys.max().item() - lu = 10 ** rng.uniform(-4, -1) - f_inc = maximize_best_y * (1 - lu) - - acq_scores = _acquire_pfn( - train_x=x_train, - train_y=maximize_ys[~is_pending], - test_x=acq_samples, - ftpfn=ftpfn, - y_to_beat=f_inc, - how="pi", + id, current_fid, config = acquire_next_from_ftpfn( + ftpfn=FTPFNSurrogate( + target_path=self.surrogate_model_args.get("target_path", None), + version=self.surrogate_model_args.get("version", "0.0.1"), + device=self.device, + ), + data=data, + seed=seed, + encoder=self._config_encoder, + budget_domain=self._budget_domain, + fidelity_domain=self._fid_domain, + extra_acq_samples=torch.cat([uniform_configs, border_configs], dim=0), + acq_strategy=partial(_mfpi_random, how="ei"), ) - - # Extract out the row which had the best PI - best_ix = acq_scores.argmax() - best_id = int(acq_samples[best_ix, ID_COL].round().item()) - best_vector = acq_samples[best_ix, 2:].unsqueeze(0) - best_config = self._ftpfn_encoder.unpack(best_vector)[0] - - if best_id == 0: - # A newly sampled configuration was deemed more promising - config_id = f"{new_id}_0" - best_config[self._fidelity_name] = self._min_budget - previous_config_id = None - return SampledConfig(config_id, best_config, previous_config_id) - - # To get to the next fidelity value to provide, - # 1. Get the budget before we added the horizon - budget = float(unclamped_budgets[best_ix] - horizon) - - # 2. Cast to budget index domain - budget_ix = self._budget_ix_domain.cast_one(budget, frm=self._budget_domain) - - # 3. Increment it to the next budget index - budget_ix += 1 - - # 4. 
And finally convert back into the fidelity domain - fid_value = self._fid_domain.cast_one(budget_ix, frm=self._budget_ix_domain) - - real_best_id = best_id - 1 # NOTE: Remove the +1 we added to all ids earlier - best_config[self._fidelity_name] = fid_value - - config_id = f"{real_best_id}_{budget_ix}" - previous_config_id = f"{real_best_id}_{budget_ix - 1}" - return SampledConfig(config_id, best_config, previous_config_id) + if current_fid is None: + assert id is None + config[self._fidelity_name] = self._fid_domain.lower + return SampledConfig(id=f"{new_id}_0", config=config) + else: + current_budget_ix = self._budget_ix_domain.cast_one( + current_fid, frm=self._fid_domain + ) + next_budget_ix = current_budget_ix + 1 + next_fid = self._fid_domain.cast_one( + next_budget_ix, frm=self._budget_ix_domain + ) + config[self._fidelity_name] = next_fid + return SampledConfig( + id=f"{id}_{next_budget_ix}", + config=config, + previous_config_id=f"{id}_{current_budget_ix}", + ) diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index 58d75387..bc35f300 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -3,25 +3,39 @@ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Mapping +from botorch.acquisition import ( + AcquisitionFunction, + LinearMCObjective, + qLogNoisyExpectedImprovement, +) +from botorch.acquisition.analytic import SingleTaskGP +from botorch.fit import fit_gpytorch_mll +from gpytorch import ExactMarginalLogLikelihood import numpy as np import pandas as pd import torch from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping +from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( + pibo_acquisition, +) from neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( DecayingPriorWeightedAcquisition, ) from neps.optimizers.bayesian_optimization.acquisition_samplers import ( AcquisitionSamplerMapping, ) +from neps.optimizers.bayesian_optimization.models.gp import make_default_single_obj_gp from neps.optimizers.multi_fidelity_prior.utils import ( compute_config_dist, custom_crossover, local_mutation, update_fidelity, ) +from neps.sampling.priors import Prior +from neps.search_spaces.encoding import ConfigEncoder from neps.utils.common import instance_from_map if TYPE_CHECKING: @@ -273,67 +287,68 @@ class ModelPolicy(SamplingPolicy): def __init__( self, pipeline_space: SearchSpace, - surrogate_model: str | Any = "gp", - surrogate_model_args: dict | None = None, - acquisition: str | BaseAcquisition = "EI", - log_prior_weighted: bool = False, - acquisition_sampler: str | AcquisitionSampler = "random", - patience: int = 100, - logger=None, + prior: Prior | None = None, + use_cost: bool = False, + device: torch.device | None = None, ): - super().__init__(pipeline_space=pipeline_space, logger=logger) - - surrogate_model_args = surrogate_model_args or {} - self.surrogate_model = instance_from_map( - SurrogateModelMapping, - surrogate_model, - name="surrogate model", - kwargs=surrogate_model_args, + if prior: + raise NotImplementedError("Priors are not implemented yet.") + if use_cost: + raise NotImplementedError("Cost is not implemented yet.") + + super().__init__(pipeline_space=pipeline_space) + self.device = device + self.prior = prior + self._encoder = ConfigEncoder.default( + 
{**pipeline_space.numerical, **pipeline_space.categoricals} ) + self._model: SingleTaskGP | None = None + self._acq: AcquisitionFunction | None = None - self.acquisition = instance_from_map( - AcquisitionMapping, - acquisition, - name="acquisition function", - ) + def update_model( + self, + train_x: list[SearchSpace], + train_y: list[float], + pending_x: list[SearchSpace], + decay_t: float | None = None, + ): + x_train = self._encoder.encode([config.hp_values() for config in train_x]) + x_pending = self._encoder.encode([config.hp_values() for config in pending_x]) + y_train = torch.tensor(train_y, dtype=torch.float64, device=self.device) + + y_model = make_default_single_obj_gp(x_train, y_train, encoder=self._encoder) - # TODO: Enable only when a flag exists to toggle prior-based decaying of AF - # if pipeline_space.has_prior: - # self.acquisition = DecayingPriorWeightedAcquisition( - # self.acquisition, log=log_prior_weighted - # ) - - self.acquisition_sampler = instance_from_map( - AcquisitionSamplerMapping, - acquisition_sampler, - name="acquisition sampler function", - kwargs={"patience": patience, "pipeline_space": pipeline_space}, + fit_gpytorch_mll( + ExactMarginalLogLikelihood(likelihood=y_model.likelihood, model=y_model), + ) + acq = qLogNoisyExpectedImprovement( + y_model, + X_baseline=x_train, + X_pending=x_pending, + # Unfortunately, there's no option to indicate that we minimize + # the AcqFunction so we need to do some kind of transformation. + # https://github.com/pytorch/botorch/issues/2316#issuecomment-2085964607 + objective=LinearMCObjective(weights=torch.tensor([-1.0])), ) - self.sampling_args: dict = {} - - def _fantasize_pending(self, train_x, train_y, pending_x): - if len(pending_x) == 0: - return train_x, train_y - # fit model on finished evaluations - self.surrogate_model.fit(train_x, train_y) - # hallucinating: predict for the pending evaluations - _y, _ = self.surrogate_model.predict(pending_x) - _y = _y.detach().numpy().tolist() - # appending to training data - train_x.extend(pending_x) - train_y.extend(_y) - return train_x, train_y - - def update_model(self, train_x, train_y, pending_x, decay_t=None): - if decay_t is None: - decay_t = len(train_x) - train_x, train_y = self._fantasize_pending(train_x, train_y, pending_x) - self.surrogate_model.fit(train_x, train_y) - self.acquisition.set_state(self.surrogate_model, decay_t=decay_t) - # TODO: set_state should generalize to all options - # no needed to set state of sampler when using `random` - # self.acquisition_sampler.set_state(x=train_x, y=train_y) + # If we have a prior, wrap the above acquisition with a prior weighting + if self.prior is not None: + assert decay_t is not None + # TODO: Ideally we have something based on budget and dimensions, not an arbitrary term + # This 10 is extracted from the old DecayingWeightedPrior + pibo_exp_term = 10 / decay_t + significant_lower_bound = 1e-4 # Below this, the prior weighting has no significant impact + if pibo_exp_term > significant_lower_bound: + acq = pibo_acquisition( + acq, + prior=self.prior, + prior_exponent=pibo_exp_term, + x_domain=self._encoder.domains, + x_pending=x_pending, + ) + + self._y_model = y_model + self._acq = acq def sample( self, @@ -354,8 +369,6 @@ def sample( variable set to the same value. This value is same as that of the fidelity value of the configs in the training data. 
""" - self.logger.info("Acquiring...") - # sampling random configurations samples = [ self.pipeline_space.sample(user_priors=False, ignore_fidelity=True) diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 62c81ed8..fc27bb6b 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -375,8 +375,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: if seed is not None: raise NotImplementedError("Seeding is not yet implemented.") @@ -388,11 +389,11 @@ def sample( ) _n = torch.Size((n,)) if isinstance(n, int) else n - out = torch.empty(_out_shape, device=device, dtype=torch.float64) + out = torch.empty(_out_shape, device=device, dtype=dtype) for i, dist in enumerate(self.distributions): out[..., i] = dist.distribution.sample(_n) - return Domain.translate(out, frm=self._distribution_domains, to=to) + return Domain.translate(out, frm=self._distribution_domains, to=to, dtype=dtype) @dataclass @@ -422,8 +423,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: if seed is not None: raise NotImplementedError("Seeding is not yet implemented.") @@ -433,8 +435,8 @@ def sample( if isinstance(n, int) else torch.Size((*n, self.ndims)) ) - samples = torch.rand(_n, device=device, dtype=torch.float64) - return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to) + samples = torch.rand(_n, device=device, dtype=dtype) + return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to, dtype=dtype) @dataclass @@ -485,7 +487,10 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: - return self._weighted_sampler.sample(n, to=to, seed=seed, device=device) + return self._weighted_sampler.sample( + n, to=to, seed=seed, device=device, dtype=dtype + ) diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index 64105534..c5c76b8e 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -34,8 +34,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: """Sample `n` points and convert them to the given domain. @@ -47,7 +48,7 @@ def sample( to: If a single domain, `.ncols` columns will be produced form that one domain. If a list of domains, then it must have the same length as the number of columns, with each column being in the corresponding domain. - seed: The seed for the random number generator. + seed: The seed generator device: The device to cast the samples to. Returns: @@ -82,6 +83,18 @@ def uniform(cls, ndim: int) -> UniformPrior: return UniformPrior(ndims=ndim) + @classmethod + def borders(cls, ndim: int) -> BorderSampler: + """Create a border sampler. + + Args: + ndim: The number of dimensions to sample. + + Returns: + A border sampler. 
+ """ + return BorderSampler(ndim=ndim) + # Technically this could be a prior with a uniform distribution @dataclass @@ -112,8 +125,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: if seed is not None: raise NotImplementedError("Setting the seed is not supported yet") @@ -123,14 +137,15 @@ def sample( # and reshape the output tensor to the desired shape, if needed. _n = n if isinstance(n, int) else reduce(lambda x, y: x * y, n) + _seed = ( + None if seed is None else torch.randint(0, 2**31, (1,), generator=seed).item() + ) sobol = torch.quasirandom.SobolEngine( - dimension=self.ndim, - scramble=self.scramble, - seed=seed, + dimension=self.ndim, scramble=self.scramble, seed=_seed ) - out = torch.empty(_n, self.ncols, dtype=torch.float64, device=device) - x = sobol.draw(_n, dtype=torch.float64, out=out) + out = torch.empty(_n, self.ncols, dtype=dtype, device=device) + x = sobol.draw(_n, dtype=dtype, out=out) # If we got extra dimensions, such as batch dimensions, we need to # reshape the tensor to the desired shape. @@ -185,8 +200,9 @@ def sample( n: int | torch.Size, *, to: Domain | list[Domain], - seed: int | None = None, + seed: torch.Generator | None = None, device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: if seed is not None: raise NotImplementedError("Seeding is not yet implemented.") @@ -205,12 +221,15 @@ def sample( self.probabilities, total_samples, replacement=True, + generator=seed, out=chosen_samplers, ) # Create an empty tensor to hold all samples output_samples = torch.empty( - (total_samples, self.ncols), device=device, dtype=torch.float64 + (total_samples, self.ncols), + device=device, + dtype=dtype, ) # Loop through each sampler and its associated indices @@ -221,10 +240,73 @@ def sample( if len(indices) > 0: # Sample from the sampler for the required number of indices - samples_from_sampler = sampler.sample(len(indices), to=to, device=device) + samples_from_sampler = sampler.sample( + len(indices), + to=to, + seed=seed, + device=device, + dtype=dtype, + ) output_samples[indices] = samples_from_sampler # Reshape to the output shape including ncols dimension output_samples = output_samples.view(output_shape) return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) + + +@dataclass +class BorderSampler(Sampler): + """A sampler that samples from the border of a hypercube.""" + + ndim: int + + @property + @override + def ncols(self) -> int: + return self.ndim + + @property + def n_possible(self) -> int: + """The amount of possible border configurations.""" + return 2**self.ndim + + @override + def sample( + self, + n: int | torch.Size, + *, + to: Domain | list[Domain], + seed: torch.Generator | None = None, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ) -> torch.Tensor: + _arange = torch.arange(self.n_possible, device=device, dtype=torch.int32) + # Calculate the total number of samples required + if isinstance(n, int): + total_samples = min(n, self.n_possible) + output_shape = (total_samples, self.ncols) + else: + total_samples = reduce(lambda x, y: x * y, n) + if total_samples > self.n_possible: + raise ValueError( + f"The shape of samples requested (={n}) is more than the number of " + f"possible border configurations (={self.n_possible})." 
+ ) + output_shape = (*n, self.ncols) + + if self.n_possible <= total_samples: + configs = _arange + else: + # Otherwise, we take a random sample of the 2**n possible border configs + rand_ix = torch.randperm(self.n_possible, generator=seed, device=device)[ + :total_samples + ] + configs = _arange[rand_ix] + + # https://stackoverflow.com/a/63546308/5332072 + bit_masks = 2 ** _arange[: self.ndim] + configs = configs.unsqueeze(1).bitwise_and(bit_masks).ne(0).to(dtype) + # Reshape to the output shape including ncols dimension + configs = configs.view(output_shape) + return Domain.translate(configs, frm=UNIT_FLOAT_DOMAIN, to=to) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index a5151aab..3ef203f1 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -89,19 +89,18 @@ class Domain(Generic[V]): value. """ - dtype: torch.dtype = field(init=False, repr=False) is_unit_float: bool = field(init=False, repr=False) midpoint: V = field(init=False, repr=False) is_log: bool = field(init=False, repr=False) length: V = field(init=False, repr=False) cardinality: int | None = field(init=False, repr=False) bounds: tuple[V, V] = field(init=False, repr=False) + preffered_dtype: torch.dtype = field(init=False, repr=False) def __post_init__(self): assert isinstance(self.lower, type(self.upper)) is_int = isinstance(self.lower, int) object.__setattr__(self, "is_log", self.log_bounds is not None) - object.__setattr__(self, "dtype", torch.int64 if is_int else torch.float64) object.__setattr__( self, "is_unit_float", @@ -116,10 +115,14 @@ def __post_init__(self): else: cardinality = None - object.__setattr__(self, "cardinality", cardinality) + preferred_dtype = torch.int64 if is_int else torch.float64 + object.__setattr__(self, "preffered_dtype", preferred_dtype) + mid = self.from_unit(torch.tensor(0.5)).item() - if self.dtype == torch.int64: + if is_int: mid = int(round(mid)) + + object.__setattr__(self, "cardinality", cardinality) object.__setattr__(self, "midpoint", mid) object.__setattr__(self, "bounds", (self.lower, self.upper)) @@ -203,17 +206,23 @@ def indices(cls, n: int) -> Domain[int]: """ return Domain.int(0, n - 1) - def to_unit(self, x: Tensor) -> Tensor: + def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """Transform a tensor of values from this domain to the unit interval [0, 1]. Args: x: Tensor of values in this domain to convert. + dtype: The dtype to convert to Returns: Same shape tensor with the values normalized to the unit interval [0, 1]. """ + if dtype is None: + dtype = torch.float64 + else: + assert dtype.is_floating_point, "Unit interval is only for floats." + if self.is_unit_float: - return x + return x.to(dtype) if self.log_bounds is not None: x = torch.log(x) @@ -221,19 +230,22 @@ def to_unit(self, x: Tensor) -> Tensor: else: lower, upper = self.lower, self.upper - return (x - lower) / (upper - lower) + x = (x - lower) / (upper - lower) + return x.type(dtype) - def from_unit(self, x: Tensor) -> Tensor: + def from_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """Transform a tensor of values from the unit interval [0, 1] to this domain. Args: x: A tensor of values in the unit interval [0, 1] to convert. + dtype: The dtype to convert to Returns: Same shape tensor with the values lifted into this domain. 
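
        Example:
            A small illustration (values chosen to be exact; `Domain.int` is the
            integer constructor used by `Domain.indices` above):

                d = Domain.int(2, 10)
                d.from_unit(torch.tensor([0.0, 0.5, 1.0]))
                # -> tensor([ 2,  6, 10])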
""" + dtype = dtype or self.preffered_dtype if self.is_unit_float: - return x + return x.to(dtype) bins = self.bins if bins is not None: @@ -252,9 +264,9 @@ def from_unit(self, x: Tensor) -> Tensor: if self.round: x = torch.round(x) - return x.type(self.dtype) + return x.type(dtype) - def cast(self, x: Tensor, frm: Domain) -> Tensor: + def cast(self, x: Tensor, frm: Domain, *, dtype: torch.dtype | None = None) -> Tensor: """Cast a tensor of values frm the domain `frm` to this domain. If you need to cast a tensor of mixed domains, use @@ -263,10 +275,12 @@ def cast(self, x: Tensor, frm: Domain) -> Tensor: Args: x: Tensor of values in the `frm` domain to cast to this domain. frm: The domain to cast from. + dtype: The dtype to convert to Returns: Same shape tensor with the values cast to this domain. """ + dtype = dtype or self.preffered_dtype # NOTE: In general, we should always be able to go through the unit interval # [0, 1] to be able to transform between domains. However sometimes we can # bypass some steps, dependant on the domains, hence the ugliness... @@ -281,12 +295,12 @@ def cast(self, x: Tensor, frm: Domain) -> Tensor: if same_bounds and same_log_bounds and (self.bins is None or same_bins): if self.round: x = torch.round(x) - return x.type(self.dtype) if x.dtype != self.dtype else x + return x.type(dtype) # Shortcut 2. (From normalized) # The domain we are coming from is already normalized, we only need to lift if frm.is_unit_float: - return self.from_unit(x) # type: ignore + return self.from_unit(x, dtype=dtype) # type: ignore # Shortcut 3. (Log lift) # We can also shortcut out if the only diffrence is that we are coming frm the @@ -296,11 +310,10 @@ def cast(self, x: Tensor, frm: Domain) -> Tensor: x = torch.exp(x) if self.round: x = torch.round(x) - return x.type(self.dtype) + return x.type(dtype) # Otherwise, through the unit interval we go - norm = frm.to_unit(x) - lift = self.from_unit(norm) + lift = self.from_unit(frm.to_unit(x), dtype=dtype) return lift # noqa: RET504 @classmethod @@ -314,6 +327,8 @@ def translate( x: Tensor, frm: Domain | Iterable[Domain], to: Domain | Iterable[Domain], + *, + dtype: torch.dtype | None = None, ) -> Tensor: """Cast a tensor of mixed domains to a new set of mixed domains. @@ -326,6 +341,7 @@ def translate( to: List of domains to cast to. If list, must be length as `n_dims`, otherwise we assume the single domain provided is the one to be used across all dimensions. + dtype: The dtype of the converted tensor Returns: Tensor of the same shape as `x` with the last dimension casted @@ -341,7 +357,7 @@ def translate( # If both are not a list, we can just cast the whole tensor if isinstance(frm, Domain) and isinstance(to, Domain): - return to.cast(x, frm=frm) + return to.cast(x, frm=frm, dtype=dtype) frm = [frm] * ndims if isinstance(frm, Domain) else list(frm) to = [to] * ndims if isinstance(to, Domain) else list(to) @@ -360,9 +376,9 @@ def translate( f" Expected {ndims} from last dimension of {x.shape=}, got {len(to)}." 
) - out = torch.empty_like(x) + out = torch.empty_like(x, dtype=dtype) for i, (f, t) in enumerate(zip(frm, to, strict=False)): - out[..., i] = t.cast(x[..., i], frm=f) + out[..., i] = t.cast(x[..., i], frm=f, dtype=dtype) return out diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index eef1b25b..d47c5363 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -11,10 +11,7 @@ ) from typing_extensions import Protocol, override -import numpy as np -import numpy.typing as npt import torch -from grakel.utils import graph_from_networkx from neps.search_spaces.domain import ( UNIT_FLOAT_DOMAIN, @@ -25,10 +22,7 @@ from neps.search_spaces.hyperparameters.integer import IntegerParameter if TYPE_CHECKING: - import networkx as nx - from neps.search_spaces.parameter import Parameter - from neps.search_spaces.search_space import SearchSpace WLInput: TypeAlias = tuple[dict, dict | None, dict | None] V = TypeVar("V", int, float) @@ -183,50 +177,7 @@ def decode(self, x: torch.Tensor) -> list[V]: @dataclass -class WLInputTransformer(Transformer[WLInput]): - hp: str - - def encode(self, x: Sequence[nx.Graph]) -> list[WLInput]: - return [graph_from_networkx(g) for g in x] # type: ignore - - def decode(self, x: Mapping[str, Sequence[WLInput]]) -> dict[str, list[Any]]: - raise NotImplementedError("Cannot decode WLInput to values.") - - -@dataclass -class GraphEncoder: - transformers: dict[str, WLInputTransformer] - column_lookup: dict[str, int] = field(init=False) - - def __post_init__(self): - transformers = sorted(self.transformers.items(), key=lambda t: t[0]) - self.transformers = dict(transformers) - self.column_lookup: dict[str, int] = { - name: i for i, (name, _) in enumerate(self.transformers.items()) - } - - def select( - self, x: npt.NDArray[np.object_], hp: str | Sequence[str] - ) -> npt.NDArray[np.object_]: - # Kind of a redundant function but made to be compatible with TensorPack - if isinstance(hp, str): - return x[:, self.column_lookup[hp]] - - return x[:, [self.column_lookup[h] for h in hp]] - - def encode(self, x: Sequence[Any]) -> npt.NDArray[np.object_]: - buffer = np.empty((len(x), len(self.transformers)), dtype=np.object_) - for hp, transformer in self.transformers.items(): - values = [conf[hp] for conf in x] - buffer[:, self.column_lookup[hp]] = transformer.encode(values) # type: ignore - return buffer - - def decode_dicts(self, x: npt.NDArray[np.object_]) -> list[dict[str, Any]]: - raise NotImplementedError("Cannot decode graph embeddings.") - - -@dataclass -class TensorEncoder: +class ConfigEncoder: transformers: dict[str, TensorTransformer] index_of: dict[str, int] = field(init=False) domain_of: dict[str, Domain] = field(init=False) @@ -290,15 +241,7 @@ def encode( return buffer - def pack( - self, - x: Sequence[Mapping[str, Any]], - *, - device: torch.device | None = None, - ) -> TensorPack: - return TensorPack(self.encode(x, device=device), self) - - def unpack(self, x: torch.Tensor) -> list[dict[str, Any]]: + def decode(self, x: torch.Tensor) -> list[dict[str, Any]]: values: dict[str, list[Any]] = {} for hp_name, transformer in self.transformers.items(): lookup = self.index_of[hp_name] @@ -317,7 +260,7 @@ def default( parameters: Mapping[str, Parameter], *, custom_transformers: dict[str, TensorTransformer] | None = None, - ) -> TensorEncoder: + ) -> ConfigEncoder: custom = custom_transformers or {} sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} @@ -334,59 +277,13 @@ def default( case 
_: raise ValueError(f"Unsupported parameter type: {type(hp)}") - return TensorEncoder(transformers) + return ConfigEncoder(transformers) @dataclass -class TensorPack: - tensor: torch.Tensor - encoder: TensorEncoder - - def __len__(self) -> int: - return len(self.tensor) - - @property - def n_numerical(self) -> int: - return self.encoder.n_numerical - - @property - def n_categorical(self) -> int: - return self.encoder.n_categorical - - @property - def ncols(self) -> int: - return self.encoder.ncols - - @property - def domains(self) -> dict[str, Domain]: - return self.encoder.domains +class EncodedPending: + """Tensor data of pending configurations.""" - def select(self, hp: str | Sequence[str]) -> torch.Tensor | npt.NDArray[np.object_]: - return self.encoder.select(self.tensor, hp) - - def names(self) -> list[str]: - return self.encoder.names() - - def to_dicts(self) -> list[dict[str, Any]]: - return self.encoder.unpack(self.tensor) - - def split(self, index: int) -> tuple[TensorPack, TensorPack]: - left = TensorPack(self.encoder, tensor=self.tensor[:index]) - right = TensorPack(self.encoder, tensor=self.tensor[index:]) - return left, right - - def join(self, *other: TensorPack) -> TensorPack: - assert all(o.encoder == self.encoder for o in other) - - numerical = torch.cat([self.tensor, *[o.tensor for o in other]], dim=0) - return TensorPack(self.encoder, tensor=numerical) - - @classmethod - def default_encoding( - cls, - x: Sequence[Mapping[str, Any]], - space: SearchSpace, - ) -> TensorPack: - default_encoder = TensorEncoder.default(space) - tensor = default_encoder.encode(x) - return TensorPack(default_encoder, tensor) + ids: torch.Tensor + x: torch.Tensor + fid: torch.Tensor | None From 967e679bb8d17b6db91a301f7a330e020ffc52a3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 2 Oct 2024 17:13:42 +0200 Subject: [PATCH 55/63] refactor(ifbo): Better acq function optimization --- .../bayesian_optimization/models/ftpfn.py | 246 +++++++-------- .../bayesian_optimization/optimizer.py | 4 +- neps/optimizers/intial_design.py | 8 +- neps/optimizers/multi_fidelity/ifbo.py | 297 +++++------------- neps/sampling/priors.py | 120 +++---- neps/sampling/samplers.py | 17 +- neps/search_spaces/domain.py | 63 +++- neps/search_spaces/encoding.py | 41 +-- 8 files changed, 320 insertions(+), 476 deletions(-) diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 6c6464a6..4df2dfb8 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -1,16 +1,14 @@ from __future__ import annotations from collections.abc import Callable, Mapping -from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal - +from typing import Any import torch from ifbo import FTPFN from neps.sampling.samplers import Sampler from neps.search_spaces.domain import Domain -from neps.search_spaces.encoding import CategoricalToUnitNorm, ConfigEncoder +from neps.search_spaces.encoding import ConfigEncoder from neps.search_spaces.search_space import SearchSpace from neps.state.trial import Trial @@ -101,7 +99,7 @@ def _cast_tensor_shapes(x: torch.Tensor) -> torch.Tensor: FTPFN_DTYPE = torch.float32 -def encode_trials_for_ftpfn( +def encode_ftpfn( trials: Mapping[str, Trial], space: SearchSpace, budget_domain: Domain, @@ -110,16 +108,14 @@ def encode_trials_for_ftpfn( device: torch.device | None = None, dtype: torch.dtype = FTPFN_DTYPE, error_value: float = 0.0, -) -> 
FTPFNData: + pending_value: float = torch.nan, +) -> tuple[torch.Tensor, torch.Tensor]: """Encode the trials into a format that the FTPFN model can understand. !!! warning "Pending trials" - For trials which do not have a loss reported yet, they are considered pending - and will have `torch.nan` as their score inside the returned y values. - If using - [`acquire_next_from_ftpfn()`][neps.optimizers.bayesian_optimization.models.ftpfn.acquire_next_from_ftpfn], - the result of these configurations will be fantasized. + For trials which do not have a loss reported yet, they are considered pending. + By default this is torch.nan and we recommend fantasizing these values. !!! warning "Error values" @@ -144,7 +140,9 @@ def encode_trials_for_ftpfn( assert space.fidelity_name is not None assert space.fidelity is not None assert 0 <= error_value <= 1 - train_configs = encoder.encode([t.config for t in selected.values()], device=device) + train_configs = encoder.encode( + [t.config for t in selected.values()], device=device, dtype=dtype + ) ids = torch.tensor( [int(config_id.split("_", maxsplit=1)[0]) for config_id in selected.keys()], device=device, @@ -158,13 +156,15 @@ def encode_trials_for_ftpfn( device=device, dtype=dtype, ) - train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) + train_budgets = budget_domain.cast( + train_fidelities, frm=space.fidelity.domain, dtype=dtype + ) # TODO: Document that it's on the user to ensure these are already all bounded # We could possibly include some bounded transform to assert this. minimize_ys = torch.tensor( [ - torch.nan + pending_value if trial.report is None else (error_value if trial.report.loss is None else trial.report.loss) for trial in trials.values() @@ -179,149 +179,119 @@ def encode_trials_for_ftpfn( f"\n{minimize_ys}" ) maximize_ys = 1 - minimize_ys - return FTPFNData( - ids=ids, - x=train_configs, - y=maximize_ys, - budgets=train_budgets, - pending_mask=minimize_ys.isnan(), + x_train = torch.cat( + [ids.unsqueeze(1), train_budgets.unsqueeze(1), train_configs], dim=1 ) + return x_train, maximize_ys -@dataclass -class FTPFNData: - """Dataclass to hold the data for the FTPFN model. - - The layout of the data is as follows: - - * `ids`: The configuration ids. These will have +1 added to them as FTPFN uses `0` - for test configurations, but NePS starts ids at `0`. - * `x`: The encoded configurations, includes everything that was encoded by the encoder - passed to - [`encode_trials_for_ftpfn()`][neps.optimizers.bayesian_optimization.models.ftpfn.encode_trials_for_ftpfn] - * `y`: The scores of the configurations, these are inverted such they are to be maximized, where 1 is the maximum - score obtainable and 0 is the minimum. Any configuration which did not have a loss gets a score of `nan`. - * `budgets`: The budgets of the configurations, normalized to the range [0, 1]. - These are normalized such that the lower bound of the fidelity domain maps to `1/max_fid` - while the upper bound maps to `1`. - * `pending_mask`: A mask to indicate which configurations are pending, i.e. have not been evaluated yet. - If there are no pending configurations, this should be `None`. 
- """ - - ids: torch.Tensor - x: torch.Tensor - y: torch.Tensor - budgets: torch.Tensor - pending_mask: torch.Tensor | None = None - - -def create_border_configs( - ndims: int, - *, - dtype: torch.dtype | None = None, - device: torch.device | None = None, - max_samples: int = 2**9, -) -> torch.Tensor: - n_samples = 2**ndims - _arange = torch.arange(n_samples, device=device, dtype=torch.int32) - # 2**9 is only 512 samples, so we can afford to exhaustively generate them - # We likely won't have this many hyperparameters anywho - if n_samples <= max_samples: - configs = _arange - else: - # Otherwise, we take a random sample of the 2**n possible border configs - rand_uniq_indices = torch.randperm(n_samples, device=device)[:max_samples] - configs = _arange[rand_uniq_indices] +def decode_ftpfn_data( + x: torch.Tensor, + encoder: ConfigEncoder, + budget_domain: Domain, + fidelity_domain: Domain, +) -> list[tuple[int | None, int | float, dict[str, Any]]]: + if x.ndim == 1: + x = x.unsqueeze(0) - # https://stackoverflow.com/a/63546308/5332072 - bit_masks = 2 ** _arange[ndims] - return configs.unsqueeze(1).bitwise_and(bit_masks).ne(0).to(dtype) + _raw_ids = x[:, 0].tolist() + # Here, we subtract 1 to get the real id, otherwise if it was a test ID, we say it had None + real_ids = [None if _id == 0 else int(_id) - 1 for _id in _raw_ids] + fidelities = fidelity_domain.cast(x[:, 1], frm=budget_domain).tolist() + configs = encoder.decode(x[:, 2:]) + return list(zip(real_ids, fidelities, configs)) def acquire_next_from_ftpfn( *, ftpfn: FTPFNSurrogate, - data: FTPFNData, + continuation_samples: torch.Tensor, encoder: ConfigEncoder, budget_domain: Domain, fidelity_domain: Domain, - seed: int | None = None, - acq_strategy: Callable[ - [torch.Tensor, torch.Tensor, torch.Tensor, FTPFNSurrogate], torch.Tensor - ], + initial_samplers: list[tuple[Sampler, int]], + local_search_sample_size: int = 128, + local_search_confidence: float = 0.95, # [0, 1] + acq_function: Callable[[torch.Tensor], torch.Tensor], + seed: torch.Generator | None = None, dtype: torch.dtype | None = FTPFN_DTYPE, - extra_acq_samples: torch.Tensor | None = None, -) -> tuple[int | None, int | float | None, dict[str, Any]]: - X = torch.cat([data.ids.unsqueeze(1), data.budgets.unsqueeze(1), data.x], dim=1).to( - dtype +) -> torch.Tensor: + # 1. Remove duplicate configurations from continuation_samples, keeping only the most recent eval + acq_existing = _keep_highest_budget_evaluation( + continuation_samples, id_col=0, budget_col=1 ) - ys = data.y.clone().detach() - - # In-fill pending with predicted performance - if data.pending_mask is not None: - not_pending = ~data.pending_mask - pending_ys = ftpfn.get_mean_performance( - train_x=X[not_pending], - train_y=ys[not_pending], - test_x=X[data.pending_mask], - ) - ys[data.pending_mask] = pending_ys - - # We also need to append existing configurations that are in training data, but bump up their - # budget by one step. - # 1. Exclude all configurations which are currently pending - acq_existing = X - if data.pending_mask is not None: - acq_existing = X[~data.pending_mask] - - # 2. Remove duplicate configurations from x train, keeping only the most recent eval - acq_existing = _keep_highest_budget_evaluation(acq_existing, id_col=0, budget_col=1) - # 3. Remove configs that have been fully evaluated + # 2. Remove configs that have been fully evaluated acq_existing = acq_existing[acq_existing[:, 1] < budget_domain.upper] - - # 4. 
Include the extra acquisition samples - if extra_acq_samples is None: - samples = [acq_existing] + if len(acq_existing) != 0: + # We keep a copy of the original budgets incase they get modified + # so we can return the fidelity of the sample that had the best acquisition score + budgets_prior_to_acq = acq_existing[:, 1].clone().detach() + + # Get the best configuration for continuation + acq_scores = acq_function(acq_existing) + best_ix = acq_scores.argmax() + + best_score = acq_scores[best_ix].item() + best_row = acq_existing[best_ix].clone().detach() + del acq_existing + del acq_scores else: - _shape = (len(extra_acq_samples), 1) - acq_extra = torch.cat( - [ - torch.zeros(_shape, dtype=dtype, device=ftpfn.device), - torch.full(_shape, budget_domain.lower, dtype=dtype, device=ftpfn.device), - extra_acq_samples, - ], - dim=1, - ) - samples = [acq_existing, acq_extra] - - # 5. Now we can fuse them together - acq_samples = torch.cat(samples, dim=0).to(dtype=dtype) - - # We keep a copy of the original budgets incase they get modified - # so we can return the fidelity of the sample that had the best acquisition score - budgets_prior_to_acq = acq_samples[:, 1].clone().detach() - - # Now we offload acquisition to the caller - acq_scores = acq_strategy(X, ys, acq_samples, ftpfn) - - # Extract out the row which had the best PI - best_ix = acq_scores.argmax() + best_score = -float("inf") + best_row = torch.tensor([]) + + # We'll be re-using 0 id and min budget alot, just create them once and re-use + _N = max(max(s[1] for s in initial_samplers), local_search_sample_size) + ids = torch.zeros((_N, 1), dtype=dtype, device=ftpfn.device) + min_budget = torch.full( + size=(_N, 1), fill_value=budget_domain.lower, dtype=dtype, device=ftpfn.device + ) - best_id = int(acq_samples[best_ix, 0].round().item()) - if best_id == 0: # It was a new acq. sample - best_real_id = None - best_fid = None - else: # It was a sample to continue, decrement the 1 added earlier - best_real_id = best_id - 1 - best_fid = fidelity_domain.cast_one( - budgets_prior_to_acq[best_ix].item(), frm=budget_domain + # Now begin acquisition maximization by sampling from given samplers and performing an additional + # round of local sampling around the best point + local_sample_confidence = [local_search_confidence] * len(encoder.domains) + for sampler, size in initial_samplers: + # 1. Use provided sampler and eval samples with acq + samples = sampler.sample( + size, to=encoder.domains, seed=seed, device=ftpfn.device, dtype=dtype ) - - best_vector = acq_samples[best_ix, 2:].unsqueeze(0) - best_config = encoder.decode(best_vector)[0] - - return best_real_id, best_fid, best_config + _N = len(samples) + X_test = torch.cat([ids[:_N], min_budget[:_N], samples], dim=1) + acq_scores = acq_function(X_test) + + # ... update best if needed + sample_best_ix = acq_scores.argmax() + sample_best_score = acq_scores[sample_best_ix] + sample_best_row = X_test[sample_best_ix].clone().detach() + if sample_best_score > best_score: + best_score = sample_best_score + best_row = sample_best_row + + # 2. Sample around best point from above samples and eval acq. 
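+ # The local search below re-centers sampling on the best candidate found in
+ # this batch: columns 0 and 1 of a row are the id and budget, so `[2:]` is the
+ # encoded configuration. A prior centered on that point (with sharpness set by
+ # `local_search_confidence`) proposes `local_search_sample_size` fresh points
+ # at the minimum budget, which are scored with the same acquisition function.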
+ _mode = sample_best_row[2:] + local_sampler = Sampler.centered( + centers=list(zip(_mode.tolist(), local_sample_confidence)), + domains=encoder.domains, + ) + samples = local_sampler.sample( + local_search_sample_size, + to=encoder.domains, + seed=seed, + device=ftpfn.device, + dtype=dtype, + ) + _N = len(samples) + X_test = torch.cat([ids[:_N], min_budget[:_N], samples], dim=1) + acq_scores = acq_function(X_test) + + local_best_ix = acq_scores.argmax() + local_best_score = acq_scores[local_best_ix].clone().detach() + if local_best_score > best_score: + best_score = local_best_score + best_row = X_test[local_best_ix].clone().detach() + + # Finally, if the best + return best_row _CACHED_FTPFN_MODEL: dict[tuple[str, str], FTPFN] = {} diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index 27411158..a0137171 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -113,7 +113,9 @@ def __init__( # noqa: D417 **pipeline_space.categoricals, } self.encoder = encoder or ConfigEncoder.default(params) - self.prior = Prior.from_parameters(params) if use_priors is True else None + self.prior = ( + Prior.from_parameters(params.values()) if use_priors is True else None + ) self.seed = seed self.use_cost = use_cost self.use_priors = use_priors diff --git a/neps/optimizers/intial_design.py b/neps/optimizers/intial_design.py index 5993f68b..a2159eb0 100644 --- a/neps/optimizers/intial_design.py +++ b/neps/optimizers/intial_design.py @@ -113,15 +113,11 @@ def make_initial_design( case "uniform": sampler = Sampler.uniform(ndim=len(params)) case "prior": - sampler = Prior.from_parameters(params) + sampler = Prior.from_parameters(params.values()) case _: sampler = sampler - encoded_configs = sampler.sample( - sample_size * 2, - to=encoder.domains, - seed=seed, - ) + encoded_configs = sampler.sample(sample_size * 2, to=encoder.domains, seed=seed) uniq_x = torch.unique(encoded_configs, dim=0) sample_configs = encoder.decode(uniq_x[:sample_size]) configs.extend([{**config, **fids} for config in sample_configs]) diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 59de10c1..9c72ae46 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -1,15 +1,15 @@ -from functools import partial +from __future__ import annotations from typing import Any, Mapping, Literal import numpy as np import torch -import warnings from neps.optimizers.base_optimizer import BaseOptimizer, SampledConfig from neps.optimizers.bayesian_optimization.models.ftpfn import ( FTPFNSurrogate, acquire_next_from_ftpfn, - encode_trials_for_ftpfn, + decode_ftpfn_data, + encode_ftpfn, ) from neps.optimizers.intial_design import make_initial_design from neps.sampling.priors import Prior @@ -23,8 +23,6 @@ # NOTE: Ifbo was trained using 32 bit FTPFN_DTYPE = torch.float32 -ID_COL = 0 -BUDGET_COL = 1 def _adjust_pipeline_space_to_match_stepsize( @@ -81,127 +79,6 @@ def _adjust_pipeline_space_to_match_stepsize( ) -def _tokenize( - ids: torch.Tensor, - budgets: torch.Tensor, - configs: torch.Tensor, -) -> torch.Tensor: - return torch.cat([ids.unsqueeze(1), budgets.unsqueeze(1), configs], dim=1).to( - FTPFN_DTYPE - ) - - -def _encode_for_ftpfn( - trials: Mapping[str, Trial], - encoder: ConfigEncoder, - space: SearchSpace, - budget_domain: Domain, - device: torch.device | None = None, - dtype: torch.dtype = FTPFN_DTYPE, -) -> 
tuple[torch.Tensor, torch.Tensor]: - """Encode the trials into a format that the FTPFN model can understand. - - !!! warning "loss values reported" - - The `ys` are a single dimension but consist of the losses inverted to scores. - As result, we have to assert that the loss values provided in the trials are - in the range [0, 1]. - - !!! note "X layout" - - The layout of the X is: - - ``` - | config_id | budget (normalized from fidelity) | hp_1 | hp_2 | ... | hp_n | - ``` - - Here the `budget` is normalized to the range [0, 1] while the hp parameters - are encoded according to the provided encoder, which should map the parameter - values from the original domain to some domain in [0, 1]. - - !!! warning "Pending and Error trials" - - We currently do not handle error cases, **and they are ignored**. - For trials which do not have a loss reported yet, they are considered pending - and will have `torch.nan` as their score inside the returned y values. - - Args: - trials: The trials to encode - encoder: The encoder to use - space: The search space - budget_domain: The domain to use for the budgets of the FTPFN - device: The device to use - dtype: The dtype to use - - Returns: - The encoded trials and their corresponding **scores** - """ - # Select all trials which have something we can actually use for modelling - # The absence of a report signifies pending - selected = { - trial_id: trial - for trial_id, trial in trials.items() - if trial.report is None or trial.report.loss is not None - } - assert space.fidelity_name is not None - assert space.fidelity is not None - train_configs = encoder.encode([t.config for t in selected.values()], device=device) - ids = torch.tensor( - [int(config_id.split("_", maxsplit=1)[0]) for config_id in selected.keys()], - device=device, - dtype=torch.float64, - ) - train_fidelities = torch.tensor( - [t.config[space.fidelity_name] for t in selected.values()], - device=device, - dtype=torch.float64, - ) - train_budgets = budget_domain.cast(train_fidelities, frm=space.fidelity.domain) - X = _tokenize( - ids=torch.tensor(ids, device=device), - budgets=train_budgets, - configs=train_configs, - ).to(dtype) - - # TODO: Document that it's on the user to ensure these are already all bounded - # We could possibly include some bounded transform to assert this. - minimize_ys = torch.tensor( - [ - trial.report.loss - if trial.report is not None and trial.report.loss is not None - else np.nan - for trial in trials.values() - ], - device=device, - dtype=FTPFN_DTYPE, - ) - if minimize_ys.max() > 1 or minimize_ys.min() < 0: - raise RuntimeError( - "ifBO requires that all loss values reported lie in the interval [0, 1]" - " but recieved loss value outside of that range!" - f"\n{minimize_ys}" - ) - maximize_ys = 1 - minimize_ys - return X, maximize_ys - - -def _keep_highest_budget_evaluation(x: torch.Tensor) -> torch.Tensor: - # Does a lexsort, same as if we sorted by (config_id, budget), where - # theyre are sorted according to increasing config_id and then increasing budget. - # x[i2] -> sorted by config id and budget - i1 = torch.argsort(x[:, BUDGET_COL]) - i2 = i1[torch.argsort(x[i1][:, ID_COL], stable=True)] - sorted_x = x[i2] - - # Now that it's sorted, we essentially want to count the occurence of each id into counts - _, counts = torch.unique_consecutive(sorted_x[:, ID_COL], return_counts=True) - - # Now we can use these counts to get to the last occurence of each id - # The -1 is because we want to index from 0 but sum starts at 1. 
- ii = counts.cumsum(0) - 1 - return sorted_x[ii] - - class IFBO(BaseOptimizer): """Base class for MF-BO algorithms that use DyHPO-like acquisition and budgeting.""" @@ -259,7 +136,7 @@ def __init__( self._initial_design: list[dict[str, Any]] | None = None params = {**space.numerical, **space.categoricals} - self._prior = Prior.from_parameters(params) if use_priors else None + self._prior = Prior.from_parameters(params.values()) if use_priors else None self._config_encoder: ConfigEncoder = ConfigEncoder.default( params, # FTPFN doesn't support categoricals and we were recomenned to just evenly distribute @@ -311,116 +188,98 @@ def ask( return SampledConfig(id=f"{new_id}_0", config=self._initial_design[new_id]) # Otherwise, we proceed to surrogate phase - data = encode_trials_for_ftpfn( + ftpfn = FTPFNSurrogate( + target_path=self.surrogate_model_args.get("target_path", None), + version=self.surrogate_model_args.get("version", "0.0.1"), + device=self.device, + ) + X, y = encode_ftpfn( trials=trials, space=self.pipeline_space, encoder=self._config_encoder, budget_domain=self._budget_domain, device=self.device, + pending_value=torch.nan, ) - # TODO: Very little chance mfpi_random is best but for now it's stable - def _mfpi_random( - _X: torch.Tensor, - _y: torch.Tensor, - _acq_samples: torch.Tensor, - _ftpfn: FTPFNSurrogate, - how: Literal["pi", "ei"], - ) -> torch.Tensor: - rng = np.random.RandomState(None if seed is None else seed + len(trials)) - _low = self._budget_ix_domain.lower - _high = self._budget_ix_domain.upper - horizon_index = rng.randint(_low, _high) + 1 - horizon = self._budget_domain.cast_one( - horizon_index, frm=self._budget_ix_domain - ) - f_best = _y.max().item() - r = rng.uniform(-4, -1) - threshold = f_best + (10**r) * (1 - f_best) - - # NOTE: If converting f_inc to be seperate per acq sample, you - # need to add an extra batch dimension to y_best, i.e. (n, 1) - # Budget column is between 0 and 1, but we want to add the horizon - BUDGET_COL = 1 - _acq_samples[:, BUDGET_COL] += horizon - _acq_samples[:, BUDGET_COL] = torch.clamp( - _acq_samples[:, BUDGET_COL], max=self._budget_domain.upper + # Fantasize if needed + pending_mask = torch.isnan(y) + if pending_mask.any(): + not_pending_mask = ~pending_mask + not_pending_X = X[not_pending_mask] + y[pending_mask] = ftpfn.get_mean_performance( + train_x=not_pending_X, + train_y=y[not_pending_mask], + test_x=X[pending_mask], ) - - match how: - case "pi": - return _ftpfn.get_pi(_X, _y, _acq_samples, y_best=threshold) - case "ei": - return _ftpfn.get_ei(_X, _y, _acq_samples, y_best=threshold) - case _: - raise ValueError(f"Unknown acquisition strategy: {how=}") - - ndims = self._config_encoder.ncols - - # Sample some configurations at uniform for acq. - uniform_sampler = Sampler.uniform(ndim=ndims) - uniform_configs = uniform_sampler.sample( - self.n_acquisition_new_configs, - to=self._config_encoder.domains, - seed=seed, - device=self.device, - dtype=FTPFN_DTYPE, + else: + not_pending_X = X + + # NOTE: Can't really abstract this, requires knowledge that: + # 1. The encoding is such that the loss is 1 - loss + # 2. The budget is the second column + # 3. 
The budget is encoded between 1/max_fid and 1 + rng = np.random.RandomState(None if seed is None else seed + len(trials)) + # Cast the a random budget index into the ftpfn budget domain + horizon_increment = self._budget_domain.cast_one( + rng.randint(*self._budget_ix_domain.bounds) + 1, + frm=self._budget_ix_domain, ) + f_best = y.max().item() + threshold = f_best + (10 ** rng.uniform(-4, -1)) * (1 - f_best) - # Also sample some border configurations for acq. - # OPTIM: If we are below the amount possible, there is no randomness and we can cache them - border_sampler = Sampler.borders(ndim=ndims) - N_border = 2**9 # 512, if we go over, we subselect 512 border configs - if N_border <= border_sampler.n_possible: - if self._cached_border_configs is not None: - border_configs = self._cached_border_configs - else: - self._cached_border_configs = border_sampler.sample( - n=N_border, - to=self._config_encoder.domains, - seed=seed, - device=self.device, - dtype=FTPFN_DTYPE, - ) - border_configs = self._cached_border_configs - else: - border_configs = border_sampler.sample( - n=N_border, - to=self._config_encoder.domains, - seed=seed, - device=self.device, - dtype=FTPFN_DTYPE, - ) + def _mfpi_random(samples: torch.Tensor) -> torch.Tensor: + # HACK: Because we are modifying the samples inplace, we do, and then undo the addition + original_budget_column = samples[..., 1].clone() + samples[..., 1].add_(horizon_increment).clamp_max_(self._budget_domain.upper) - id, current_fid, config = acquire_next_from_ftpfn( - ftpfn=FTPFNSurrogate( - target_path=self.surrogate_model_args.get("target_path", None), - version=self.surrogate_model_args.get("version", "0.0.1"), - device=self.device, - ), - data=data, - seed=seed, + scores = ftpfn.get_pi(X, y, samples, y_best=threshold) + + samples[..., 1] = original_budget_column + return scores + + # Do acquisition on ftpfn + sample_dims = self._config_encoder.ncols + best_row = acquire_next_from_ftpfn( + ftpfn=ftpfn, + # How to encode encoder=self._config_encoder, budget_domain=self._budget_domain, fidelity_domain=self._fid_domain, - extra_acq_samples=torch.cat([uniform_configs, border_configs], dim=0), - acq_strategy=partial(_mfpi_random, how="ei"), + # Acquisition function + acq_function=_mfpi_random, + # Which acquisition samples to consider for continuation + continuation_samples=not_pending_X, + # How to generate some initial samples + initial_samplers=[ + (Sampler.sobol(ndim=sample_dims), 512), + (Sampler.uniform(ndim=sample_dims), 512), + (Sampler.borders(ndim=sample_dims), 256), + ], + seed=seed, + # A next step local sampling around best point found by initial_samplers + local_search_sample_size=256, + local_search_confidence=0.95, ) - if current_fid is None: - assert id is None - config[self._fidelity_name] = self._fid_domain.lower + _id, fid, config = decode_ftpfn_data( + best_row, + self._config_encoder, + budget_domain=self._budget_domain, + fidelity_domain=self._fid_domain, + )[0] + + if _id is None: + config[self._fidelity_name] = fid return SampledConfig(id=f"{new_id}_0", config=config) else: - current_budget_ix = self._budget_ix_domain.cast_one( - current_fid, frm=self._fid_domain - ) - next_budget_ix = current_budget_ix + 1 - next_fid = self._fid_domain.cast_one( - next_budget_ix, frm=self._budget_ix_domain - ) + # Convert fidelity to budget index, bump by 1 and convert back + budget_ix = self._budget_ix_domain.cast_one(fid, frm=self._fid_domain) + next_ix = budget_ix + 1 + next_fid = self._fid_domain.cast_one(next_ix, frm=self._budget_ix_domain) + 
config[self._fidelity_name] = next_fid return SampledConfig( - id=f"{id}_{next_budget_ix}", + id=f"{_id}_{next_ix}", config=config, - previous_config_id=f"{id}_{current_budget_ix}", + previous_config_id=f"{_id}_{budget_ix}", ) diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index fc27bb6b..4a77ca79 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -9,7 +9,7 @@ from __future__ import annotations -from collections.abc import Container, Iterable, Mapping, Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any, Protocol from typing_extensions import override @@ -111,7 +111,7 @@ def uniform(cls, ncols: int) -> UniformPrior: @classmethod def from_parameters( cls, - parameters: dict[str, CategoricalParameter | FloatParameter | IntegerParameter], + parameters: Iterable[CategoricalParameter | FloatParameter | IntegerParameter], ) -> Prior: """Please refer to [`make_centered()`][neps.priors.Prior.make_centered] for more details. This is a shortcut method. @@ -124,16 +124,13 @@ def from_parameters( # accordingly in a `CenteredPrior` _mapping = {"low": 0.25, "medium": 0.5, "high": 0.75} - domains: dict[str, Domain] = {} - centers: dict[str, tuple[Any, float]] = {} - categoricals: set[str] = set() - for name, hp in parameters.items(): - domains[name] = hp.domain # type: ignore - - if isinstance(hp, CategoricalParameter): - categoricals.add(name) + domains: list[Domain] = [] + centers: list[tuple[Any, float] | None] = [] + for hp in parameters: + domains.append(hp.domain) if hp.default is None: + centers.append(None) continue confidence_str = hp.default_confidence_choice @@ -141,31 +138,25 @@ def from_parameters( center = ( hp._default_index if isinstance(hp, CategoricalParameter) else hp.default ) + centers.append((center, confidence_score)) - centers[name] = (center, confidence_score) - - # Uses truncnorms for numerical and weighted choices categoricals - return Prior.make_centered( - domains=domains, - centers=centers, - categoricals=categoricals, - ) + return Prior.make_centered(domains=domains, centers=centers) @classmethod def make_centered( cls, - domains: Mapping[str, Domain], - centers: Mapping[str, tuple[Any, float]], + domains: Iterable[Domain], + centers: Iterable[None | tuple[int | float, float]], *, - categoricals: Container[str] = (), device: torch.device | None = None, ) -> CenteredPrior: """Create a prior for a given list of domains. Will use a `TruncatedNormal` distribution for all parameters, - except those contained within `categoricals`, which will - use a `Categorical` instead. If no center is given for a domain, - a uniform prior will be used. + except those who have a domain marked with `is_categorical=True`, + using a `Categorical` distribution instead. + If the center for a given domain is `None`, a uniform prior + will be used instead. For non-categoricals, this will be interpreted as the mean and std `(1 - confidence)` for a truncnorm. For categorical values, @@ -180,68 +171,57 @@ def make_centered( Args: domains: domains over which to have a centered prior. - centers: centers for the priors. Should be a mapping - from the domain name to the center value and confidence level. - If no center is given, a uniform prior will be used. + centers: centers for the priors, i.e. the mode of the prior for that + domain, along with the confidence of that mode, which get's + re-interpreted as the std of the truncnorm or the probability + mass for the categorical. 
+ + If `None`, a uniform prior will be used. !!! warning The values contained in centers should be contained within the domain. All confidence levels should be within the `[0, 1]` range. - categoricals: The names of the domains that are categorical and which - a `Categorical` distribution will be used, rather than a - `TruncatedNormal`. - - !!! warning - - Categoricals require that the corresponding domain has a - `.cardinality`, i.e. it is not a float/continuous domain. - - device: Device to place the tensors on. - + confidence: The confidence level for the center. Entries containing `None` + should match with `centers` that are `None`. If not, this is considered an + error. + device: Device to place the tensors on for distributions. Returns: A prior for the search space. """ - for name, (_, confidence) in centers.items(): - if not 0 <= confidence <= 1: - raise ValueError( - f"Confidence level for {name} must be in the range [0, 1]." - f" Got {confidence}." - ) + domains = list(domains) distributions: list[TorchDistributionWithDomain] = [] - for name, domain in domains.items(): - center_confidence = centers.get(name) - if center_confidence is None: + for domain, center_conf in zip(domains, centers, strict=True): + # If the center is None, we use a uniform distribution. We try to match + # the distributions to all be unit uniform as it can speed up sampling when + # consistentaly the same. This still works for categoricals + if center_conf is None: distributions.append(UNIT_UNIFORM_DIST) continue - center, confidence = center_confidence - if name in categoricals: - if domain.cardinality is None: - raise ValueError( - f"{name} is not a finite domain and cannot be used as a" - " categorical. Please remove it from the categoricals list." - ) - - if not isinstance(center, int): - raise ValueError( - f"{name} is a categorical domain and should have an integer" - f" center. Got {center} of type {type(center)}." 
- ) - - remaining_weight = 1 - confidence - distributed_weight = remaining_weight / (domain.cardinality - 1) + center, conf = center_conf + assert 0 <= conf <= 1 + + # If categorical, treat it as a weighted distribution over integers + if domain.is_categorical: + domain_as_ints = domain.as_integer_domain() + assert domain_as_ints.cardinality is not None + + weight_for_choice = conf + remaining_weight = 1 - weight_for_choice + + distributed_weight = remaining_weight / (domain_as_ints.cardinality - 1) weights = torch.full( - (domain.cardinality,), + (domain_as_ints.cardinality,), distributed_weight, device=device, dtype=torch.float64, ) - - weights[center] = confidence + center_index = domain_as_ints.cast_one(center, frm=domain) + weights[int(center_index)] = conf dist = TorchDistributionWithDomain( distribution=torch.distributions.Categorical( @@ -252,11 +232,9 @@ def make_centered( distributions.append(dist) continue - # We place a truncnorm over a unitnorm - unit_center = domain.to_unit( - torch.tensor(center, device=device, dtype=torch.float64) - ) - scale = torch.tensor(1 - confidence, device=device, dtype=torch.float64) + # Otherwise, we use a continuous truncnorm + unit_center = domain.to_unit_one(center) + scale = torch.tensor(1 - conf, device=device, dtype=torch.float64) a = torch.tensor(0.0, device=device, dtype=torch.float64) b = torch.tensor(1.0, device=device, dtype=torch.float64) dist = TorchDistributionWithDomain( diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index c5c76b8e..dcf369a9 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -6,7 +6,7 @@ from __future__ import annotations -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field from functools import reduce from typing import TYPE_CHECKING, Protocol @@ -18,7 +18,7 @@ from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain if TYPE_CHECKING: - from neps.sampling.priors import UniformPrior + from neps.sampling.priors import CenteredPrior, UniformPrior class Sampler(Protocol): @@ -95,6 +95,19 @@ def borders(cls, ndim: int) -> BorderSampler: """ return BorderSampler(ndim=ndim) + @classmethod + def centered( + cls, + domains: list[Domain], + centers: Iterable[None | tuple[int | float, float]], + *, + device: torch.device | None = None, + ) -> CenteredPrior: + """See [`Prior.make_centered`][neps.sampling.priors.Prior.make_centered].""" + from neps.sampling.priors import Prior + + return Prior.make_centered(domains=domains, centers=centers, device=device) + # Technically this could be a prior with a uniform distribution @dataclass diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 3ef203f1..cc360a1c 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -89,9 +89,19 @@ class Domain(Generic[V]): value. """ + is_categorical: bool = False + """Whether the domain is representing a categorical. + + The domain does not use this information directly, but it can be useful for external + classes that consume Domain objects. This can only be set to `True` if the + `cardinality` of the domain is finite, i.e. `bins` is not `None` OR `round` + is `True` or the boundaries are both integers. 
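+ For example, `Domain.indices(3, is_categorical=True)` is a valid categorical
+ domain: its three choices are encoded as the integers `0`, `1` and `2`, so its
+ cardinality is finite.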
+ """ + is_unit_float: bool = field(init=False, repr=False) - midpoint: V = field(init=False, repr=False) + is_int: bool = field(init=False, repr=False) is_log: bool = field(init=False, repr=False) + midpoint: V = field(init=False, repr=False) length: V = field(init=False, repr=False) cardinality: int | None = field(init=False, repr=False) bounds: tuple[V, V] = field(init=False, repr=False) @@ -100,6 +110,7 @@ class Domain(Generic[V]): def __post_init__(self): assert isinstance(self.lower, type(self.upper)) is_int = isinstance(self.lower, int) + object.__setattr__(self, "is_int", is_int) object.__setattr__(self, "is_log", self.log_bounds is not None) object.__setattr__( self, @@ -114,6 +125,12 @@ def __post_init__(self): cardinality = int(self.upper - self.lower + 1) else: cardinality = None + if self.is_categorical: + raise ValueError( + "Categorical domain must have finite cardinality but" + " `bins` is `None` and `round` is `False` and" + " boundaries are not integers." + ) preferred_dtype = torch.int64 if is_int else torch.float64 object.__setattr__(self, "preffered_dtype", preferred_dtype) @@ -134,6 +151,7 @@ def float( *, log: bool = False, bins: int | None = None, + is_categorical: bool = False, ) -> Domain[float]: """Create a domain for a range of float values. @@ -142,6 +160,7 @@ def float( upper: The upper bound of the domain. log: Whether the domain is in log space. bins: The number of discrete bins to split the domain into. + is_categorical: Whether the domain is representing a categorical. Returns: A domain for a range of float values. @@ -152,6 +171,7 @@ def float( log_bounds=(math.log(lower), math.log(upper)) if log else None, bins=bins, round=False, + is_categorical=is_categorical, ) @classmethod @@ -162,6 +182,7 @@ def int( *, log: bool = False, bins: int | None = None, + is_categorical: bool = False, ) -> Domain[int]: """Create a domain for a range of integer values. @@ -170,6 +191,7 @@ def int( upper: The upper bound of the domain. log: Whether the domain is in log space. bins: The number of discrete bins to split the domain into. + is_categorical: Whether the domain is representing a categorical. Returns: A domain for a range of integer values. @@ -180,19 +202,11 @@ def int( log_bounds=(math.log(lower), math.log(upper)) if log else None, round=True, bins=bins, + is_categorical=is_categorical, ) - def next_value(self, x: Tensor) -> Tensor: - """Get the next value for a tensor of values.""" - if self.cardinality is None: - raise ValueError("Domain is non-finite, cannot get next value.") - cardinality_domain = Domain.indices(self.cardinality) - current_step = cardinality_domain.cast(x, frm=self) - bounded_next_step = (current_step + 1).clamp_max(self.cardinality - 1) - return self.cast(bounded_next_step, frm=cardinality_domain) - @classmethod - def indices(cls, n: int) -> Domain[int]: + def indices(cls, n: int, *, is_categorical: bool = False) -> Domain[int]: """Create a domain for a range of indices. Like range based functions this domain is inclusive of the lower bound @@ -200,11 +214,21 @@ def indices(cls, n: int) -> Domain[int]: Args: n: The number of indices. + is_categorical: Whether the domain is representing a categorical. Returns: A domain for a range of indices. 
""" - return Domain.int(0, n - 1) + return Domain.int(0, n - 1, is_categorical=is_categorical) + + def next_value(self, x: Tensor) -> Tensor: + """Get the next value for a tensor of values.""" + if self.cardinality is None: + raise ValueError("Domain is non-finite, cannot get next value.") + cardinality_domain = Domain.indices(self.cardinality) + current_step = cardinality_domain.cast(x, frm=self) + bounded_next_step = (current_step + 1).clamp_max(self.cardinality - 1) + return self.cast(bounded_next_step, frm=cardinality_domain) def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """Transform a tensor of values from this domain to the unit interval [0, 1]. @@ -416,5 +440,20 @@ def to_unit_one(self, x: float | int) -> float: """ return self.to_unit(torch.tensor(x)).item() + def as_integer_domain(self) -> Domain: + """Get the integer version of this domain. + + !!! warning + + This is only possible if this domain has a finite cardinality + """ + if self.cardinality is None: + raise ValueError( + "Cannot get integer representation of this domain as its" + " cardinality is non-finite." + ) + + return Domain.indices(self.cardinality, is_categorical=self.is_categorical) + UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index d47c5363..01c37720 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -2,21 +2,12 @@ from collections.abc import Mapping, Sequence from dataclasses import dataclass, field -from typing import ( - TYPE_CHECKING, - Any, - Generic, - TypeAlias, - TypeVar, -) +from typing import TYPE_CHECKING, Any, Generic, TypeAlias, TypeVar from typing_extensions import Protocol, override import torch -from neps.search_spaces.domain import ( - UNIT_FLOAT_DOMAIN, - Domain, -) +from neps.search_spaces.domain import UNIT_FLOAT_DOMAIN, Domain from neps.search_spaces.hyperparameters.categorical import CategoricalParameter from neps.search_spaces.hyperparameters.float import FloatParameter from neps.search_spaces.hyperparameters.integer import IntegerParameter @@ -58,7 +49,7 @@ class CategoricalToIntegerTransformer(TensorTransformer): def __post_init__(self): assert len(self.choices) > 0 - self.domain = Domain.indices(len(self.choices)) + self.domain = Domain.indices(len(self.choices), categorical=True) self._lookup = None if len(self.choices) > 3: try: @@ -104,6 +95,7 @@ class CategoricalToUnitNorm(TensorTransformer): _integer_transformer: CategoricalToIntegerTransformer = field(init=False) def __post_init__(self): + self._domain = Domain.float(0.0, 1.0, bins=len(self.choices), categorical=True) self._integer_transformer = CategoricalToIntegerTransformer(self.choices) @override @@ -119,12 +111,14 @@ def encode( x, dtype=dtype if dtype is not None else torch.float64, device=device, - out=out, + ) + binned_floats = self.domain.cast( + integers, frm=self._integer_transformer.domain, dtype=dtype ) if out is not None: - return integers.div_(len(self.choices) - 1) + return out.copy_(binned_floats) - return integers / (len(self.choices) - 1) + return binned_floats @override def decode(self, x: torch.Tensor) -> list[Any]: @@ -191,7 +185,7 @@ def __post_init__(self): n_numerical = 0 n_categorical = 0 for _, transformer in transformers: - if isinstance(transformer, CategoricalToIntegerTransformer): + if transformer.domain.is_categorical: n_categorical += 1 else: n_numerical += 1 @@ -223,9 +217,11 @@ def encode( x: Sequence[Mapping[str, Any]], *, device: torch.device | None = 
None, + dtype: torch.dtype | None = None, ) -> torch.Tensor: + dtype = torch.float64 if dtype is None else dtype width = len(self.transformers) - buffer = torch.empty((len(x), width), dtype=torch.float64, device=device) + buffer = torch.empty((len(x), width), dtype=dtype, device=device) for hp_name, transformer in self.transformers.items(): values = [conf[hp_name] for conf in x] @@ -235,7 +231,7 @@ def encode( transformer.encode( values, out=buffer[:, lookup], - dtype=torch.float64, + dtype=dtype, device=device, ) @@ -278,12 +274,3 @@ def default( raise ValueError(f"Unsupported parameter type: {type(hp)}") return ConfigEncoder(transformers) - - -@dataclass -class EncodedPending: - """Tensor data of pending configurations.""" - - ids: torch.Tensor - x: torch.Tensor - fid: torch.Tensor | None From 07eb8f2c347f3bc645804fc45ebf9e2e2608f66a Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 2 Oct 2024 19:12:21 +0200 Subject: [PATCH 56/63] test: Fixups --- .../doc_yamls/customizing_neps_optimizer.yaml | 5 - docs/doc_yamls/loading_own_optimizer.yaml | 2 - docs/doc_yamls/set_up_optimizer.yaml | 8 +- neps/api.py | 8 +- .../bayesian_optimization/models/ftpfn.py | 5 - .../bayesian_optimization/models/gp.py | 22 +- .../bayesian_optimization/optimizer.py | 11 +- .../bayesian_optimization.yaml | 3 +- neps/optimizers/default_searchers/pibo.yaml | 3 +- neps/optimizers/intial_design.py | 1 - neps/optimizers/multi_fidelity/hyperband.py | 22 +- neps/optimizers/multi_fidelity/ifbo.py | 7 +- .../multi_fidelity/sampling_policy.py | 24 +- .../multi_fidelity_prior/async_priorband.py | 8 +- .../multi_fidelity_prior/priorband.py | 14 +- neps/sampling/distributions.py | 32 +- neps/sampling/priors.py | 8 +- neps/sampling/samplers.py | 5 +- neps/search_spaces/domain.py | 10 +- neps/search_spaces/encoding.py | 167 +++++++++-- neps/search_spaces/hyperparameters/float.py | 2 +- neps/search_spaces/hyperparameters/integer.py | 2 +- .../hyperparameters/numerical.py | 3 +- neps/search_spaces/neighborhoods.py | 281 ------------------ neps/state/optimizer.py | 1 + neps/utils/common.py | 5 - neps/utils/run_args.py | 6 +- neps_examples/basic_usage/hyperparameters.py | 40 +-- .../solution_yamls/bo_neps_decided.yaml | 11 +- .../solution_yamls/pibo_neps_decided.yaml | 14 +- .../solution_yamls/user_yaml_bo.yaml | 10 +- tests/test_neps_api/test_api.py | 107 ++----- .../testing_yaml/optimizer_test.yaml | 10 +- .../run_args_optimizer_outside.yaml | 4 - tests/test_settings/test_settings.py | 40 ++- tests/test_state/test_neps_state.py | 4 + .../run_args_optional_loading_format.yaml | 1 - .../customizing_neps_optimizer.yaml | 9 +- .../loading_own_optimizer.yaml | 2 - .../set_up_optimizer.yaml | 8 +- .../optimizer_yamls/select_bo_run_args.yaml | 11 +- 41 files changed, 309 insertions(+), 627 deletions(-) delete mode 100644 neps/search_spaces/neighborhoods.py diff --git a/docs/doc_yamls/customizing_neps_optimizer.yaml b/docs/doc_yamls/customizing_neps_optimizer.yaml index a176dc74..93596fc8 100644 --- a/docs/doc_yamls/customizing_neps_optimizer.yaml +++ b/docs/doc_yamls/customizing_neps_optimizer.yaml @@ -19,8 +19,3 @@ searcher: name: "my_bayesian" # optional; changing the searcher_name for better recognition # Specific arguments depending on the searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI - acquisition_sampler: random - random_interleave_prob: 0.1 - diff --git a/docs/doc_yamls/loading_own_optimizer.yaml b/docs/doc_yamls/loading_own_optimizer.yaml index b23cd082..7a26a123 100644 --- 
a/docs/doc_yamls/loading_own_optimizer.yaml +++ b/docs/doc_yamls/loading_own_optimizer.yaml @@ -19,5 +19,3 @@ searcher: name: CustomOptimizer # class name within the file # Specific arguments depending on your searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI diff --git a/docs/doc_yamls/set_up_optimizer.yaml b/docs/doc_yamls/set_up_optimizer.yaml index f65af743..90b52671 100644 --- a/docs/doc_yamls/set_up_optimizer.yaml +++ b/docs/doc_yamls/set_up_optimizer.yaml @@ -1,11 +1,5 @@ strategy: bayesian_optimization # Specific arguments depending on the searcher initial_design_size: 7 -surrogate_model: gp -acquisition: EI -log_prior_weighted: false -acquisition_sampler: random -random_interleave_prob: 0.1 -disable_priors: false -prior_confidence: high +use_priors: true sample_default_first: false diff --git a/neps/api.py b/neps/api.py index 4f81b0cf..d1a1bd2a 100644 --- a/neps/api.py +++ b/neps/api.py @@ -1,7 +1,5 @@ """API for the neps package.""" - - import inspect import logging import warnings @@ -31,11 +29,7 @@ def run( run_pipeline: Callable | None = Default(None), root_directory: str | Path | None = Default(None), pipeline_space: ( - dict[str, Parameter | CS.ConfigurationSpace] - | str - | Path - | CS.ConfigurationSpace - | None + dict[str, Parameter] | str | Path | CS.ConfigurationSpace | None ) = Default(None), run_args: str | Path | None = Default(None), overwrite_working_directory: bool = Default(False), diff --git a/neps/optimizers/bayesian_optimization/models/ftpfn.py b/neps/optimizers/bayesian_optimization/models/ftpfn.py index 4df2dfb8..a6fe8b1e 100644 --- a/neps/optimizers/bayesian_optimization/models/ftpfn.py +++ b/neps/optimizers/bayesian_optimization/models/ftpfn.py @@ -208,7 +208,6 @@ def acquire_next_from_ftpfn( continuation_samples: torch.Tensor, encoder: ConfigEncoder, budget_domain: Domain, - fidelity_domain: Domain, initial_samplers: list[tuple[Sampler, int]], local_search_sample_size: int = 128, local_search_confidence: float = 0.95, # [0, 1] @@ -224,10 +223,6 @@ def acquire_next_from_ftpfn( # 2. 
Remove configs that have been fully evaluated acq_existing = acq_existing[acq_existing[:, 1] < budget_domain.upper] if len(acq_existing) != 0: - # We keep a copy of the original budgets incase they get modified - # so we can return the fidelity of the sample that had the best acquisition score - budgets_prior_to_acq = acq_existing[:, 1].clone().detach() - # Get the best configuration for continuation acq_scores = acq_function(acq_existing) best_ix = acq_scores.argmax() diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 96d6b7e0..8b22a513 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -134,7 +134,7 @@ def optimize_acq( *, n_candidates_required: int = 1, num_restarts: int = 20, - n_intial_start_points: int | None = None, + n_intial_start_points: int = 256, acq_options: Mapping[str, Any] | None = None, maximum_allowed_categorical_combinations: int = 30, ) -> tuple[torch.Tensor, torch.Tensor]: @@ -146,9 +146,7 @@ def optimize_acq( bounds = torch.tensor([lower, upper], dtype=torch.float64) cat_transformers = { - name: t - for name, t in encoder.transformers.items() - if isinstance(t, CategoricalToIntegerTransformer) + name: t for name, t in encoder.transformers.items() if t.domain.is_categorical } if not any(cat_transformers): # Small heuristic to increase the number of candidates as our dimensionality @@ -172,7 +170,9 @@ def optimize_acq( # We need to generate the product of all possible combinations of categoricals, # first we do a sanity check n_combos = reduce( - lambda x, y: x * y, [len(t.choices) for t in cat_transformers.values()] + lambda x, y: x * y, # type: ignore + [t.domain.cardinality for t in cat_transformers.values()], + 1, ) if n_combos > maximum_allowed_categorical_combinations: raise ValueError( @@ -187,7 +187,10 @@ def optimize_acq( # First, just collect the possible values per cat column # NOTE: Botorchs optim requires them to be as floats cats: dict[int, list[float]] = { - encoder.index_of[name]: [float(i) for i in range(len(transformer.choices))] + encoder.index_of[name]: [ + float(i) + for i in range(len(transformer.domain.cardinality)) # type: ignore + ] for name, transformer in cat_transformers.items() } @@ -228,7 +231,10 @@ def encode_trials_for_gp( pending_configs: list[Mapping[str, Any]] = [] if encoder is None: - encoder = ConfigEncoder.default({**space.numerical, **space.categoricals}) + encoder = ConfigEncoder.default( + {**space.numerical, **space.categoricals}, + constants=space.constants, + ) for trial in trials.values(): if trial.report is None: @@ -272,7 +278,7 @@ def fit_and_acquire_from_gp( seed: int | None = None, n_candidates_required: int | None = None, num_restarts: int = 20, - n_initial_start_points: int | None = None, + n_initial_start_points: int = 256, maximum_allowed_categorical_combinations: int = 30, acq_options: Mapping[str, Any] | None = None, ) -> torch.Tensor: diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py index a0137171..f0d2addd 100644 --- a/neps/optimizers/bayesian_optimization/optimizer.py +++ b/neps/optimizers/bayesian_optimization/optimizer.py @@ -106,13 +106,22 @@ def __init__( # noqa: D417 """ if any(pipeline_space.graphs): raise NotImplementedError("Only supports flat search spaces for now!") + if any(pipeline_space.fidelities): + raise ValueError( + "Fidelities are not supported for BayesianOptimization." 
+ " Please consider setting the fidelity to a constant value." + f" Got: {pipeline_space.fidelities}" + ) + super().__init__(pipeline_space=pipeline_space) params: dict[str, CategoricalParameter | FloatParameter | IntegerParameter] = { **pipeline_space.numerical, **pipeline_space.categoricals, } - self.encoder = encoder or ConfigEncoder.default(params) + self.encoder = encoder or ConfigEncoder.default( + params, constants=pipeline_space.constants + ) self.prior = ( Prior.from_parameters(params.values()) if use_priors is True else None ) diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml index c3525cc4..2c34a3a3 100644 --- a/neps/optimizers/default_searchers/bayesian_optimization.yaml +++ b/neps/optimizers/default_searchers/bayesian_optimization.yaml @@ -2,5 +2,6 @@ strategy: bayesian_optimization # Arguments that can be modified by the user initial_design_size: null # Defaults to depending on number or hyperparameters use_cost: false # Whether to factor in cost when selecting new configurations -sample_default_first: # Whether to sample the default configuration first +use_priors: false # Whether to use user set priors in optimization +sample_default_first: false # Whether to sample the default configuration first device: null # Device to load the gaussian process model on with torch diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml index 36bff8b2..cac0e8f8 100644 --- a/neps/optimizers/default_searchers/pibo.yaml +++ b/neps/optimizers/default_searchers/pibo.yaml @@ -2,5 +2,6 @@ strategy: pibo # Arguments that can be modified by the user initial_design_size: null # Defaults to depending on number or hyperparameters use_cost: false # Whether to factor in cost when selecting new configurations -sample_default_first: # Whether to sample the default configuration first +use_priors: true # Whether to use user set priors in optimization +sample_default_first: true # Whether to sample the default configuration first device: null # Device to load the gaussian process model on with torch diff --git a/neps/optimizers/intial_design.py b/neps/optimizers/intial_design.py index a2159eb0..dcfdbee3 100644 --- a/neps/optimizers/intial_design.py +++ b/neps/optimizers/intial_design.py @@ -105,7 +105,6 @@ def make_initial_design( "The sample size should be a positive integer if passing an int." 
) - print("sample", sample_size, ndims) if sample_size is not None: match sampler: case "sobol": diff --git a/neps/optimizers/multi_fidelity/hyperband.py b/neps/optimizers/multi_fidelity/hyperband.py index f6c445ac..9319c50c 100644 --- a/neps/optimizers/multi_fidelity/hyperband.py +++ b/neps/optimizers/multi_fidelity/hyperband.py @@ -23,6 +23,7 @@ SuccessiveHalving, SuccessiveHalvingBase, ) +from neps.sampling.priors import Prior if typing.TYPE_CHECKING: from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( @@ -514,15 +515,6 @@ def __init__( self.pipeline_space.has_prior = self.use_priors - bo_args = { - "surrogate_model": surrogate_model, - "domain_se_kernel": domain_se_kernel, - "hp_kernels": hp_kernels, - "surrogate_model_args": surrogate_model_args, - "acquisition": acquisition, - "log_prior_weighted": log_prior_weighted, - "acquisition_sampler": acquisition_sampler, - } # counting non-fidelity dimensions in search space ndims = sum( 1 @@ -531,7 +523,17 @@ def __init__( ) n_min = ndims + 1 self.init_size = n_min + 1 # in BOHB: init_design >= N_min + 2 - self.model_policy = model_policy(pipeline_space, **bo_args) + + if self.use_priors: + parameters = { + **self.pipeline_space.numerical, + **self.pipeline_space.categoricals, + } + prior = Prior.from_parameters(parameters.values()) + else: + prior = None + + self.model_policy = model_policy(pipeline_space, prior=prior) for _, sh in self.sh_brackets.items(): sh.model_policy = self.model_policy diff --git a/neps/optimizers/multi_fidelity/ifbo.py b/neps/optimizers/multi_fidelity/ifbo.py index 9c72ae46..bd0014a2 100755 --- a/neps/optimizers/multi_fidelity/ifbo.py +++ b/neps/optimizers/multi_fidelity/ifbo.py @@ -139,6 +139,7 @@ def __init__( self._prior = Prior.from_parameters(params.values()) if use_priors else None self._config_encoder: ConfigEncoder = ConfigEncoder.default( params, + constants=self.pipeline_space.constants, # FTPFN doesn't support categoricals and we were recomenned to just evenly distribute # in the unit norm custom_transformers={ @@ -154,7 +155,7 @@ def __init__( self._fid_domain = space.fidelity.domain # Domain in which we should pass budgets to ifbo model - self._budget_domain = Domain.float(1 / self._max_budget, 1) + self._budget_domain = Domain.floating(1 / self._max_budget, 1) # Domain from which we assign an index to each budget self._budget_ix_domain = Domain.indices(fid_bins) @@ -185,7 +186,9 @@ def ask( ) if new_id < len(self._initial_design): - return SampledConfig(id=f"{new_id}_0", config=self._initial_design[new_id]) + config = self._initial_design[new_id] + config[self._fidelity_name] = self._min_budget + return SampledConfig(id=f"{new_id}_0", config=config) # Otherwise, we proceed to surrogate phase ftpfn = FTPFNSurrogate( diff --git a/neps/optimizers/multi_fidelity/sampling_policy.py b/neps/optimizers/multi_fidelity/sampling_policy.py index bc35f300..784e067b 100644 --- a/neps/optimizers/multi_fidelity/sampling_policy.py +++ b/neps/optimizers/multi_fidelity/sampling_policy.py @@ -3,7 +3,7 @@ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any, Mapping +from typing import TYPE_CHECKING from botorch.acquisition import ( AcquisitionFunction, @@ -17,16 +17,9 @@ import pandas as pd import torch -from neps.optimizers.bayesian_optimization.acquisition_functions import AcquisitionMapping from neps.optimizers.bayesian_optimization.acquisition_functions.pibo import ( pibo_acquisition, ) -from 
neps.optimizers.bayesian_optimization.acquisition_functions.prior_weighted import ( - DecayingPriorWeightedAcquisition, -) -from neps.optimizers.bayesian_optimization.acquisition_samplers import ( - AcquisitionSamplerMapping, -) from neps.optimizers.bayesian_optimization.models.gp import make_default_single_obj_gp from neps.optimizers.multi_fidelity_prior.utils import ( compute_config_dist, @@ -36,15 +29,8 @@ ) from neps.sampling.priors import Prior from neps.search_spaces.encoding import ConfigEncoder -from neps.utils.common import instance_from_map if TYPE_CHECKING: - from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( - BaseAcquisition, - ) - from neps.optimizers.bayesian_optimization.acquisition_samplers.base_acq_sampler import ( - AcquisitionSampler, - ) from neps.search_spaces.search_space import SearchSpace TOLERANCE = 1e-2 # 1% @@ -291,8 +277,6 @@ def __init__( use_cost: bool = False, device: torch.device | None = None, ): - if prior: - raise NotImplementedError("Priors are not implemented yet.") if use_cost: raise NotImplementedError("Cost is not implemented yet.") @@ -300,7 +284,8 @@ def __init__( self.device = device self.prior = prior self._encoder = ConfigEncoder.default( - {**pipeline_space.numerical, **pipeline_space.categoricals} + {**pipeline_space.numerical, **pipeline_space.categoricals}, + constants=pipeline_space.constants, ) self._model: SingleTaskGP | None = None self._acq: AcquisitionFunction | None = None @@ -316,6 +301,8 @@ def update_model( x_pending = self._encoder.encode([config.hp_values() for config in pending_x]) y_train = torch.tensor(train_y, dtype=torch.float64, device=self.device) + # TODO: Most of this just copies BO and the duplication can be replaced + # once we don't have the two stage `update_model()` and `sample()` y_model = make_default_single_obj_gp(x_train, y_train, encoder=self._encoder) fit_gpytorch_mll( @@ -344,7 +331,6 @@ def update_model( prior=self.prior, prior_exponent=pibo_exp_term, x_domain=self._encoder.domains, - x_pending=x_pending, ) self._y_model = y_model diff --git a/neps/optimizers/multi_fidelity_prior/async_priorband.py b/neps/optimizers/multi_fidelity_prior/async_priorband.py index 0a859dec..a4963dce 100644 --- a/neps/optimizers/multi_fidelity_prior/async_priorband.py +++ b/neps/optimizers/multi_fidelity_prior/async_priorband.py @@ -13,6 +13,7 @@ AsynchronousSuccessiveHalvingWithPriors, ) from neps.optimizers.multi_fidelity_prior.priorband import PriorBandBase +from neps.sampling.priors import Prior if typing.TYPE_CHECKING: from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( @@ -121,7 +122,12 @@ def __init__( self.init_size = n_min + 1 # in BOHB: init_design >= N_dim + 2 if self.modelling_type == "joint" and self.initial_design_size is not None: self.init_size = self.initial_design_size - self.model_policy = model_policy(pipeline_space, **bo_args) + + parameters = {**self.pipeline_space.numerical, **self.pipeline_space.categoricals} + self.model_policy = model_policy( + pipeline_space, + prior=Prior.from_parameters(parameters.values()), + ) def get_config_and_ids( self, diff --git a/neps/optimizers/multi_fidelity_prior/priorband.py b/neps/optimizers/multi_fidelity_prior/priorband.py index f4bc067b..6b2f84be 100644 --- a/neps/optimizers/multi_fidelity_prior/priorband.py +++ b/neps/optimizers/multi_fidelity_prior/priorband.py @@ -15,6 +15,7 @@ compute_scores, get_prior_weight_for_decay, ) +from neps.sampling.priors import Prior if typing.TYPE_CHECKING: 
from neps.optimizers.bayesian_optimization.acquisition_functions.base_acquisition import ( @@ -335,13 +336,6 @@ def __init__( }, } - bo_args = { - "surrogate_model": surrogate_model, - "surrogate_model_args": surrogate_model_args, - "acquisition": acquisition, - "log_prior_weighted": log_prior_weighted, - "acquisition_sampler": acquisition_sampler, - } self.model_based = model_based self.modelling_type = modelling_type self.initial_design_size = initial_design_size @@ -355,7 +349,11 @@ def __init__( self.init_size = n_min + 1 # in BOHB: init_design >= N_min + 2 if self.modelling_type == "joint" and self.initial_design_size is not None: self.init_size = self.initial_design_size - self.model_policy = model_policy(pipeline_space, **bo_args) + parameters = {**self.pipeline_space.numerical, **self.pipeline_space.categoricals} + self.model_policy = model_policy( + pipeline_space, + prior=Prior.from_parameters(parameters.values()), + ) for _, sh in self.sh_brackets.items(): sh.sampling_policy = self.sampling_policy diff --git a/neps/sampling/distributions.py b/neps/sampling/distributions.py index f865d173..946bca77 100644 --- a/neps/sampling/distributions.py +++ b/neps/sampling/distributions.py @@ -98,22 +98,22 @@ def __init__( ) self._entropy = CONST_LOG_SQRT_2PI_E + self._log_Z - 0.5 * self._lpbb_m_lpaa_d_Z - @constraints.dependent_property - @override + @constraints.dependent_property # type: ignore + @override # type: ignore def support(self) -> constraints._Interval: return constraints.interval(self.a, self.b) @property - @override + @override # type: ignore def mean(self) -> torch.Tensor: return self._mean @property - @override + @override # type: ignore def variance(self) -> torch.Tensor: return self._variance - @override + @override # type: ignore def entropy(self) -> torch.Tensor: return self._entropy @@ -129,25 +129,25 @@ def _big_phi(self, x: torch.Tensor) -> torch.Tensor: def _inv_big_phi(x: torch.Tensor) -> torch.Tensor: return CONST_SQRT_2 * (2 * x - 1).erfinv() - @override + @override # type: ignore def cdf(self, value: torch.Tensor) -> torch.Tensor: if self._validate_args: self._validate_sample(value) return ((self._big_phi(value) - self._big_phi_a) / self._Z).clamp(0, 1) - @override + @override # type: ignore def icdf(self, value: torch.Tensor) -> torch.Tensor: y = self._big_phi_a + value * self._Z y = y.clamp(self.eps, 1 - self.eps) return self._inv_big_phi(y) - @override + @override # type: ignore def log_prob(self, value: torch.Tensor) -> torch.Tensor: if self._validate_args: self._validate_sample(value) return CONST_LOG_INV_SQRT_2PI - self._log_Z - (value**2) * 0.5 - @override + @override # type: ignore def rsample(self, sample_shape: torch.Size | None = None) -> torch.Tensor: if sample_shape is None: sample_shape = torch.Size([]) @@ -199,18 +199,18 @@ def __init__( self._variance = self._variance * self.scale**2 self._entropy += self._log_scale - def _to_std_rv(self, value): + def _to_std_rv(self, value: torch.Tensor) -> torch.Tensor: return (value - self.loc) / self.scale - def _from_std_rv(self, value): + def _from_std_rv(self, value: torch.Tensor) -> torch.Tensor: return value * self.scale + self.loc @override - def cdf(self, value): + def cdf(self, value: torch.Tensor) -> torch.Tensor: return super().cdf(self._to_std_rv(value)) @override - def icdf(self, value): + def icdf(self, value: torch.Tensor) -> torch.Tensor: sample = self._from_std_rv(super().icdf(value)) # clamp data but keep gradients @@ -224,7 +224,7 @@ def icdf(self, value): return sample @override - def 
log_prob(self, value): + def log_prob(self, value: torch.Tensor) -> torch.Tensor: value = self._to_std_rv(value) return super().log_prob(value) - self._log_scale @@ -240,7 +240,7 @@ class UniformWithUpperBound(Uniform): # OPTIM: This could probably be optimized a lot but I'm not sure how it effects # gradients. Could probably do a different path depending on if `value` requires # gradients or not. - @override + @override # type: ignore def log_prob(self, value: torch.Tensor) -> torch.Tensor: if self._validate_args: self._validate_sample(value) @@ -252,6 +252,8 @@ def log_prob(self, value: torch.Tensor) -> torch.Tensor: @dataclass class TorchDistributionWithDomain: + """A torch distribution with an associated domain it samples over.""" + distribution: Distribution domain: Domain diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index 4a77ca79..e17db04b 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -289,7 +289,7 @@ class CenteredPrior(Prior): _meaningful_doms: list[Domain] = field(init=False) _meaningful_dists: list[Distribution] = field(init=False) - def __post_init__(self): + def __post_init__(self) -> None: self._distribution_domains = [dist.domain for dist in self.distributions] rest: list[tuple[int, Domain, Distribution]] = [] @@ -303,8 +303,8 @@ def __post_init__(self): self._meaningful_dists = [] return - self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip( - *rest, strict=False + self._meaningful_ixs, self._meaningful_doms, self._meaningful_dists = zip( # type: ignore + *rest, strict=True ) @property @@ -429,7 +429,7 @@ class WeightedPrior(Prior): _weighted_sampler: WeightedSampler = field(init=False, repr=False) - def __post_init__(self): + def __post_init__(self) -> None: from neps.sampling.samplers import WeightedSampler self._weighted_sampler = WeightedSampler( diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index dcf369a9..cf1c1e7a 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -49,6 +49,7 @@ def sample( domain. If a list of domains, then it must have the same length as the number of columns, with each column being in the corresponding domain. seed: The seed generator + dtype: The dtype of the output tensor. device: The device to cast the samples to. Returns: @@ -120,7 +121,7 @@ class Sobol(Sampler): scramble: bool = True """Whether to scramble the Sobol sequence.""" - def __post_init__(self): + def __post_init__(self) -> None: if self.ndim < 1: raise ValueError( "The number of dimensions must be at least 1." @@ -181,7 +182,7 @@ class WeightedSampler(Sampler): probabilities: torch.Tensor = field(init=False, repr=False) """The probabilities for each sampler. Normalized weights.""" - def __post_init__(self): + def __post_init__(self) -> None: if len(self.samplers) < 2: raise ValueError( f"At least two samplers must be given. 
Got {len(self.samplers)}" diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index cc360a1c..6c6c1b75 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -107,7 +107,7 @@ class Domain(Generic[V]): bounds: tuple[V, V] = field(init=False, repr=False) preffered_dtype: torch.dtype = field(init=False, repr=False) - def __post_init__(self): + def __post_init__(self) -> None: assert isinstance(self.lower, type(self.upper)) is_int = isinstance(self.lower, int) object.__setattr__(self, "is_int", is_int) @@ -144,7 +144,7 @@ def __post_init__(self): object.__setattr__(self, "bounds", (self.lower, self.upper)) @classmethod - def float( + def floating( cls, lower: Number, upper: Number, @@ -175,7 +175,7 @@ def float( ) @classmethod - def int( + def integer( cls, lower: Number, upper: Number, @@ -219,7 +219,7 @@ def indices(cls, n: int, *, is_categorical: bool = False) -> Domain[int]: Returns: A domain for a range of indices. """ - return Domain.int(0, n - 1, is_categorical=is_categorical) + return Domain.integer(0, n - 1, is_categorical=is_categorical) def next_value(self, x: Tensor) -> Tensor: """Get the next value for a tensor of values.""" @@ -456,4 +456,4 @@ def as_integer_domain(self) -> Domain: return Domain.indices(self.cardinality, is_categorical=self.is_categorical) -UNIT_FLOAT_DOMAIN = Domain.float(0.0, 1.0) +UNIT_FLOAT_DOMAIN = Domain.floating(0.0, 1.0) diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index 01c37720..c20f60b3 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -1,3 +1,11 @@ +"""Encoding of hyperparameter configurations into tensors. + +For the most part, you can just use +[`ConfigEncoder.default()`][neps.search_spaces.encoding.ConfigEncoder.default] +to create an encoder over a list of hyperparameters, along with any constants you +want to include when decoding configurations. +""" + from __future__ import annotations from collections.abc import Mapping, Sequence @@ -17,16 +25,11 @@ WLInput: TypeAlias = tuple[dict, dict | None, dict | None] V = TypeVar("V", int, float) -T = TypeVar("T") - -class Transformer(Protocol[T]): - def encode(self, x: Sequence[Any]) -> T: ... - def decode(self, x: T) -> list[Any]: ... +class TensorTransformer(Protocol): + """A protocol for encoding and decoding hyperparameter values into tensors.""" - -class TensorTransformer(Transformer[torch.Tensor], Protocol): domain: Domain def encode( @@ -36,20 +39,45 @@ def encode( out: torch.Tensor | None = None, dtype: torch.dtype | None = None, device: torch.device | None = None, - ) -> torch.Tensor: ... + ) -> torch.Tensor: + """Encode a sequence of hyperparameter values into a tensor. + + Args: + x: A sequence of hyperparameter values. + out: An optional tensor to write the encoded values to. + dtype: The dtype of the tensor. + device: The device of the tensor. + + Returns: + The encoded tensor. + """ + ... + + def decode(self, x: torch.Tensor) -> list[Any]: + """Decode a tensor of hyperparameter values into a sequence of values. + + Args: + x: A tensor of hyperparameter values. + + Returns: + A sequence of hyperparameter values. + """ + ... 
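To make the `TensorTransformer` protocol above concrete, here is a minimal sketch of a custom transformer of the kind that could be handed to `ConfigEncoder.default(...)` through its `custom_transformers=` argument. It is illustrative only: the `BoolTransformer` name, the `"flag"` parameter used below, and the 0/1 encoding are assumptions made for the example, not part of this patch.

from collections.abc import Sequence
from dataclasses import dataclass, field
from typing import Any

import torch

from neps.search_spaces.domain import Domain


@dataclass
class BoolTransformer:  # hypothetical example transformer, not included in this patch
    """Encode booleans as 0/1 tensors, matching the TensorTransformer protocol."""

    domain: Domain = field(default_factory=lambda: Domain.integer(0, 1))

    def encode(
        self,
        x: Sequence[Any],
        *,
        out: torch.Tensor | None = None,
        dtype: torch.dtype | None = None,
        device: torch.device | None = None,
    ) -> torch.Tensor:
        # Map truthy/falsy values to 1.0/0.0 in the requested dtype and device.
        values = torch.tensor(
            [1.0 if v else 0.0 for v in x],
            dtype=dtype if dtype is not None else torch.float64,
            device=device,
        )
        if out is not None:
            out.copy_(values)  # write into the caller-provided buffer
            return out
        return values

    def decode(self, x: torch.Tensor) -> list[Any]:
        # Round back to the nearest of {0, 1} and convert to Python bools.
        return [bool(round(float(v))) for v in x]

Used this way, `ConfigEncoder.default(params, custom_transformers={"flag": BoolTransformer()})` would override the default transformer for a single parameter, mirroring how the `ifbo.py` hunk above passes its own `custom_transformers=` mapping when building its encoder.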
@dataclass class CategoricalToIntegerTransformer(TensorTransformer): + """A transformer that encodes categorical values into integers.""" + choices: Sequence[Any] domain: Domain = field(init=False) _lookup: dict[Any, int] | None = field(init=False) - def __post_init__(self): + def __post_init__(self) -> None: assert len(self.choices) > 0 - self.domain = Domain.indices(len(self.choices), categorical=True) + self.domain = Domain.indices(len(self.choices), is_categorical=True) self._lookup = None if len(self.choices) > 3: try: @@ -89,13 +117,23 @@ def decode(self, x: torch.Tensor) -> list[Any]: @dataclass class CategoricalToUnitNorm(TensorTransformer): + """A transformer that encodes categorical values into a unit normalized tensor. + + If there are `n` choices, the tensor will have `n` bins between `0` and `1`. + """ + choices: Sequence[Any] domain: Domain = field(init=False) _integer_transformer: CategoricalToIntegerTransformer = field(init=False) - def __post_init__(self): - self._domain = Domain.float(0.0, 1.0, bins=len(self.choices), categorical=True) + def __post_init__(self) -> None: + self.domain = Domain.floating( + 0.0, + 1.0, + bins=len(self.choices), + is_categorical=True, + ) self._integer_transformer = CategoricalToIntegerTransformer(self.choices) @override @@ -130,21 +168,23 @@ def decode(self, x: torch.Tensor) -> list[Any]: # and `-0.5` as lower bound with `0.5` as upper bound. @dataclass class MinMaxNormalizer(TensorTransformer, Generic[V]): + """A transformer that normalizes values to the unit interval.""" + original_domain: Domain[V] bins: int | None = None domain: Domain[float] = field(init=False) - def __post_init__(self): + def __post_init__(self) -> None: if self.bins is None: self.domain = UNIT_FLOAT_DOMAIN else: - self.domain = Domain.float(0.0, 1.0, bins=self.bins) + self.domain = Domain.floating(0.0, 1.0, bins=self.bins) @override def encode( self, - x: list[V], + x: Sequence[V], *, out: torch.Tensor | None = None, dtype: torch.dtype | None = None, @@ -172,13 +212,41 @@ def decode(self, x: torch.Tensor) -> list[V]: @dataclass class ConfigEncoder: + """An encoder for hyperparameter configurations. + + This class is used to encode and decode hyperparameter configurations into tensors + and back. Its main uses currently are to support surrogate models that require + tensors. + + The primary methods/properties to be aware of are: + * [`default()`][neps.search_spaces.encoding.ConfigEncoder.default]: Create a default + encoder over a list of hyperparameters. Please see the method docs for more + details on how it encodes different types of hyperparameters. + * [`encode()`][neps.search_spaces.encoding.ConfigEncoder.encode]: Encode a list of + configurations into a single tensor using the transforms of the encoder. + * [`decode()`][neps.search_spaces.encoding.ConfigEncoder.decode]: Decode a 2d tensor + with `N` rows into a list of `N` configurations. + * [`domains`][neps.search_spaces.encoding.ConfigEncoder.domains]: The + [`Domain`][neps.search_spaces.domain.Domain] that each hyperparameter is encoded + into. This is useful in combination with classes like + [`Sampler`][neps.sampling.samplers.Sampler], + [`Prior`][neps.sampling.priors.Prior], and + [`TorchDistributionWithDomain`][neps.sampling.distributions.TorchDistributionWithDomain], + which require knowledge of the + domains of each column for the tensor, for example, to sample values directly + into the encoded space, or to get log probabilities of the encoded values. 
+ * [`ncols`][neps.search_spaces.encoding.ConfigEncoder.ncols]: The number of columns + in the encoded tensor, useful for initializing some `Sampler`s. + """ + transformers: dict[str, TensorTransformer] index_of: dict[str, int] = field(init=False) domain_of: dict[str, Domain] = field(init=False) + constants: Mapping[str, Any] = field(default_factory=dict) n_numerical: int = field(init=False) n_categorical: int = field(init=False) - def __post_init__(self): + def __post_init__(self) -> None: transformers = sorted(self.transformers.items(), key=lambda t: t[0]) self.transformers = dict(transformers) @@ -197,21 +265,14 @@ def __post_init__(self): @property def ncols(self) -> int: + """The number of columns in the encoded tensor.""" return len(self.transformers) @property def domains(self) -> list[Domain]: + """The domains of the encoded hyperparameters.""" return list(self.domain_of.values()) - def names(self) -> list[str]: - return list(self.transformers.keys()) - - def select(self, x: torch.Tensor, hp: str | Sequence[str]) -> torch.Tensor: - if isinstance(hp, str): - return x[:, self.index_of[hp]] - - return x[:, [self.index_of[h] for h in hp]] - def encode( self, x: Sequence[Mapping[str, Any]], @@ -219,6 +280,26 @@ def encode( device: torch.device | None = None, dtype: torch.dtype | None = None, ) -> torch.Tensor: + """Encode a list of hyperparameter configurations into a tensor. + + !!! warning "Constants" + + Constants included in configurations will not be encoded into the tensor, + but are included when decoding. + + !!! warning "Parameters with no transformers" + + Any parameters in the configurations, whose key is not in + `self.transformers`, will be ignored. + + Args: + x: A list of hyperparameter configurations. + device: The device of the tensor. + dtype: The dtype of the tensor. + + Returns: + A tensor of shape `(len(x), ncols)` containing the encoded configurations. + """ dtype = torch.float64 if dtype is None else dtype width = len(self.transformers) buffer = torch.empty((len(x), width), dtype=dtype, device=device) @@ -238,12 +319,25 @@ def encode( return buffer def decode(self, x: torch.Tensor) -> list[dict[str, Any]]: + """Decode a tensor of hyperparameter configurations into a list of configurations. + + Args: + x: A tensor of shape `(N, ncols)` containing the encoded configurations. + + Returns: + A list of `N` configurations, including any constants that were included + when creating the encoder. + """ values: dict[str, list[Any]] = {} + N = len(x) for hp_name, transformer in self.transformers.items(): lookup = self.index_of[hp_name] tensor = x[:, lookup] values[hp_name] = transformer.decode(tensor) + constants = {name: [v] * N for name, v in self.constants.items()} + values.update(constants) + keys = list(values.keys()) return [ dict(zip(keys, vals, strict=False)) @@ -254,9 +348,28 @@ def decode(self, x: torch.Tensor) -> list[dict[str, Any]]: def default( cls, parameters: Mapping[str, Parameter], + constants: Mapping[str, Any] | None = None, *, custom_transformers: dict[str, TensorTransformer] | None = None, ) -> ConfigEncoder: + """Create a default encoder over a list of hyperparameters. + + This method creates a default encoder over a list of hyperparameters. It + automatically creates transformers for each hyperparameter based on its type. + The transformers are as follows: + + * `FloatParameter` and `IntegerParameter` are normalized to the unit interval. + * `CategoricalParameter` is transformed into an integer. 
+ + Args: + parameters: A mapping of hyperparameter names to hyperparameters. + constants: A mapping of constant hyperparameters to include when decoding. + custom_transformers: A mapping of hyperparameter names to custom transformers. + + Returns: + A `ConfigEncoder` instance + """ + constants = constants or {} custom = custom_transformers or {} sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} @@ -267,10 +380,10 @@ def default( match hp: case FloatParameter() | IntegerParameter(): - transformers[name] = MinMaxNormalizer(hp.domain) + transformers[name] = MinMaxNormalizer(hp.domain) # type: ignore case CategoricalParameter(): transformers[name] = CategoricalToIntegerTransformer(hp.choices) case _: raise ValueError(f"Unsupported parameter type: {type(hp)}") - return ConfigEncoder(transformers) + return ConfigEncoder(transformers, constants=constants) diff --git a/neps/search_spaces/hyperparameters/float.py b/neps/search_spaces/hyperparameters/float.py index a0ab83e8..bf7768c4 100644 --- a/neps/search_spaces/hyperparameters/float.py +++ b/neps/search_spaces/hyperparameters/float.py @@ -72,7 +72,7 @@ def __init__( default=float(default) if default is not None else None, default_confidence=default_confidence, is_fidelity=is_fidelity, - domain=Domain.float(lower, upper, log=log), + domain=Domain.floating(lower, upper, log=log), ) @override diff --git a/neps/search_spaces/hyperparameters/integer.py b/neps/search_spaces/hyperparameters/integer.py index 2f534323..fdebcce7 100644 --- a/neps/search_spaces/hyperparameters/integer.py +++ b/neps/search_spaces/hyperparameters/integer.py @@ -77,7 +77,7 @@ def __init__( is_fidelity=is_fidelity, default=int(np.rint(default)) if default is not None else None, default_confidence=default_confidence, - domain=Domain.int(lower, upper, log=log), + domain=Domain.integer(lower, upper, log=log), ) # We subtract/add 0.499999 from lower/upper bounds respectively, such that diff --git a/neps/search_spaces/hyperparameters/numerical.py b/neps/search_spaces/hyperparameters/numerical.py index 2e98de9f..f403feb0 100644 --- a/neps/search_spaces/hyperparameters/numerical.py +++ b/neps/search_spaces/hyperparameters/numerical.py @@ -94,6 +94,7 @@ def __init__( log: Whether the hyperparameter is in log space. default: The default value of the hyperparameter. is_fidelity: Whether the hyperparameter is a fidelity parameter. + domain: The domain of the hyperparameter. default_confidence: The default confidence choice. 
""" super().__init__(value=None, default=default, is_fidelity=is_fidelity) # type: ignore @@ -136,7 +137,7 @@ def __init__( self.lower: T = lower self.upper: T = upper self.log: bool = log - self.domain = domain + self.domain: Domain[T] = domain self.log_value: float | None = None self.log_bounds: tuple[float, float] | None = None self.log_default: float | None = None diff --git a/neps/search_spaces/neighborhoods.py b/neps/search_spaces/neighborhoods.py deleted file mode 100644 index 91c34a6f..00000000 --- a/neps/search_spaces/neighborhoods.py +++ /dev/null @@ -1,281 +0,0 @@ -from __future__ import annotations - -from typing import TypeVar - -import numpy as np - -from neps.search_spaces.domain import Domain -from neps.utils.types import Arr, f64, i64 - -V = TypeVar("V", f64, i64) - -UNIQUE_NEIGHBOR_GENERATOR_N_RETRIES = 8 -UNIQUE_NEIGHBOR_GENERATOR_SAMPLE_MULTIPLIER = 4 - -NON_UNIQUE_NEIGHBORS_N_RETRIES = 8 -NON_UNIQUE_NEIGHBORS_SAMPLE_MULTIPLIER = 4 - -# Small enough but prevents needing to keep re-allocating temporary memory -# 50 * 8 = 400 bytes -_SMALL = 50 -_SMALL_CACHED_ARANGE = np.arange(_SMALL, dtype=i64) - - -def unorded_finite_neighbors( - pivot: V, - domain: Domain[V], - *, - n: int, - seed: np.random.Generator, -) -> Arr[V]: - N = domain.cardinality - assert N is not None, "Domain must be finite." - if N <= _SMALL: - full_range = _SMALL_CACHED_ARANGE[: domain.cardinality] - else: - full_range = np.arange(N, dtype=i64) - - range_domain = Domain.indices(N) - _pivot = range_domain.cast(pivot, frm=domain) - - left = full_range[:_pivot] - right = full_range[_pivot + 1 :] - _range = np.concatenate((left, right)) - - seed.shuffle(_range) - - return domain.cast(_range[:n], frm=range_domain) - - -def neighbors( - pivot: V, - domain: Domain[V], - *, - n: int, - std: float, - seed: np.random.Generator, - n_retries: int = NON_UNIQUE_NEIGHBORS_N_RETRIES, - sample_multiplier: int = NON_UNIQUE_NEIGHBORS_SAMPLE_MULTIPLIER, -) -> Arr[V]: - """Create a neighborhood of `n` neighbors around `pivot` with a normal distribution. - - If you need unique neighbors, you should use - [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood]. - - !!! tip - - [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood] - is quite expensive in certain situations as it has to repeatedly sample and check - for uniqueness. If you can afford duplicates, use this function instead. - - If [`domain.cardinality == None`][neps.search_spaces.domain.Domain.cardinality], - and you can afford an infentesimally small percentage change of duplicates, - you should use this function instead. - - !!! warning - - It is up to the caller to ensure that the pivot lies within the domain, - including at one of the bins if the domain is quantized. - - Args: - pivot: The center of the neighborhood. - domain: The domain to get neighbors from. - n: The number of neighbors to generate. - std: The standard deviation of the normal distribution. - seed: The random seed to use. - n_retries: - The number of retries to attempt to generate unique neighbors. - Each retry increases the standard deviation of the normal distribution to - prevent rejection sampling from failing. - sample_multiplier: - A multiplier which multiplies by `n` to determine the number of samples to - generate for try. By oversampling, we prevent having to repeated calls to - sampling. This prevents having to do more rounds of sampling when too many - samples are out of bounds, useful for when the `pivot` is near the bounds. 
- - Tuning this may be beneficial in unique circumstances, however we advise - leaving this as a default. - - Returns: - An array of `n` neighbors around `pivot`. - """ - # Generate batches of n * BUFFER_MULTIPLIER candidates, filling the above - # buffer until we have enough valid candidates. - # We should not overflow as the buffer - offset = 0 - SAMPLE_SIZE = n * sample_multiplier - BUFFER_SIZE = (n + 1) * sample_multiplier - - # We extend the range of stds to try to find neighbors - neighbors: Arr[V] = np.empty(BUFFER_SIZE, dtype=domain.dtype) - stds = np.linspace(std, 1.0, n_retries + 1, endpoint=True) - - lower = domain.lower - upper = domain.upper - range_size = upper - lower - sample_domain = Domain.float(lower, upper) - - for _std in stds: - candidates = seed.normal(pivot, _std * range_size, size=(SAMPLE_SIZE,)) - - bounded_candidates = candidates[(candidates >= lower) & (candidates <= upper)] - maybe_valid = domain.cast(bounded_candidates, frm=sample_domain) - - # High chance of overlap with original point if there's a finite amount of - # possible elements - if domain.cardinality is not None: - valid = maybe_valid[maybe_valid != pivot] - else: - valid = maybe_valid - - n_candidates = len(valid) - neighbors[offset : offset + n_candidates] = valid - offset += n_candidates - - if offset >= n: - return neighbors[:n] - - raise ValueError( - f"Failed to find enough neighbors with {n_retries} retries." - f" Given {n} neighbors, we only found {offset}." - f" The `Normals` for sampling neighbors were" - f" Normal(mu={pivot}, sigma={list(stds)})" - f" which were meant to find vectorized neighbors of the vector {pivot}," - " which was expected to be in the range" - f" ({lower}, {lower}).", - ) - - -def unique_neighborhood( - pivot: V, - domain: Domain[V], - *, - n: int, - seed: np.random.Generator, - std: float, - n_retries: int = UNIQUE_NEIGHBOR_GENERATOR_N_RETRIES, - sample_multiplier: int = UNIQUE_NEIGHBOR_GENERATOR_SAMPLE_MULTIPLIER, -) -> Arr[V]: - """Create a neighborhood of `n` neighbors around `pivot` with a normal distribution. - - The neighborhood is created by sampling from a normal distribution centered around - `pivot` with a standard deviation of `std`. The samples are then quantized to the - range `[lower, upper]` with `bins` bins. The number of samples is `n`. - - !!! tip - - [`unique_neighborhood`][neps.search_spaces.neighborhoods.unique_neighborhood] - is quite expensive in certain situations as it has to repeatedly sample and check - for uniqueness. If you can afford duplicates, use this function instead. - - If [`domain.cardinality == None`][neps.search_spaces.domain.Domain.cardinality], - and you can afford an infentesimally small percentage change of duplicates, - you should use [`neighbors`][neps.search_spaces.neighborhoods.neighbors] instead. - - !!! warning - - If there are not enough unique neighbors to sample from, the function will - return less than `n` neighbors. - - !!! warning - - It is up to the caller to ensure that the pivot lies within the domain, - including at one of the bins if the domain is quantized. - - - Args: - pivot: The center of the neighborhood. - domain: The domain to get neighbors from. - n: The number of neighbors to generate. - std: The standard deviation of the normal distribution. - seed: The random seed to use. - n_retries: - The number of retries to attempt to generate unique neighbors. - Each retry increases the standard deviation of the normal distribution to prevent - rejection sampling from failing. 
- sample_multiplier: - A multiplier which multiplies by `n` to determine the number of samples to - generate for try. By oversampling, we prevent having to repeated calls to - both sampling and unique checking. - - However, oversampling makes a tradeoff when the `std` is not high enough to - generate `n` unique neighbors, effectively sampling more of the same duplicates. - - Tuning this may be beneficial in unique circumstances, however we advise leaving - this as a default. - - Returns: - An array of `n` neighbors around `pivot`, or less than `n` if not enough unique - neighbors could be generated. - """ # noqa: E501 - # Different than other neighborhoods as it's unnormalized and - # the quantization is directly integers. - assert n < 1000000, "Can only generate less than 1 million neighbors." - assert 0 < std < 1.0, "Standard deviation must be in the range (0, 1)." - lower = domain.lower - upper = domain.upper - - # In the easiest case, we have a domain with finite elements and we need - # more neighbors than are possible. We then generate all of them. - # We can do this simply with a range and removing the pivot. - if domain.cardinality is not None and n >= domain.cardinality - 1: - range_domain = Domain.indices(domain.cardinality) - int_pivot = range_domain.cast(pivot, frm=domain) - - if int_pivot == 0: - _range = np.arange(1, domain.cardinality, dtype=i64) - return domain.cast(_range, frm=range_domain) - - if int_pivot == domain.cardinality - 1: - _range = np.arange(0, domain.cardinality - 1, dtype=i64) - return domain.cast(_range, frm=range_domain) - - left = np.arange(0, int_pivot, dtype=i64) - right = np.arange(int_pivot + 1, domain.cardinality, dtype=i64) - _range = np.concatenate((left, right)) - - return domain.cast(_range, frm=range_domain) - - # Otherwise, we use a repeated sampling strategy where we slowly increase the - # std of a normal, centered on `center`, slowly expanding `std` such that - # rejection won't fail. - - # We set up a buffer that can hold the number of neighbors we need, plus some - # extra excess from sampling, preventing us from having to reallocate memory. - # We also include the initial value in the buffer, as we will remove it later. - SAMPLE_SIZE = n * sample_multiplier - BUFFER_SIZE = n * (sample_multiplier + 1) - neighbors = np.empty(BUFFER_SIZE + 1, dtype=domain.dtype) - neighbors[0] = pivot - offset = 1 # Indexes into current progress of filling buffer - stds = np.linspace(std, 1.0, n_retries + 1, endpoint=True) - sample_domain = Domain.float(lower, upper) - - range_size = upper - lower - for _std in stds: - # Generate candidates in vectorized space - candidates = seed.normal(pivot, _std * range_size, size=SAMPLE_SIZE) - valid = (candidates >= lower) & (candidates <= upper) - - candidates = domain.cast(x=candidates[valid], frm=sample_domain) - - # Find new unique neighbors - uniq = np.unique(candidates) - new_uniq = np.setdiff1d(uniq, neighbors[:offset], assume_unique=True) - - n_new_unique = len(new_uniq) - neighbors[offset : offset + n_new_unique] = new_uniq - offset += n_new_unique - - # We have enough neighbors, we can stop - if offset - 1 >= n: - # Ensure we don't include the initial value point - return neighbors[1 : n + 1] - - raise ValueError( - f"Failed to find enough neighbors with {n_retries} retries." - f" Given {n=} neighbors to generate, we only found {offset - 1}." - f" The normal's for sampling neighbors were Normal({pivot}, {list(stds)})" - f" which were meant to find neighbors of {pivot}. 
in the range" - f" ({lower}, {upper}).", - ) diff --git a/neps/state/optimizer.py b/neps/state/optimizer.py index 07155015..92accddb 100644 --- a/neps/state/optimizer.py +++ b/neps/state/optimizer.py @@ -24,6 +24,7 @@ def remaining_cost_budget(self) -> float | None: return self.max_cost_budget - self.used_cost_budget def clone(self) -> BudgetInfo: + """Create a copy of the budget info.""" return BudgetInfo( max_cost_budget=self.max_cost_budget, used_cost_budget=self.used_cost_budget, diff --git a/neps/utils/common.py b/neps/utils/common.py index d0fb2137..643ecc6d 100644 --- a/neps/utils/common.py +++ b/neps/utils/common.py @@ -3,16 +3,11 @@ from __future__ import annotations import inspect -import random from collections.abc import Iterable, Mapping, Sequence from functools import partial from pathlib import Path from typing import Any -from functools import partial -from pathlib import Path -from typing import Any, Iterable, Mapping, Sequence -import numpy as np import torch import yaml diff --git a/neps/utils/run_args.py b/neps/utils/run_args.py index bd2664e1..9f5cc60d 100644 --- a/neps/utils/run_args.py +++ b/neps/utils/run_args.py @@ -41,7 +41,7 @@ MAX_EVALUATIONS_PER_RUN = "max_evaluations_per_run" -def get_run_args_from_yaml(path: str) -> dict: +def get_run_args_from_yaml(path: str | Path) -> dict: """Load and validate NEPS run arguments from a specified YAML configuration file provided via run_args. @@ -116,7 +116,7 @@ def get_run_args_from_yaml(path: str) -> dict: return settings -def config_loader(path: str) -> dict: +def config_loader(path: str | Path) -> dict: """Loads a YAML file and returns the contents under the 'run_args' key. Args: @@ -506,7 +506,7 @@ class Settings: arguments (run_args (yaml) and neps func_args). """ - def __init__(self, func_args: dict, yaml_args: str | Default | None = None): + def __init__(self, func_args: dict, yaml_args: Path | str | Default | None = None): """Initializes the Settings object by merging function arguments with YAML configuration settings and assigning them to class attributes. It checks for necessary configurations and handles default values where specified. 
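Since `get_run_args_from_yaml`, `config_loader`, and `Settings` above now accept `pathlib.Path` as well as `str`, run arguments can be loaded from a `Path` directly. A minimal sketch, assuming a hypothetical YAML location whose settings sit under a top-level `run_args` key as `config_loader` expects:

from pathlib import Path

from neps.utils.run_args import get_run_args_from_yaml

# "configs/run_args.yaml" is a made-up path used only for illustration.
settings_dict = get_run_args_from_yaml(Path("configs") / "run_args.yaml")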
diff --git a/neps_examples/basic_usage/hyperparameters.py b/neps_examples/basic_usage/hyperparameters.py index 3f346949..164b49cb 100644 --- a/neps_examples/basic_usage/hyperparameters.py +++ b/neps_examples/basic_usage/hyperparameters.py @@ -2,51 +2,29 @@ import time import numpy as np -import math -import random import neps -PRINT = False - -def run_pipeline(float1, float2, float3, integer1, integer2): - if PRINT: - print("float1:", float1) - print("float2:", float2) - print("float3:", float3) - # print("categorical:", categorical) - print("integer1:", integer1) - print("integer2:", integer2) - loss = -float( - integer2 - * np.sum( - [ - (float1 * float2 / (float3 + 1)), # * (int(categorical) + 1), - integer1, - ] - ) - ) # Random noise +def run_pipeline(float1, float2, categorical, integer1, integer2): + loss = -float(np.sum([float1, float2, int(categorical), integer1, integer2])) # time.sleep(0.7) # For demonstration purposes - return {"loss": loss, "cost": float(integer2)} + return loss pipeline_space = dict( - float1=neps.FloatParameter(lower=0, upper=1, default=0.95), - float2=neps.FloatParameter(lower=0, upper=20, default=19.5), - float3=neps.FloatParameter(lower=0, upper=5, default=0.5), - # categorical=neps.CategoricalParameter(choices=[0, 1]), - integer1=neps.IntegerParameter(lower=0, upper=1, default=1), - integer2=neps.IntegerParameter(lower=1, upper=1000, log=True, default=950), + float1=neps.FloatParameter(lower=0, upper=1), + float2=neps.FloatParameter(lower=-10, upper=10), + categorical=neps.CategoricalParameter(choices=[0, 1]), + integer1=neps.IntegerParameter(lower=0, upper=1), + integer2=neps.IntegerParameter(lower=1, upper=1000, log=True), ) logging.basicConfig(level=logging.INFO) neps.run( run_pipeline=run_pipeline, - searcher="bayesian_optimization", pipeline_space=pipeline_space, root_directory="results/hyperparameters_example", post_run_summary=True, - max_evaluations_total=50, - use_priors=True, + max_evaluations_total=15, ) diff --git a/tests/test_neps_api/solution_yamls/bo_neps_decided.yaml b/tests/test_neps_api/solution_yamls/bo_neps_decided.yaml index 76935d6c..98be780b 100644 --- a/tests/test_neps_api/solution_yamls/bo_neps_decided.yaml +++ b/tests/test_neps_api/solution_yamls/bo_neps_decided.yaml @@ -3,11 +3,8 @@ searcher_alg: bayesian_optimization searcher_selection: neps-default neps_decision_tree: true searcher_args: - initial_design_size: 10 - surrogate_model: gp - acquisition: EI - log_prior_weighted: false - acquisition_sampler: mutation - random_interleave_prob: 0.0 - disable_priors: true + initial_design_size: null + use_priors: false + use_cost: false sample_default_first: false + device: null diff --git a/tests/test_neps_api/solution_yamls/pibo_neps_decided.yaml b/tests/test_neps_api/solution_yamls/pibo_neps_decided.yaml index 7d5f19da..3b7c36b2 100644 --- a/tests/test_neps_api/solution_yamls/pibo_neps_decided.yaml +++ b/tests/test_neps_api/solution_yamls/pibo_neps_decided.yaml @@ -3,12 +3,8 @@ searcher_alg: pibo searcher_selection: neps-default neps_decision_tree: true searcher_args: - initial_design_size: 10 - surrogate_model: gp - acquisition: EI - log_prior_weighted: false - acquisition_sampler: mutation - random_interleave_prob: 0.0 - disable_priors: false - prior_confidence: medium - sample_default_first: false + initial_design_size: null + use_priors: true + use_cost: false + sample_default_first: true + device: null diff --git a/tests/test_neps_api/solution_yamls/user_yaml_bo.yaml b/tests/test_neps_api/solution_yamls/user_yaml_bo.yaml index 
156d67e4..1a20bc12 100644 --- a/tests/test_neps_api/solution_yamls/user_yaml_bo.yaml +++ b/tests/test_neps_api/solution_yamls/user_yaml_bo.yaml @@ -4,11 +4,5 @@ searcher_selection: user-yaml neps_decision_tree: false searcher_args: initial_design_size: 5 - surrogate_model: gp - acquisition: EI - log_prior_weighted: false - acquisition_sampler: random - random_interleave_prob: 0.1 - disable_priors: false - prior_confidence: high - sample_default_first: false + use_priors: true + sample_default_first: true diff --git a/tests/test_neps_api/test_api.py b/tests/test_neps_api/test_api.py index b4a54940..cebbdcc5 100644 --- a/tests/test_neps_api/test_api.py +++ b/tests/test_neps_api/test_api.py @@ -26,105 +26,32 @@ def no_logs_gte_error(caplog): assert not errors -testing_scripts = [ - "default_neps", - "baseoptimizer_neps", - "user_yaml_neps", -] +HERE = Path(__file__).resolve().parent -examples_folder = Path(__file__, "..", "testing_scripts").resolve() -solution_folder = Path(__file__, "..", "solution_yamls").resolve() +testing_scripts = ["default_neps", "baseoptimizer_neps", "user_yaml_neps"] +EXAMPLES_FOLDER = HERE / "testing_scripts" +SOLUTION_FOLDER = HERE / "solution_yamls" neps_api_example_script = [ - examples_folder / f"{example}.py" for example in testing_scripts + EXAMPLES_FOLDER / f"{example}.py" for example in testing_scripts ] @pytest.mark.neps_api -def test_default_examples(tmp_path): +@pytest.mark.parametrize("example_script", neps_api_example_script) +def test_default_examples(tmp_path: Path, example_script: Path) -> None: # Running the example files holding multiple neps.run commands. - - runpy.run_path( - neps_api_example_script[0], - run_name="__main__", - ) - - # Testing each folder with its corresponding expected dictionary - for folder_name in os.listdir(tmp_path): - folder_path = os.path.join(tmp_path, folder_name) - - assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - - info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") - - assert os.path.exists( - str(info_yaml_path) - ), f"File does not exist: {info_yaml_path}\n{os.listdir(folder_path)}" - - # Load the YAML file - with open(str(info_yaml_path)) as yaml_config: - loaded_data = yaml.safe_load(yaml_config) - - with open(str(solution_folder / (folder_name + ".yaml"))) as solution_yaml: - expected_data = yaml.safe_load(solution_yaml) - - assert loaded_data == expected_data - - -@pytest.mark.neps_api -def test_baseoptimizer_examples(tmp_path): - # Running the example files holding multiple neps.run commands. 
- - runpy.run_path( - neps_api_example_script[1], - run_name="__main__", - ) + runpy.run_path(str(example_script), run_name="__main__") # Testing each folder with its corresponding expected dictionary - for folder_name in os.listdir(tmp_path): - folder_path = os.path.join(tmp_path, folder_name) - - assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - - info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") - - assert os.path.exists( - str(info_yaml_path) - ), f"File does not exist: {info_yaml_path}" - - # Load the YAML file - with open(str(info_yaml_path)) as yaml_config: - loaded_data = yaml.safe_load(yaml_config) - - with open(str(solution_folder / (folder_name + ".yaml"))) as solution_yaml: - expected_data = yaml.safe_load(solution_yaml) - - assert loaded_data == expected_data - - -@pytest.mark.neps_api -def test_user_created_yaml_examples(tmp_path): - runpy.run_path( - neps_api_example_script[2], - run_name="__main__", - ) - - # Testing each folder with its corresponding expected dictionary - for folder_name in os.listdir(tmp_path): - folder_path = os.path.join(tmp_path, folder_name) - - assert os.path.exists(folder_path), f"Directory does not exist: {folder_path}" - - info_yaml_path = os.path.join(folder_path, ".optimizer_info", "info.yaml") - - assert os.path.exists( - str(info_yaml_path) - ), f"File does not exist: {info_yaml_path}" + for folder in tmp_path.iterdir(): + info_yaml_path = folder / ".optimizer_info" / "info.yaml" - # Load the YAML file - with open(str(info_yaml_path)) as yaml_config: - loaded_data = yaml.safe_load(yaml_config) + assert info_yaml_path.exists() + loaded_data = yaml.safe_load(info_yaml_path.read_text()) - with open(str(solution_folder / (folder_name + ".yaml"))) as solution_yaml: - expected_data = yaml.safe_load(solution_yaml) + solution_yaml_path = SOLUTION_FOLDER / (folder.name + ".yaml") + solution_data = yaml.safe_load(solution_yaml_path.read_text()) - assert loaded_data == expected_data + assert ( + loaded_data == solution_data + ), f"Solution Path: {solution_yaml_path}\nLoaded Path: {info_yaml_path}\n" diff --git a/tests/test_neps_api/testing_yaml/optimizer_test.yaml b/tests/test_neps_api/testing_yaml/optimizer_test.yaml index f65af743..a4deff20 100644 --- a/tests/test_neps_api/testing_yaml/optimizer_test.yaml +++ b/tests/test_neps_api/testing_yaml/optimizer_test.yaml @@ -1,11 +1,5 @@ strategy: bayesian_optimization # Specific arguments depending on the searcher initial_design_size: 7 -surrogate_model: gp -acquisition: EI -log_prior_weighted: false -acquisition_sampler: random -random_interleave_prob: 0.1 -disable_priors: false -prior_confidence: high -sample_default_first: false +use_priors: true +sample_default_first: true diff --git a/tests/test_settings/run_args_optimizer_outside.yaml b/tests/test_settings/run_args_optimizer_outside.yaml index 1dbfce01..4380904e 100644 --- a/tests/test_settings/run_args_optimizer_outside.yaml +++ b/tests/test_settings/run_args_optimizer_outside.yaml @@ -12,9 +12,5 @@ searcher: name: my_bayesian # Specific arguments depending on the searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI - acquisition_sampler: random - random_interleave_prob: 0.1 overwrite_working_directory: True diff --git a/tests/test_settings/test_settings.py b/tests/test_settings/test_settings.py index fe649563..1244bcf6 100644 --- a/tests/test_settings/test_settings.py +++ b/tests/test_settings/test_settings.py @@ -1,7 +1,8 @@ +from __future__ import annotations + from 
neps.utils.run_args import Settings, Default import pytest -import neps -from neps.utils.run_args import get_run_args_from_yaml +from pathlib import Path from tests.test_yaml_run_args.test_yaml_run_args import ( run_pipeline, hook1, @@ -9,9 +10,8 @@ pipeline_space, ) from neps.optimizers.bayesian_optimization.optimizer import BayesianOptimization -from typing import Union, Callable, Dict, List, Type -BASE_PATH = "tests/test_settings" +BASE_PATH = Path("tests") / "test_settings" run_pipeline = run_pipeline hook1 = hook1 hook2 = hook2 @@ -86,7 +86,7 @@ "searcher": Default("default"), "searcher_kwargs": {}, }, - "/run_args_required.yaml", + "run_args_required.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -128,7 +128,7 @@ "searcher": Default("default"), "searcher_kwargs": {}, }, - "/run_args_optional.yaml", + "run_args_optional.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -170,7 +170,7 @@ "searcher": "default", "searcher_kwargs": {}, }, - "/overwrite_run_args.yaml", + "overwrite_run_args.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -218,7 +218,7 @@ "sample_default_at_target": False, }, }, - "/run_args_optimizer_settings.yaml", + "run_args_optimizer_settings.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -273,11 +273,10 @@ "pre_load_hooks": Default(None), "searcher": Default("default"), "searcher_kwargs": { - "random_interleave_prob": 0.2, "initial_design_size": 9, }, }, - "/run_args_optimizer_outside.yaml", + "run_args_optimizer_outside.yaml", { "run_pipeline": run_pipeline, "root_directory": "path/to/root_directory", @@ -295,24 +294,21 @@ "cost_value_on_error": None, "pre_load_hooks": None, "searcher": my_bayesian, - "searcher_kwargs": { - "acquisition": "EI", - "acquisition_sampler": "random", - "random_interleave_prob": 0.2, - "initial_design_size": 9, - }, + "searcher_kwargs": {"initial_design_size": 9}, }, ), ], ) -def test_check_settings(func_args: Dict, yaml_args: str, expected_output: Dict) -> None: +def test_check_settings(func_args: dict, yaml_args: str, expected_output: dict) -> None: """ Check if expected settings are set """ - if not isinstance(yaml_args, Default): - yaml_args = BASE_PATH + yaml_args - settings = Settings(func_args, yaml_args) - print(settings) + if isinstance(yaml_args, str): + args = BASE_PATH / yaml_args + else: + args = yaml_args + + settings = Settings(func_args, args) for key, value in expected_output.items(): assert getattr(settings, key) == value @@ -347,7 +343,7 @@ def test_check_settings(func_args: Dict, yaml_args: str, expected_output: Dict) ], ) def test_settings_initialization_error( - func_args: Dict, yaml_args: Union[str, Default], error: Exception + func_args: dict, yaml_args: str | Default, error: Exception ) -> None: """ Test if Settings raises Error when essential arguments are missing diff --git a/tests/test_state/test_neps_state.py b/tests/test_state/test_neps_state.py index ab3a6b6a..641b54ba 100644 --- a/tests/test_state/test_neps_state.py +++ b/tests/test_state/test_neps_state.py @@ -79,6 +79,10 @@ def case_search_space_fid_with_prior() -> SearchSpace: # OPTIMIZER_FAILS_WITH_FIDELITY = [ "random_search", + "bayesian_optimization", + "pibo", + "cost_cooling_bayesian_optimization", + "cost_cooling", ] # There's no programattic way to check if a class requires a fidelity. 
diff --git a/tests/test_yaml_run_args/run_args_optional_loading_format.yaml b/tests/test_yaml_run_args/run_args_optional_loading_format.yaml index 26bdad83..aa96558f 100644 --- a/tests/test_yaml_run_args/run_args_optional_loading_format.yaml +++ b/tests/test_yaml_run_args/run_args_optional_loading_format.yaml @@ -20,6 +20,5 @@ searcher: # Optional Loading path: "neps/optimizers/bayesian_optimization/optimizer.py" name: BayesianOptimization initial_design_size: 5 - surrogate_model: gp pre_load_hooks: hook1: "tests/test_yaml_run_args/test_yaml_run_args.py" diff --git a/tests/test_yaml_run_args/test_declarative_usage_docs/customizing_neps_optimizer.yaml b/tests/test_yaml_run_args/test_declarative_usage_docs/customizing_neps_optimizer.yaml index 5ddaf23e..da0e7460 100644 --- a/tests/test_yaml_run_args/test_declarative_usage_docs/customizing_neps_optimizer.yaml +++ b/tests/test_yaml_run_args/test_declarative_usage_docs/customizing_neps_optimizer.yaml @@ -7,10 +7,7 @@ pipeline_space: lower: 1e-5 upper: 1e-1 log: True # Log scale for learning rate - epochs: - lower: 5 - upper: 20 - is_fidelity: True + epochs: 20 optimizer: choices: [adam, sgd, adamw] batch_size: 64 @@ -22,9 +19,5 @@ searcher: name: "my_bayesian" # Specific arguments depending on the searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI - acquisition_sampler: random - random_interleave_prob: 0.1 overwrite_working_directory: True diff --git a/tests/test_yaml_run_args/test_declarative_usage_docs/loading_own_optimizer.yaml b/tests/test_yaml_run_args/test_declarative_usage_docs/loading_own_optimizer.yaml index fce52034..9c0a4864 100644 --- a/tests/test_yaml_run_args/test_declarative_usage_docs/loading_own_optimizer.yaml +++ b/tests/test_yaml_run_args/test_declarative_usage_docs/loading_own_optimizer.yaml @@ -18,7 +18,5 @@ searcher: name: BayesianOptimization # Specific arguments depending on your searcher initial_design_size: 7 - surrogate_model: gp - acquisition: EI overwrite_working_directory: True diff --git a/tests/test_yaml_run_args/test_declarative_usage_docs/set_up_optimizer.yaml b/tests/test_yaml_run_args/test_declarative_usage_docs/set_up_optimizer.yaml index f65af743..90b52671 100644 --- a/tests/test_yaml_run_args/test_declarative_usage_docs/set_up_optimizer.yaml +++ b/tests/test_yaml_run_args/test_declarative_usage_docs/set_up_optimizer.yaml @@ -1,11 +1,5 @@ strategy: bayesian_optimization # Specific arguments depending on the searcher initial_design_size: 7 -surrogate_model: gp -acquisition: EI -log_prior_weighted: false -acquisition_sampler: random -random_interleave_prob: 0.1 -disable_priors: false -prior_confidence: high +use_priors: true sample_default_first: false diff --git a/tests/test_yaml_run_args/test_run_args_by_neps_run/optimizer_yamls/select_bo_run_args.yaml b/tests/test_yaml_run_args/test_run_args_by_neps_run/optimizer_yamls/select_bo_run_args.yaml index af5259d0..9871ca63 100644 --- a/tests/test_yaml_run_args/test_run_args_by_neps_run/optimizer_yamls/select_bo_run_args.yaml +++ b/tests/test_yaml_run_args/test_run_args_by_neps_run/optimizer_yamls/select_bo_run_args.yaml @@ -3,11 +3,8 @@ searcher_alg: bayesian_optimization searcher_selection: user-run_args-yaml neps_decision_tree: false searcher_args: - initial_design_size: 10 - surrogate_model: gp - acquisition: EI - log_prior_weighted: false - acquisition_sampler: mutation - random_interleave_prob: 0.0 - disable_priors: true + initial_design_size: null + use_priors: false + use_cost: false sample_default_first: false + device: null From 
39ffe48a6db0ed53227356ea422721b5ed2d6843 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 4 Oct 2024 12:32:48 +0200 Subject: [PATCH 57/63] fix(optimize_acq): Generation of fixed categorical values --- neps/optimizers/bayesian_optimization/models/gp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py index 8b22a513..0f5fd7b7 100644 --- a/neps/optimizers/bayesian_optimization/models/gp.py +++ b/neps/optimizers/bayesian_optimization/models/gp.py @@ -189,7 +189,7 @@ def optimize_acq( cats: dict[int, list[float]] = { encoder.index_of[name]: [ float(i) - for i in range(len(transformer.domain.cardinality)) # type: ignore + for i in range(transformer.domain.cardinality) # type: ignore ] for name, transformer in cat_transformers.items() } From c3aeb30f17bcde35a7ea6e40e0685aa9d4ab2f99 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Fri, 4 Oct 2024 13:41:55 +0200 Subject: [PATCH 58/63] test: Fixup examples --- neps_examples/__init__.py | 23 +++++++++++++++++++---- tests/test_examples.py | 1 + 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/neps_examples/__init__.py b/neps_examples/__init__.py index df7d3589..a48933d7 100644 --- a/neps_examples/__init__.py +++ b/neps_examples/__init__.py @@ -1,7 +1,23 @@ all_main_examples = { # Used for printing in python -m neps_examples - "basic_usage": ["analyse", "architecture", "architecture_and_hyperparameters", "hpo_usage_example", "hyperparameters"], - "convenience": ["logging_additional_info", "neps_tblogger_tutorial", "running_on_slurm_scripts", "neps_x_lightning", "working_directory_per_pipeline"], - "efficiency": ["expert_priors_for_hyperparameters", "multi_fidelity", "multi_fidelity_and_expert_priors"], + "basic_usage": [ + "analyse", + "architecture", + "architecture_and_hyperparameters", + "hpo_usage_example", + "hyperparameters", + ], + "convenience": [ + "logging_additional_info", + "neps_tblogger_tutorial", + "running_on_slurm_scripts", + "neps_x_lightning", + "working_directory_per_pipeline", + ], + "efficiency": [ + "expert_priors_for_hyperparameters", + "multi_fidelity", + "multi_fidelity_and_expert_priors", + ], } core_examples = [ # Run locally and on github actions @@ -15,7 +31,6 @@ "basic_usage/architecture_and_hyperparameters", "experimental/hierarchical_architecture", "efficiency/expert_priors_for_hyperparameters", - "experimental/hierarchical_architecture_hierarchical_GP", "convenience/logging_additional_info", "convenience/working_directory_per_pipeline", "convenience/neps_tblogger_tutorial", diff --git a/tests/test_examples.py b/tests/test_examples.py index 5575eb4d..6942e8d7 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -40,6 +40,7 @@ def test_core_examples(example): if example.name in ( "architecture.py", + "architecture_and_hyperparameters.py", "hierarchical_architecture.py", "expert_priors_for_architecture_and_hyperparameters.py", ): From d1596518f3d52fbca85dcf95b812fa213949e67e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 16:45:50 +0200 Subject: [PATCH 59/63] test(domain): Initial tests --- neps/search_spaces/domain.py | 21 ++-- tests/test_domain.py | 205 +++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+), 12 deletions(-) create mode 100644 tests/test_domain.py diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 6c6c1b75..6f6e2693 100644 --- a/neps/search_spaces/domain.py +++ 
b/neps/search_spaces/domain.py @@ -221,15 +221,6 @@ def indices(cls, n: int, *, is_categorical: bool = False) -> Domain[int]: """ return Domain.integer(0, n - 1, is_categorical=is_categorical) - def next_value(self, x: Tensor) -> Tensor: - """Get the next value for a tensor of values.""" - if self.cardinality is None: - raise ValueError("Domain is non-finite, cannot get next value.") - cardinality_domain = Domain.indices(self.cardinality) - current_step = cardinality_domain.cast(x, frm=self) - bounded_next_step = (current_step + 1).clamp_max(self.cardinality - 1) - return self.cast(bounded_next_step, frm=cardinality_domain) - def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """Transform a tensor of values from this domain to the unit interval [0, 1]. @@ -242,10 +233,11 @@ def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: """ if dtype is None: dtype = torch.float64 - else: - assert dtype.is_floating_point, "Unit interval is only for floats." + elif not dtype.is_floating_point: + raise ValueError(f"Unit interval only allows floating dtypes, got {dtype}.") - if self.is_unit_float: + bins = self.bins + if self.is_unit_float and self.bins is not None: return x.to(dtype) if self.log_bounds is not None: @@ -255,6 +247,11 @@ def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: lower, upper = self.lower, self.upper x = (x - lower) / (upper - lower) + + if bins is not None: + quantization_levels = torch.floor(x * bins).clip(0, bins - 1) + x = quantization_levels / (bins - 1) + return x.type(dtype) def from_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: diff --git a/tests/test_domain.py b/tests/test_domain.py new file mode 100644 index 00000000..0893337f --- /dev/null +++ b/tests/test_domain.py @@ -0,0 +1,205 @@ +from pytest_cases import parametrize + +import torch +import pytest +from neps.search_spaces.domain import Domain + +T = torch.tensor + + +@parametrize( + "x, frm, expected", + [ + # Remains unchanged if from unit-float + (T([0, 0.5, 1.0]), Domain.unit_float(), T([0, 0.5, 1.0])), + # Converts integers to float + (T([0, 1]), Domain.unit_float(), T([0.0, 1.0])), + # Integer conversion + (T([0, 1, 2, 3, 4]), Domain.integer(0, 4), T([0.0, 0.25, 0.5, 0.75, 1.0])), + # Negatives + ( + T([-0.5, -0.25, 0.0, 0.25, 0.5]), + Domain.floating(-0.5, 0.5), + T([0.0, 0.25, 0.5, 0.75, 1.0]), + ), + # Log scale + ( + T([1e-4, 1e-3, 1e-2, 1e-1, 1]), + Domain.floating(1e-4, 1, log=True), + T([0.0, 0.25, 0.5, 0.75, 1.0]), + ), + # Binned + ( + torch.arange(10), + Domain.integer(0, 10, bins=5), + T([0.0, 0.0, 0.25, 0.25, 0.5, 0.5, 0.75, 0.75, 1.0, 1.0]), + ), + ], +) +def test_domain_to_unit(x: torch.Tensor, frm: Domain, expected: torch.Tensor) -> None: + y = frm.to_unit(x) + assert y.dtype == torch.float64 + torch.testing.assert_close(y, expected, check_dtype=False, msg=f"{y} != {expected}") + + +def test_domain_to_unit_dtype_with_floating() -> None: + domain = Domain.integer(0, 4) + x = T([0, 1, 2, 3, 4], dtype=torch.int32) + + expected_64 = T([0.0, 0.25, 0.5, 0.75, 1.0], dtype=torch.float64) + y_64 = domain.to_unit(x, dtype=torch.float64) + torch.testing.assert_close(y_64, expected_64, check_dtype=True) + + expected_32 = T([0.0, 0.25, 0.5, 0.75, 1.0], dtype=torch.float32) + y_32 = domain.to_unit(x, dtype=torch.float32) + torch.testing.assert_close(y_32, expected_32, check_dtype=True) + + +def test_domain_to_unit_dtype_with_integer_fails() -> None: + domain = Domain.integer(0, 4) + x = T([0, 1, 2, 3, 4], 
dtype=torch.int32) + + with pytest.raises(ValueError, match="only allows floating dtypes"): + domain.to_unit(x, dtype=torch.int32) + + +@parametrize( + "x, to, expected", + [ + # Remains unchanged if from unit-float + ( + T([0, 0.5, 1.0]), + Domain.unit_float(), + T([0, 0.5, 1.0], dtype=torch.float64), + ), + # Converts floats to integers + ( + T([0.0, 1.0]), + Domain.integer(0, 1), + T([0, 1], dtype=torch.int64), + ), + # Integer range + ( + T([0, 0.25, 0.5, 0.75, 1.0]), + Domain.integer(0, 4), + T([0, 1, 2, 3, 4], dtype=torch.int64), + ), + # Negatives + ( + T([0.0, 0.25, 0.5, 0.75, 1.0]), + Domain.floating(-0.5, 0.5), + T([-0.5, -0.25, 0.0, 0.25, 0.5], dtype=torch.float64), + ), + # Log scale + ( + T([0.0, 0.25, 0.5, 0.75, 1.0]), + Domain.floating(1e-4, 1, log=True), + T([1e-4, 1e-3, 1e-2, 1e-1, 1], dtype=torch.float64), + ), + # Binned + ( + T([0.0, 0.25, 0.5, 0.75, 1.0]), + Domain.integer(0, 20, bins=5), + T([0, 5, 10, 15, 20], dtype=torch.int64), + ), + ], +) +def test_domain_from_unit(x: torch.Tensor, to: Domain, expected: torch.Tensor) -> None: + x = x.to(dtype=torch.float64) + y = to.from_unit(x) + torch.testing.assert_close(y, expected, check_dtype=True, msg=f"{y} != {expected}") + + +def test_domain_from_unit_dtype() -> None: + x = T([0.0, 0.25, 0.5, 0.75, 1.0], dtype=torch.float64) + domain = Domain.integer(0, 4) + + expected_f64 = T([0.0, 1.0, 2.0, 3.0, 4.0], dtype=torch.float64) + y_f64 = domain.from_unit(x, dtype=torch.float64) + torch.testing.assert_close(y_f64, expected_f64, check_dtype=True) + + expected_f32 = T([0, 1, 2, 3, 4], dtype=torch.float32) + y_f32 = domain.from_unit(x, dtype=torch.float32) + torch.testing.assert_close(y_f32, expected_f32, check_dtype=True) + + expected_i32 = T([0, 1, 2, 3, 4], dtype=torch.int32) + y_i32 = domain.from_unit(x, dtype=torch.int32) + torch.testing.assert_close(y_i32, expected_i32, check_dtype=True) + + expected_i64 = T([0, 1, 2, 3, 4], dtype=torch.int64) + y_i64 = domain.from_unit(x, dtype=torch.int64) + torch.testing.assert_close(y_i64, expected_i64, check_dtype=True) + + +@parametrize( + "x, frm, to, expected", + [ + ( + T([1e-2, 1e-1, 1e0, 1e1, 1e2], dtype=torch.float64), + Domain.floating(1e-2, 1e2, log=True), + Domain.floating(-2, 2), + T([-2, -1, 0, 1, 2], dtype=torch.float64), + ), + ( + T([0, 2, 4, 6, 8], dtype=torch.int64), + Domain.integer(0, 8, bins=5), + Domain.integer(0, 4), + T([0, 1, 2, 3, 4], dtype=torch.int64), + ), + ( + T([10, 12.5, 15], dtype=torch.float64), + Domain.floating(10, 15), + Domain.floating(2, 3), + T([2, 2.5, 3.0], dtype=torch.float64), + ), + ], +) +def test_domain_casting( + x: torch.Tensor, frm: Domain, to: Domain, expected: torch.Tensor +) -> None: + y = to.cast(x, frm=frm) + torch.testing.assert_close(y, expected, check_dtype=True, msg=f"{y} != {expected}") + + x_back = frm.cast(y, frm=to) + torch.testing.assert_close(x_back, x, check_dtype=True, msg=f"{x_back} != {x}") + + +@parametrize( + "x, frm, to, expected", + [ + ( + # This test combines all the previous cast domains in one go as a single tensor + T( + [ + [1e-2, 1e-1, 1e0, 1e1, 1e2], + [0, 2, 4, 6, 8], + [10, 12.5, 15, 17.5, 20], + ] + ).transpose(0, 1), + [ + Domain.floating(1e-2, 1e2, log=True), + Domain.integer(0, 8, bins=5), + Domain.floating(10, 20), + ], # from + [Domain.floating(-2, 2), Domain.integer(0, 4), Domain.floating(2, 4)], # to + T( + [ + [-2, -1, 0, 1, 2], + [0, 1, 2, 3, 4], + [2, 2.5, 3, 3.5, 4], + ] + ).transpose(0, 1), + ), + ], +) +def test_translate( + x: torch.Tensor, + frm: list[Domain], + to: list[Domain], + 
expected: torch.Tensor, +) -> None: + y = Domain.translate(x, frm=frm, to=to) + torch.testing.assert_close(y, expected, check_dtype=True, msg=f"{y} != {expected}") + + x_back = Domain.translate(y, frm=to, to=frm) + torch.testing.assert_close(x_back, x, check_dtype=True, msg=f"{x_back} != {x}") From 65d99189818de04a5a140bbf36126b8c428afe00 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 17:59:33 +0200 Subject: [PATCH 60/63] test(ConfigEncoder): Initial tests --- neps/search_spaces/encoding.py | 26 +++++- tests/test_config_encoder.py | 143 +++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 tests/test_config_encoder.py diff --git a/neps/search_spaces/encoding.py b/neps/search_spaces/encoding.py index c20f60b3..78958b90 100644 --- a/neps/search_spaces/encoding.py +++ b/neps/search_spaces/encoding.py @@ -240,9 +240,10 @@ class ConfigEncoder: """ transformers: dict[str, TensorTransformer] + constants: Mapping[str, Any] = field(default_factory=dict) + index_of: dict[str, int] = field(init=False) domain_of: dict[str, Domain] = field(init=False) - constants: Mapping[str, Any] = field(default_factory=dict) n_numerical: int = field(init=False) n_categorical: int = field(init=False) @@ -369,7 +370,23 @@ def default( Returns: A `ConfigEncoder` instance """ - constants = constants or {} + if constants is not None: + overlap = set(parameters) & set(constants) + if any(overlap): + raise ValueError( + "`constants=` and `parameters=` cannot have overlapping" + f" keys: {overlap=}" + ) + if custom_transformers is not None: + overlap = set(custom_transformers) & set(constants) + if any(overlap): + raise ValueError( + f"Can not apply `custom_transformers=`" + f" to `constants=`: {overlap=}" + ) + else: + constants = {} + custom = custom_transformers or {} sorted_params = sorted(parameters.items()) transformers: dict[str, TensorTransformer] = {} @@ -384,6 +401,9 @@ def default( case CategoricalParameter(): transformers[name] = CategoricalToIntegerTransformer(hp.choices) case _: - raise ValueError(f"Unsupported parameter type: {type(hp)}") + raise ValueError( + f"Unsupported parameter type: {type(hp)}. If hp is a constant, " + " please provide it as `constants=`." + ) return ConfigEncoder(transformers, constants=constants) diff --git a/tests/test_config_encoder.py b/tests/test_config_encoder.py new file mode 100644 index 00000000..6ed7d344 --- /dev/null +++ b/tests/test_config_encoder.py @@ -0,0 +1,143 @@ +import torch +import pytest +from neps.search_spaces.domain import Domain +from neps.search_spaces.encoding import ( + CategoricalToIntegerTransformer, + ConfigEncoder, + MinMaxNormalizer, +) +from neps.search_spaces.hyperparameters import ( + CategoricalParameter, + FloatParameter, + IntegerParameter, +) + + +def test_config_encoder_default() -> None: + parameters = { + "b": IntegerParameter(5, 6), + "a": FloatParameter(5, 6), + "c": CategoricalParameter(["cat", "mouse", "dog"]), + } + + encoder = ConfigEncoder.default(parameters) + + # Min-max numericals, integer categoricals. 
+ assert encoder.transformers == { + "a": MinMaxNormalizer(parameters["a"].domain), + "b": MinMaxNormalizer(parameters["b"].domain), + "c": CategoricalToIntegerTransformer(parameters["c"].choices), + } + + # Domains, (of each column) match those of the transformers + assert encoder.domains == [ + Domain.unit_float(), + Domain.unit_float(), + Domain.indices(n=len(parameters["c"].choices), is_categorical=True), + ] + + assert encoder.ncols == len(parameters) + assert encoder.n_numerical == 2 + assert encoder.n_categorical == 1 + assert encoder.index_of == {"a": 0, "b": 1, "c": 2} + assert encoder.domain_of == { + "a": Domain.unit_float(), + "b": Domain.unit_float(), + "c": Domain.indices(n=len(parameters["c"].choices), is_categorical=True), + } + assert encoder.constants == {} + + configs = [ + {"a": 5.5, "b": 5, "c": "cat"}, + {"a": 5.5, "b": 5, "c": "dog"}, + {"a": 6, "b": 6, "c": "mouse"}, + ] + encoded = encoder.encode(configs) + expcected_encoding = torch.tensor( + [ + # a, b, c + [0.5, 0.0, 0.0], # config 1 + [0.5, 0.0, 2.0], # config 2 + [1.0, 1.0, 1.0], # config 3 + ], + dtype=torch.float64, + ) + torch.testing.assert_close(encoded, expcected_encoding, check_dtype=True) + + decoded = encoder.decode(encoded) + assert decoded == configs + + +def test_config_encoder_accepts_custom_transformers() -> None: + parameters = { + "b": IntegerParameter(5, 6), + "a": FloatParameter(5, 6), + "c": CategoricalParameter(["cat", "mouse", "dog"]), + } + encoder = ConfigEncoder.default( + parameters, + custom_transformers={ + "c": CategoricalToIntegerTransformer(parameters["c"].choices) + }, + ) + assert encoder.transformers["c"] == CategoricalToIntegerTransformer( + parameters["c"].choices + ) + + +def test_config_encoder_removes_constants_in_encoding_and_includes_in_decoding() -> None: + parameters = { + "b": IntegerParameter(5, 6), + "a": FloatParameter(5, 6), + "c": CategoricalParameter(["cat", "mouse", "dog"]), + } + + x = "raspberry" + + encoder = ConfigEncoder.default(parameters, constants={"x": x}) + assert encoder.constants == {"x": x} + + enc_x = encoder.encode([{"a": 5.5, "b": 5, "c": "cat", "x": x}]) + + assert enc_x.shape == (1, 3) # No x, just a, b, c + + dec_x = encoder.decode(enc_x) + assert dec_x == [{"a": 5.5, "b": 5, "c": "cat", "x": x}] + + # This doesn't have to hold true, but it's our current behaviour, we could make + # weaker gaurantees but then we'd have to clone the constants, even if it's very large + assert dec_x[0]["x"] is x + + +def test_config_encoder_complains_if_missing_entry_in_config() -> None: + parameters = { + "b": IntegerParameter(5, 6), + "a": FloatParameter(5, 6), + "c": CategoricalParameter(["cat", "mouse", "dog"]), + } + + encoder = ConfigEncoder.default(parameters) + + with pytest.raises(KeyError): + encoder.encode([{"a": 5.5, "b": 5}]) + + +def test_config_encoder_sorts_parameters_by_name_for_consistent_ordering() -> None: + parameters = { + "b": IntegerParameter(0, 1), + "a": FloatParameter(0, 1), + "c": CategoricalParameter([0, 1]), + } + p1 = dict(sorted(parameters.items())) + p2 = dict(sorted(parameters.items(), reverse=True)) + + encoder_1 = ConfigEncoder.default(p1) + encoder_2 = ConfigEncoder.default(p2) + + assert encoder_1.index_of["a"] == 0 + assert encoder_1.index_of["b"] == 1 + assert encoder_1.index_of["c"] == 2 + + assert encoder_2.index_of["a"] == 0 + assert encoder_2.index_of["b"] == 1 + assert encoder_2.index_of["c"] == 2 From fdfb3633f12b8da375f44fa3c6324005db7c167e Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 17:59:48 
+0200 Subject: [PATCH 61/63] fix(Domain): use cardinality for quantizing through unit interval --- neps/search_spaces/domain.py | 28 ++++++++++++++-------------- tests/test_domain.py | 27 ++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/neps/search_spaces/domain.py b/neps/search_spaces/domain.py index 6f6e2693..7f9a914f 100644 --- a/neps/search_spaces/domain.py +++ b/neps/search_spaces/domain.py @@ -131,6 +131,7 @@ def __post_init__(self) -> None: " `bins` is `None` and `round` is `False` and" " boundaries are not integers." ) + object.__setattr__(self, "cardinality", cardinality) preferred_dtype = torch.int64 if is_int else torch.float64 object.__setattr__(self, "preffered_dtype", preferred_dtype) @@ -139,7 +140,6 @@ def __post_init__(self) -> None: if is_int: mid = int(round(mid)) - object.__setattr__(self, "cardinality", cardinality) object.__setattr__(self, "midpoint", mid) object.__setattr__(self, "bounds", (self.lower, self.upper)) @@ -188,7 +188,7 @@ def integer( Args: lower: The lower bound of the domain. - upper: The upper bound of the domain. + upper: The upper bound of the domain (inclusive). log: Whether the domain is in log space. bins: The number of discrete bins to split the domain into. is_categorical: Whether the domain is representing a categorical. @@ -236,8 +236,8 @@ def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: elif not dtype.is_floating_point: raise ValueError(f"Unit interval only allows floating dtypes, got {dtype}.") - bins = self.bins - if self.is_unit_float and self.bins is not None: + q = self.cardinality + if self.is_unit_float and q is None: return x.to(dtype) if self.log_bounds is not None: @@ -248,9 +248,9 @@ def to_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: x = (x - lower) / (upper - lower) - if bins is not None: - quantization_levels = torch.floor(x * bins).clip(0, bins - 1) - x = quantization_levels / (bins - 1) + if q is not None: + quantization_levels = torch.floor(x * q).clip(0, q - 1) + x = quantization_levels / (q - 1) return x.type(dtype) @@ -268,10 +268,10 @@ def from_unit(self, x: Tensor, *, dtype: torch.dtype | None = None) -> Tensor: if self.is_unit_float: return x.to(dtype) - bins = self.bins - if bins is not None: - quantization_levels = torch.floor(x * bins).clip(0, bins - 1) - x = quantization_levels / (bins - 1) + q = self.cardinality + if q is not None: + quantization_levels = torch.floor(x * q).clip(0, q - 1) + x = quantization_levels / (q - 1) # Now we scale to the new domain if self.log_bounds is not None: @@ -312,8 +312,8 @@ def cast(self, x: Tensor, frm: Domain, *, dtype: torch.dtype | None = None) -> T # have to go through unit space to figure out the bins same_bounds = self.lower == frm.lower and self.upper == frm.upper same_log_bounds = self.log_bounds == frm.log_bounds - same_bins = self.bins == frm.bins - if same_bounds and same_log_bounds and (self.bins is None or same_bins): + same_cardinality = self.cardinality == frm.cardinality + if same_bounds and same_log_bounds and same_cardinality: if self.round: x = torch.round(x) return x.type(dtype) @@ -327,7 +327,7 @@ def cast(self, x: Tensor, frm: Domain, *, dtype: torch.dtype | None = None) -> T # We can also shortcut out if the only diffrence is that we are coming frm the # log bounds of this domain. 
We dont care if where we came from was binned or not, # we just lift it up with `np.exp` and round if needed - if (self.lower, self.upper) == frm.log_bounds and self.bins is None: + if (self.lower, self.upper) == frm.log_bounds and self.cardinality is None: x = torch.exp(x) if self.round: x = torch.round(x) diff --git a/tests/test_domain.py b/tests/test_domain.py index 0893337f..0b9f2b97 100644 --- a/tests/test_domain.py +++ b/tests/test_domain.py @@ -190,6 +190,30 @@ def test_domain_casting( ] ).transpose(0, 1), ), + ( + # This was a random case found while testing samplers which seemed to fail + # Uniform noise convert to integers + # 0-0.25 -> 12, + # 0.25-0.5 -> 13, + # 0.5-0.75 -> 14 + # 0.75-1 -> 15 + T( + [ + [0.2350, 0.6488, 0.6411], + [0.6457, 0.2897, 0.6879], + [0.7401, 0.4268, 0.7607], + ] + ), + Domain.unit_float(), + Domain.integer(12, 15), + T( + [ + [12, 14, 14], + [14, 13, 14], + [14, 13, 15], + ] + ), + ), ], ) def test_translate( @@ -200,6 +224,3 @@ def test_translate( ) -> None: y = Domain.translate(x, frm=frm, to=to) torch.testing.assert_close(y, expected, check_dtype=True, msg=f"{y} != {expected}") - - x_back = Domain.translate(y, frm=to, to=frm) - torch.testing.assert_close(x_back, x, check_dtype=True, msg=f"{x_back} != {x}") From 35427ebef1737d111134e58164abe7d0fb01a608 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 18:38:32 +0200 Subject: [PATCH 62/63] test(Samplers): Initial tests and dtype fixes --- neps/sampling/priors.py | 43 ++++++++++++++++++++++++++------------- neps/sampling/samplers.py | 34 ++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/neps/sampling/priors.py b/neps/sampling/priors.py index e17db04b..3180e0ac 100644 --- a/neps/sampling/priors.py +++ b/neps/sampling/priors.py @@ -106,7 +106,7 @@ def uniform(cls, ncols: int) -> UniformPrior: Args: ncols: The number of columns in the tensor to sample. """ - return UniformPrior(ndims=ncols) + return UniformPrior(ndim=ncols) @classmethod def from_parameters( @@ -342,8 +342,10 @@ def log_prob(self, x: torch.Tensor, *, frm: list[Domain] | Domain) -> torch.Tens first_i, first_dist = next(itr) log_probs = first_dist.log_prob(translated_x[..., first_i]) + _weight = 1 / len(self.distributions) + for i, dist in itr: - log_probs = log_probs + dist.log_prob(translated_x[..., i]) + log_probs = log_probs + _weight * dist.log_prob(translated_x[..., i]) return log_probs @@ -381,18 +383,18 @@ class UniformPrior(Prior): Uses a UnitUniform under the hood before converting to the value domain. """ - ndims: int + ndim: int """The number of columns in the tensor to sample from.""" @property @override def ncols(self) -> int: - return self.ndims + return self.ndim @override def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: # NOTE: We just assume everything is in bounds... 
- shape = x.shape[:-1] + shape = x.shape[:-1] # Select everything up to last dimension (configuration) return torch.zeros(shape, dtype=torch.float64, device=x.device) @override @@ -409,11 +411,16 @@ def sample( raise NotImplementedError("Seeding is not yet implemented.") _n = ( - torch.Size((n, self.ndims)) + torch.Size((n, self.ndim)) if isinstance(n, int) - else torch.Size((*n, self.ndims)) + else torch.Size((*n, self.ndim)) ) - samples = torch.rand(_n, device=device, dtype=dtype) + # Doesn't like integer dtypes + if dtype is not None and dtype.is_floating_point: + samples = torch.rand(_n, device=device, dtype=dtype) + else: + samples = torch.rand(_n, device=device) + return Domain.translate(samples, frm=UNIT_FLOAT_DOMAIN, to=to, dtype=dtype) @@ -437,9 +444,9 @@ def __post_init__(self) -> None: ) @property - def probabilities(self) -> torch.Tensor: + def sampler_probabilities(self) -> torch.Tensor: """The probabilities for each sampler. Normalized weights.""" - return self._weighted_sampler.probabilities + return self._weighted_sampler.sampler_probabilities @property @override @@ -450,12 +457,20 @@ def ncols(self) -> int: def log_prob(self, x: torch.Tensor, *, frm: Domain | list[Domain]) -> torch.Tensor: # OPTIM: Avoid an initial allocation by using the output of the first # distribution to store the weighted probabilities - itr = zip(self.probabilities, self.priors, strict=False) + itr = zip(self.sampler_probabilities, self.priors, strict=False) first_prob, first_prior = next(itr) - weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) - for prob, prior in itr: - weighted_probs = weighted_probs + prob * prior.log_prob(x, frm=frm) + if first_prob == 0.0: + weighted_probs = first_prob * first_prior.log_prob(x, frm=frm) + else: + weighted_probs = torch.zeros( + x.shape[:-1], dtype=torch.float64, device=x.device + ) + + for sampler_prob, prior in itr: + if sampler_prob == 0.0: + continue + weighted_probs = weighted_probs + sampler_prob * prior.log_prob(x, frm=frm) return weighted_probs diff --git a/neps/sampling/samplers.py b/neps/sampling/samplers.py index cf1c1e7a..d3439888 100644 --- a/neps/sampling/samplers.py +++ b/neps/sampling/samplers.py @@ -82,7 +82,7 @@ def uniform(cls, ndim: int) -> UniformPrior: """ from neps.sampling.priors import UniformPrior - return UniformPrior(ndims=ndim) + return UniformPrior(ndim=ndim) @classmethod def borders(cls, ndim: int) -> BorderSampler: @@ -158,8 +158,12 @@ def sample( dimension=self.ndim, scramble=self.scramble, seed=_seed ) - out = torch.empty(_n, self.ncols, dtype=dtype, device=device) - x = sobol.draw(_n, dtype=dtype, out=out) + # If integer dtype, sobol will refuse, we need to cast then + if dtype is not None and not dtype.is_floating_point: + x = sobol.draw(_n, dtype=torch.float64) + x = x.to(dtype=dtype, device=device) + else: + x = sobol.draw(_n, dtype=dtype) # If we got extra dimensions, such as batch dimensions, we need to # reshape the tensor to the desired shape. @@ -179,7 +183,7 @@ class WeightedSampler(Sampler): weights: torch.Tensor """The weights for each sampler.""" - probabilities: torch.Tensor = field(init=False, repr=False) + sampler_probabilities: torch.Tensor = field(init=False, repr=False) """The probabilities for each sampler. 
Normalized weights.""" def __post_init__(self) -> None: @@ -201,7 +205,7 @@ def __post_init__(self) -> None: ) self._ncols = ncols[0] - self.probabilities = self.weights / self.weights.sum() + self.sampler_probabilities = self.weights / self.weights.sum() @property @override @@ -218,6 +222,16 @@ def sample( device: torch.device | None = None, dtype: torch.dtype | None = None, ) -> torch.Tensor: + if dtype is None: + if isinstance(to, Domain): + dtype = to.preffered_dtype + else: + dtype = ( + torch.float64 + if any(d.preffered_dtype.is_floating_point for d in to) + else torch.int64 + ) + if seed is not None: raise NotImplementedError("Seeding is not yet implemented.") @@ -232,7 +246,7 @@ def sample( # Randomly select which sampler to sample from for each of the total_samples chosen_samplers = torch.empty((total_samples,), device=device, dtype=torch.int64) chosen_samplers = torch.multinomial( - self.probabilities, + self.sampler_probabilities, total_samples, replacement=True, generator=seed, @@ -264,9 +278,7 @@ def sample( output_samples[indices] = samples_from_sampler # Reshape to the output shape including ncols dimension - output_samples = output_samples.view(output_shape) - - return Domain.translate(output_samples, frm=UNIT_FLOAT_DOMAIN, to=to) + return output_samples.view(output_shape) @dataclass @@ -295,6 +307,8 @@ def sample( device: torch.device | None = None, dtype: torch.dtype | None = None, ) -> torch.Tensor: + dtype = dtype or torch.float64 + _arange = torch.arange(self.n_possible, device=device, dtype=torch.int32) # Calculate the total number of samples required if isinstance(n, int): @@ -322,5 +336,5 @@ def sample( bit_masks = 2 ** _arange[: self.ndim] configs = configs.unsqueeze(1).bitwise_and(bit_masks).ne(0).to(dtype) # Reshape to the output shape including ncols dimension - configs.view(output_shape) + configs = configs.view(output_shape) return Domain.translate(configs, frm=UNIT_FLOAT_DOMAIN, to=to) From d59b13458e1fb16a5819753fd2a62c91bfadf389 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Mon, 7 Oct 2024 18:38:51 +0200 Subject: [PATCH 63/63] test(Sampler): Include test file -_- --- tests/test_samplers.py | 93 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 tests/test_samplers.py diff --git a/tests/test_samplers.py b/tests/test_samplers.py new file mode 100644 index 00000000..dbc92287 --- /dev/null +++ b/tests/test_samplers.py @@ -0,0 +1,93 @@ +from pytest_cases import parametrize +from neps.sampling.samplers import Sampler, Sobol, WeightedSampler, BorderSampler +from neps.sampling.priors import Prior, UniformPrior, WeightedPrior + +import torch + +from neps.search_spaces.domain import Domain + + +def _make_centered_prior(ndim: int) -> Prior: + return Prior.make_centered( + domains=[Domain.unit_float() for _ in range(ndim)], + centers=[(0.5, 0.5) for _ in range(ndim)], + ) + + +@parametrize( + "sampler", + [ + Sobol(ndim=3), + BorderSampler(ndim=3), + UniformPrior(ndim=3), + # Convenence method for making a distribution around center points + _make_centered_prior(ndim=3), + WeightedSampler( + [UniformPrior(ndim=3), _make_centered_prior(3), Sobol(ndim=3)], + weights=torch.tensor([0.5, 0.25, 0.25]), + ), + WeightedPrior( + [UniformPrior(ndim=3), _make_centered_prior(3), UniformPrior(ndim=3)], + weights=torch.tensor([0.5, 0.25, 0.25]), + ), + ], +) +def test_sampler_samples_into_domain(sampler: Sampler) -> None: + assert sampler.ncols == 3 + + domain_to_sample_into = Domain.integer(12, 15) + for _ in range(10): + x = 
sampler.sample( + n=5, + to=domain_to_sample_into, + seed=None, + ) + + assert x.shape == (5, 3) + assert (x >= 12).all() + assert (x <= 15).all() + + x = sampler.sample( + n=torch.Size((2, 1)), + to=domain_to_sample_into, + seed=None, + ) + assert x.shape == (2, 1, 3) + assert (x >= 12).all() + assert (x <= 15).all() + + +@parametrize( + "prior", + [ + UniformPrior(ndim=3), + # Convenence method for making a distribution around center points + _make_centered_prior(ndim=3), + WeightedPrior( + [UniformPrior(ndim=3), _make_centered_prior(3), UniformPrior(ndim=3)], + weights=torch.tensor([0.5, 0.25, 0.25]), + ), + ], +) +def test_priors_give_positive_pdfs(prior: Prior) -> None: + # NOTE: The uniform prior does not check that + assert prior.ncols == 3 + domain = Domain.floating(10, 100) + + x = prior.sample(n=5, to=domain, seed=None) + assert x.shape == (5, 3) + assert (x >= 10).all() + assert (x <= 100).all() + + probs = prior.prob(x, frm=domain) + assert (probs >= 0).all() + assert probs.shape == (5,) + + x = prior.sample(n=torch.Size((2, 1)), to=domain, seed=None) + assert x.shape == (2, 1, 3) + assert (x >= 10).all() + assert (x <= 100).all() + + probs = prior.prob(x, frm=domain) + assert (probs >= 0).all() + assert probs.shape == (2, 1)
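
Note on the Domain quantization these tests exercise — a minimal sketch, not part of any patch above, using only helpers the tests already import from neps.search_spaces.domain (Domain.unit_float, Domain.integer, Domain.translate); the bounds and variable names are illustrative only. A value is first min-max scaled into [0, 1], snapped to one of `cardinality` levels via floor(x * q).clip(0, q - 1) / (q - 1), and then rescaled into the target bounds:

    import torch

    from neps.search_spaces.domain import Domain

    # Uniform values in [0, 1] translated into the integer domain [12, 15].
    # Cardinality is 4, so each quarter of the unit interval snaps to one integer:
    # [0.00, 0.25) -> 12, [0.25, 0.50) -> 13, [0.50, 0.75) -> 14, [0.75, 1.00] -> 15.
    x = torch.tensor([[0.10, 0.30], [0.60, 0.90]])
    y = Domain.translate(x, frm=Domain.unit_float(), to=Domain.integer(12, 15))
    # Same mapping as the PATCH 61 test case: [[12, 13], [14, 15]]

Because every value in a quarter-interval collapses to the same level, this mapping is lossy in the unit-float direction, which is presumably why the reverse-translation assertion was dropped from test_translate for this case.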