From 334dc52f398bdc355544731c1a4fe80c87509756 Mon Sep 17 00:00:00 2001 From: mcastanoUQ Date: Tue, 10 Dec 2024 09:51:02 +0100 Subject: [PATCH] Adding anisotropic Kernel computation and regularization option in the likelihood --- smt/applications/mfck.py | 296 +++++++++++++++++++-------------------- 1 file changed, 144 insertions(+), 152 deletions(-) diff --git a/smt/applications/mfck.py b/smt/applications/mfck.py index a42fc4fe6..88fd14e18 100644 --- a/smt/applications/mfck.py +++ b/smt/applications/mfck.py @@ -13,14 +13,14 @@ Conference on Neural Information Processing Systems. """ -# import warnings +#import warnings import numpy as np from scipy.linalg import solve_triangular from scipy import optimize from smt.sampling_methods import LHS -from smt.utils.kriging import differences, componentwise_distance +from smt.utils.kriging import (differences, componentwise_distance) from smt.surrogate_models.krg_based import KrgBased - +from smt.utils.misc import standardization class MFCK(KrgBased): def _initialize(self): @@ -38,7 +38,7 @@ def _initialize(self): ) declare( "rho_bounds", - [-5.0, 5.0], + [-5., 5.], types=(list, np.ndarray), desc="Bounds for the rho parameter used in the autoregressive model", ) @@ -50,18 +50,20 @@ def _initialize(self): ) declare( "sigma_bounds", - [1e-1, 100], + [1e-2, 100], types=(list, np.ndarray), desc="Bounds for the variance parameter", ) - - self.options["nugget"] = ( - 1e-9 # Incresing the nugget for numerical stability reasons - ) - self.options["hyper_opt"] = ( - "Cobyla" # MFCK doesn't support gradient-based optimizers + declare( + "lambda", + 0.1, + types=(float), + desc="Regularization parameter", ) + self.options["nugget"] = 1e-9 # Incresing the nugget for numerical stability reasons + self.options["hyper_opt"] = "Cobyla" # MFCK doesn't support gradient-based optimizers + def train(self): """ Overrides MFK implementation @@ -79,88 +81,72 @@ def train(self): i = i + 1 xt.append(self.training_points[None][0][0]) yt.append(self.training_points[None][0][1]) - self.lvl = i + 1 + self.lvl = i+1 self.X = xt self.y = np.vstack(yt) self._check_param() - self.nx = 1 # Forcing this in order to consider isotropic kernels (i.e., only one lengthscale) + + ( + _, + _, + self.X_offset, + self.y_mean, + self.X_scale, + self.y_std, + ) = standardization(np.concatenate(xt, axis=0), np.concatenate(yt, axis=0)) + + self.X_norma_all = [(x - self.X_offset) / self.X_scale for x in xt] + self.y_norma_all = np.vstack([(f - self.y_mean) / self.y_std for f in yt]) + if self.lvl == 1: # For a single level, initialize theta_ini, lower_bounds, and # upper_bounds with consistent shapes - theta_ini = np.hstack( - (self.options["sigma0"], self.options["theta0"][0]) - ) # Kernel variance + theta0 - lower_bounds = np.hstack( - (self.options["sigma_bounds"][0], self.options["theta_bounds"][0]) - ) - upper_bounds = np.hstack( - (self.options["sigma_bounds"][1], self.options["theta_bounds"][1]) - ) - theta_ini = np.log10(theta_ini) - lower_bounds = np.log10(lower_bounds) - upper_bounds = np.log10(upper_bounds) - x_opt = theta_ini + theta_ini = np.hstack((self.options["sigma0"], + self.options["theta0"])) # Variance + initial theta values + lower_bounds = np.hstack((self.options["sigma_bounds"][0], + np.full(self.nx, self.options["theta_bounds"][0]))) + upper_bounds = np.hstack((self.options["sigma_bounds"][1], + np.full(self.nx, self.options["theta_bounds"][1]))) + # Apply log10 to theta_ini and bounds + nb_params = len(self.options["theta0"]) + theta_ini[:nb_params+1] = 
np.log10(theta_ini[:nb_params+1]) + lower_bounds[:nb_params+1] = np.log10(lower_bounds[:nb_params+1]) + upper_bounds[:nb_params+1] = np.log10(upper_bounds[:nb_params+1]) else: for lvl in range(self.lvl): if lvl == 0: # Initialize theta_ini for level 0 - theta_ini = np.hstack( - (self.options["sigma0"], self.options["theta0"][0]) - ) # Variance + initial theta values - lower_bounds = np.hstack( - ( - self.options["sigma_bounds"][0], - np.full(self.nx, self.options["theta_bounds"][0]), - ) - ) - upper_bounds = np.hstack( - ( - self.options["sigma_bounds"][1], - np.full(self.nx, self.options["theta_bounds"][1]), - ) - ) + theta_ini = np.hstack((self.options["sigma0"], + self.options["theta0"])) # Variance + initial theta values + lower_bounds = np.hstack((self.options["sigma_bounds"][0], + np.full(self.nx, self.options["theta_bounds"][0]))) + upper_bounds = np.hstack((self.options["sigma_bounds"][1], + np.full(self.nx, self.options["theta_bounds"][1]))) # Apply log10 to theta_ini and bounds nb_params = len(self.options["theta0"]) - theta_ini[: nb_params + 1] = np.log10(theta_ini[: nb_params + 1]) - lower_bounds[: nb_params + 1] = np.log10( - lower_bounds[: nb_params + 1] - ) - upper_bounds[: nb_params + 1] = np.log10( - upper_bounds[: nb_params + 1] - ) + theta_ini[:nb_params+1] = np.log10(theta_ini[:nb_params+1]) + lower_bounds[:nb_params+1] = np.log10(lower_bounds[:nb_params+1]) + upper_bounds[:nb_params+1] = np.log10(upper_bounds[:nb_params+1]) elif lvl > 0: # For additional levels, append to theta_ini, lower_bounds, and upper_bounds - thetat = np.hstack( - (self.options["sigma0"], self.options["theta0"][0]) - ) - lower_boundst = np.hstack( - ( - self.options["sigma_bounds"][0], - np.full(self.nx, self.options["theta_bounds"][0]), - ) - ) - upper_boundst = np.hstack( - ( - self.options["sigma_bounds"][1], - np.full(self.nx, self.options["theta_bounds"][1]), - ) - ) + thetat = np.hstack((self.options["sigma0"], + self.options["theta0"])) + lower_boundst = np.hstack((self.options["sigma_bounds"][0], + np.full(self.nx, self.options["theta_bounds"][0]))) + upper_boundst = np.hstack((self.options["sigma_bounds"][1], + np.full(self.nx, self.options["theta_bounds"][1]))) # Apply log10 to the newly added values thetat = np.log10(thetat) lower_boundst = np.log10(lower_boundst) upper_boundst = np.log10(upper_boundst) # Append to theta_ini, lower_bounds, and upper_bounds - theta_ini = np.hstack([theta_ini, thetat, self.options["rho0"]]) + theta_ini = np.hstack([theta_ini, thetat,self.options["rho0"]]) lower_bounds = np.hstack([lower_bounds, lower_boundst]) upper_bounds = np.hstack([upper_bounds, upper_boundst]) # Finally, append the rho bounds - lower_bounds = np.hstack( - [lower_bounds, self.options["rho_bounds"][0]] - ) - upper_bounds = np.hstack( - [upper_bounds, self.options["rho_bounds"][1]] - ) + lower_bounds = np.hstack([lower_bounds, self.options["rho_bounds"][0]]) + upper_bounds = np.hstack([upper_bounds, self.options["rho_bounds"][1]]) theta_ini = theta_ini[:].T x_opt = theta_ini @@ -168,9 +154,9 @@ def train(self): if self.options["hyper_opt"] == "Cobyla": if self.options["n_start"] > 1: sampling = LHS( - xlimits=np.stack((lower_bounds, upper_bounds), axis=1), - criterion="ese", - random_state=0, + xlimits = np.stack((lower_bounds, upper_bounds), axis=1), + criterion = "ese", + random_state = 0, ) theta_lhs_loops = sampling(self.options["n_start"]) theta0 = np.vstack((theta_ini, theta_lhs_loops)) @@ -185,13 +171,15 @@ def train(self): optimal_theta_res_loop = optimize.minimize( 
self.neg_log_likelihood_scipy, theta0[j, :], - method="COBYLA", - constraints=[{"fun": con, "type": "ineq"} for con in constraints], - options={ - "rhobeg": 0.1, - "tol": 1e-4, - "maxiter": 200, - }, + method = "COBYLA", + constraints = [ + {"fun": con, "type": "ineq"} for con in constraints + ], + options = { + "rhobeg": 0.2, + "tol": 1e-6, + "maxiter": 100, + } ) x_opt_iter = optimal_theta_res_loop.x @@ -213,7 +201,7 @@ def train(self): opt.set_lower_bounds(lower_bounds) # Lower bounds for each dimension opt.set_upper_bounds(upper_bounds) # Upper bounds for each dimension opt.set_min_objective(self.neg_log_likelihood_nlopt) - opt.set_maxeval(5000) + opt.set_maxeval(1000) opt.set_xtol_rel(1e-6) x0 = np.copy(theta_ini) x_opt = opt.optimize(x0) @@ -221,10 +209,12 @@ def train(self): raise ValueError( f"The optimizer {self.options['hyper_opt']} is not available" ) + x_opt[0] = 10**(x_opt[0]) # Apply 10** to Sigma 0 + x_opt[1:self.nx+1] = 10**(x_opt[1:self.nx+1]) # Apply 10** to length scales 0 + x_opt[self.nx+1 :: self.nx+2] = 10**(x_opt[self.nx+1 :: self.nx+2]) # Apply 10** to sigmas gamma - x_opt[0:2] = 10 ** (x_opt[0:2]) - x_opt[2::3] = 10 ** (x_opt[2::3]) - x_opt[3::3] = 10 ** (x_opt[3::3]) + for i in np.arange(self.nx+2, x_opt.shape[0]-1, self.nx+2): + x_opt[i:i+self.nx] = 10**x_opt[i:i+self.nx] self.optimal_theta = x_opt def eta(self, j, jp, rho): @@ -253,10 +243,13 @@ def compute_cross_K(self, x, xp, L, Lp, param): """ cov_value = 0.0 - param0 = param[0:2] - sigmas_gamma = param[2::3] - ls_gamma = param[3::3] - rho_values = param[4::3] + sigma_0 = param[0] + l_0 = param[1:self.nx+1] + #param0 = param[0 : self.nx+1] + sigmas_gamma = param[self.nx+1 :: self.nx+2] + l_s = [param[i:i+self.nx].tolist() for i in np.arange(self.nx+2, param.shape[0]-1, self.nx+2)] + #ls_gamma = param[3::3] + rho_values = param[2 + 2*self.nx::self.nx+2] # Sum of j=0 until l_^prime for j in range(Lp + 1): @@ -265,12 +258,11 @@ def compute_cross_K(self, x, xp, L, Lp, param): if j == 0: # Cov(γ_j(x), γ_j(x')) using the kernel for K_00 - cov_gamma_j = self._compute_K(x, xp, param0) + cov_gamma_j = self._compute_K(x, xp, [sigma_0,l_0]) else: # Cov(γ_j(x), γ_j(x')) using the kernel - cov_gamma_j = self._compute_K( - x, xp, [sigmas_gamma[j - 1], ls_gamma[j - 1]] - ) + cov_gamma_j = self._compute_K(x, xp, + [sigmas_gamma[j-1], l_s[j-1]]) # Add to the value of the covariance cov_value += eta_j_l * eta_j_lp * cov_gamma_j @@ -290,50 +282,49 @@ def predict_all_levels(self, x): covariances: (list, np.array) Returns the conditional covariance matrixes per level. 
""" - self.K = self.compute_blockwise_K(self.optimal_theta) means = [] covariances = [] + x = (x - self.X_offset) / self.X_scale + if self.lvl == 1: - k_XX = self._compute_K(self.X[0], self.X[0], self.optimal_theta[0:2]) - k_xX = self._compute_K(x, self.X[0], self.optimal_theta[0:2]) - k_xx = self._compute_K(x, x, self.optimal_theta[0:2]) - # To be adapted using the Cholesky decomposition - k_XX_inv = np.linalg.inv( - k_XX + self.options["nugget"] * np.eye(k_XX.shape[0]) - ) - means.append(np.dot(k_xX, np.matmul(k_XX_inv, self.y))) - covariances.append( - k_xx - np.matmul(k_xX, np.matmul(k_XX_inv, k_xX.transpose())) - ) + sigma = self.optimal_theta[0] # Apply 10** to Sigma 0 + l_s = [self.optimal_theta[1:self.nx+1]] # Apply 10** to length scales 0 + + k_XX = self._compute_K(self.X_norma_all[0], self.X_norma_all[0],[sigma, l_s[0]]) + k_xX = self._compute_K(x, self.X_norma_all[0], [sigma, l_s[0]]) + k_xx = self._compute_K(x,x,[sigma, l_s[0]]) + + L = np.linalg.cholesky(k_XX + self.options["nugget"] * np.eye(k_XX.shape[0])) + + beta1 = solve_triangular(L, k_xX.T, lower=True) + alpha1 = solve_triangular(L, self.y_norma_all, lower=True) + means.append(self.y_std * np.dot(beta1.T, alpha1)+ self.y_mean) + covariances.append(k_xx - np.dot(beta1.T, beta1)) + # k_XX_inv = np.linalg.inv(k_XX + self.options["nugget"]*np.eye(k_XX.shape[0])) + # means.append( np.dot(k_xX, np.matmul(k_XX_inv, self.y))) + # covariances.append(k_xx - np.matmul(k_xX, + # np.matmul(k_XX_inv, + # k_xX.transpose()))) else: - L = np.linalg.cholesky( - self.K + self.options["nugget"] * np.eye(self.K.shape[0]) - ) + self.K = self.compute_blockwise_K(self.optimal_theta) + L = np.linalg.cholesky(self.K + self.options["nugget"] * np.eye(self.K.shape[0])) k_xX = [] for ind in range(self.lvl): - k_xx = self.compute_cross_K(x, x, ind, ind, self.optimal_theta) + k_xx = self.compute_cross_K(x, x, ind, ind,self.optimal_theta) for j in range(self.lvl): if ind >= j: - k_xX.append( - self.compute_cross_K( - self.X[j], x, ind, j, self.optimal_theta - ) - ) + k_xX.append(self.compute_cross_K(self.X_norma_all[j], x, ind, j,self.optimal_theta)) else: - k_xX.append( - self.compute_cross_K( - self.X[j], x, j, ind, self.optimal_theta - ) - ) + k_xX.append(self.compute_cross_K(self.X_norma_all[j], x, j, ind,self.optimal_theta)) beta1 = solve_triangular(L, np.vstack(k_xX), lower=True) - alpha1 = solve_triangular(L, self.y, lower=True) - means.append(np.dot(beta1.T, alpha1)) + alpha1 = solve_triangular(L, self.y_norma_all, lower=True) + means.append(self.y_std * np.dot(beta1.T, alpha1)+ self.y_mean) covariances.append(k_xx - np.dot(beta1.T, beta1)) k_xX.clear() - return means, covariances + return means,covariances def _predict(self, x): """ @@ -351,20 +342,21 @@ def _predict(self, x): """ means, covariances = self.predict_all_levels(x) - return means[self.lvl - 1], covariances[self.lvl - 1] + return means[self.lvl-1], covariances[self.lvl-1] def neg_log_likelihood(self, param, grad=None): if self.lvl == 1: - self.K = self._compute_K(self.X[0], self.X[0], param[0:2]) + sigma = param[0] + l_s = [param[1: self.nx+1]] + self.K = self._compute_K(self.X[0], self.X[0], [sigma,l_s]) else: self.K = self.compute_blockwise_K(param) - L = np.linalg.cholesky( - self.K + self.options["nugget"] * np.eye(self.K.shape[0]) - ) - beta = solve_triangular(L, self.y, lower=True) - NMLL = 1 / 2 * (2 * np.sum(np.log(np.diag(L))) + np.dot(beta.T, beta)) - (nmll,) = NMLL[0] + reg_term = self.options['lambda'] * np.sum(np.power(param,2)) + L = np.linalg.cholesky(self.K + 
self.options["nugget"]*np.eye(self.K.shape[0])) + beta = solve_triangular(L, self.y_norma_all, lower=True) + NMLL = np.sum(np.log(np.diag(L))) + np.dot(beta.T,beta) + reg_term + nmll, = NMLL[0] return nmll def neg_log_likelihood_scipy(self, param): @@ -372,9 +364,12 @@ def neg_log_likelihood_scipy(self, param): Likelihood for Cobyla-scipy (SMT) optimizer """ param = np.array(param, copy=True) - param[0:2] = 10 ** (param[0:2]) - param[2::3] = 10 ** (param[2::3]) - param[3::3] = 10 ** (param[3::3]) + param[0] = 10**(param[0]) # Apply 10** to Sigma 0 + param[1:self.nx+1] = 10**(param[1:self.nx+1]) # Apply 10** to length scales 0 + param[self.nx+1 :: self.nx+2] = 10**(param[self.nx+1 :: self.nx+2]) # Apply 10** to sigmas gamma + + for i in np.arange(self.nx+2, param.shape[0]-1, self.nx+2): + param[i:i+self.nx] = 10**param[i:i+self.nx] return self.neg_log_likelihood(param) def neg_log_likelihood_nlopt(self, param, grad=None): @@ -382,34 +377,34 @@ def neg_log_likelihood_nlopt(self, param, grad=None): Likelihood for nlopt optimizers """ param = np.array(param, copy=True) - param[0:2] = 10 ** (param[0:2]) - param[2::3] = 10 ** (param[2::3]) - param[3::3] = 10 ** (param[3::3]) + param[0] = 10**(param[0]) # Apply 10** to Sigma 0 + param[1:self.nx+1] = 10**(param[1:self.nx+1]) # Apply 10** to length scales 0 + param[self.nx+1 :: self.nx+2] = 10**(param[self.nx+1 :: self.nx+2]) # Apply 10** to sigmas gamma + + for i in np.arange(self.nx+2, param.shape[0]-1, self.nx+2): + param[i:i+self.nx] = 10**param[i:i+self.nx] return self.neg_log_likelihood(param, grad) + def compute_blockwise_K(self, param): K_block = {} n = self.y.shape[0] for jp in range(self.lvl): for j in range(self.lvl): if jp >= j: - K_block[str(jp) + str(j)] = self.compute_cross_K( - self.X[j], self.X[jp], jp, j, param - ) + K_block[str(jp)+str(j)] = self.compute_cross_K(self.X_norma_all[j], + self.X_norma_all[jp], + jp, j, param) K = np.zeros((n, n)) row_init, col_init = 0, 0 for j in range(self.lvl): col_init = row_init for jp in range(j, self.lvl): - r, c = K_block[str(jp) + str(j)].shape - K[row_init : row_init + r, col_init : col_init + c] = K_block[ - str(jp) + str(j) - ] + r, c = K_block[str(jp)+str(j)].shape + K[row_init:row_init+r, col_init:col_init+c] = K_block[str(jp)+str(j)] if j != jp: - K[col_init : col_init + c, row_init : row_init + r] = K_block[ - str(jp) + str(j) - ].T + K[col_init:col_init+c, row_init:row_init+r] = K_block[str(jp)+str(j)].T col_init += c row_init += r return K @@ -421,13 +416,10 @@ def _compute_K(self, A: np.ndarray, B: np.ndarray, param): """ # Compute pairwise componentwise L1-distances between A and B dx = differences(A, B) - d = componentwise_distance( - dx, - self.options["corr"], - self.X[0].shape[1], - power=self.options["pow_exp_power"], - ) - self.corr.theta = np.full(self.X[0].shape[1], param[1]) + d = componentwise_distance(dx, self.options["corr"], + self.X[0].shape[1], + power=self.options["pow_exp_power"]) + self.corr.theta = np.asarray(param[1]) r = self.corr(d) R = r.reshape(A.shape[0], B.shape[0]) K = param[0] * R