From 334dc52f398bdc355544731c1a4fe80c87509756 Mon Sep 17 00:00:00 2001 From: mcastanoUQ Date: Tue, 10 Dec 2024 09:51:02 +0100 Subject: [PATCH] Adding anisotropic Kernel computation and regularization option in the likelihood --- smt/applications/mfck.py | 296 +++++++++++++++++++-------------------- 1 file changed, 144 insertions(+), 152 deletions(-) diff --git a/smt/applications/mfck.py b/smt/applications/mfck.py index a42fc4fe6..88fd14e18 100644 --- a/smt/applications/mfck.py +++ b/smt/applications/mfck.py @@ -13,14 +13,14 @@ Conference on Neural Information Processing Systems. """ -# import warnings +#import warnings import numpy as np from scipy.linalg import solve_triangular from scipy import optimize from smt.sampling_methods import LHS -from smt.utils.kriging import differences, componentwise_distance +from smt.utils.kriging import (differences, componentwise_distance) from smt.surrogate_models.krg_based import KrgBased - +from smt.utils.misc import standardization class MFCK(KrgBased): def _initialize(self): @@ -38,7 +38,7 @@ def _initialize(self): ) declare( "rho_bounds", - [-5.0, 5.0], + [-5., 5.], types=(list, np.ndarray), desc="Bounds for the rho parameter used in the autoregressive model", ) @@ -50,18 +50,20 @@ def _initialize(self): ) declare( "sigma_bounds", - [1e-1, 100], + [1e-2, 100], types=(list, np.ndarray), desc="Bounds for the variance parameter", ) - - self.options["nugget"] = ( - 1e-9 # Incresing the nugget for numerical stability reasons - ) - self.options["hyper_opt"] = ( - "Cobyla" # MFCK doesn't support gradient-based optimizers + declare( + "lambda", + 0.1, + types=(float), + desc="Regularization parameter", ) + self.options["nugget"] = 1e-9 # Incresing the nugget for numerical stability reasons + self.options["hyper_opt"] = "Cobyla" # MFCK doesn't support gradient-based optimizers + def train(self): """ Overrides MFK implementation @@ -79,88 +81,72 @@ def train(self): i = i + 1 xt.append(self.training_points[None][0][0]) yt.append(self.training_points[None][0][1]) - self.lvl = i + 1 + self.lvl = i+1 self.X = xt self.y = np.vstack(yt) self._check_param() - self.nx = 1 # Forcing this in order to consider isotropic kernels (i.e., only one lengthscale) + + ( + _, + _, + self.X_offset, + self.y_mean, + self.X_scale, + self.y_std, + ) = standardization(np.concatenate(xt, axis=0), np.concatenate(yt, axis=0)) + + self.X_norma_all = [(x - self.X_offset) / self.X_scale for x in xt] + self.y_norma_all = np.vstack([(f - self.y_mean) / self.y_std for f in yt]) + if self.lvl == 1: # For a single level, initialize theta_ini, lower_bounds, and # upper_bounds with consistent shapes - theta_ini = np.hstack( - (self.options["sigma0"], self.options["theta0"][0]) - ) # Kernel variance + theta0 - lower_bounds = np.hstack( - (self.options["sigma_bounds"][0], self.options["theta_bounds"][0]) - ) - upper_bounds = np.hstack( - (self.options["sigma_bounds"][1], self.options["theta_bounds"][1]) - ) - theta_ini = np.log10(theta_ini) - lower_bounds = np.log10(lower_bounds) - upper_bounds = np.log10(upper_bounds) - x_opt = theta_ini + theta_ini = np.hstack((self.options["sigma0"], + self.options["theta0"])) # Variance + initial theta values + lower_bounds = np.hstack((self.options["sigma_bounds"][0], + np.full(self.nx, self.options["theta_bounds"][0]))) + upper_bounds = np.hstack((self.options["sigma_bounds"][1], + np.full(self.nx, self.options["theta_bounds"][1]))) + # Apply log10 to theta_ini and bounds + nb_params = len(self.options["theta0"]) + theta_ini[:nb_params+1] = 
np.log10(theta_ini[:nb_params+1]) + lower_bounds[:nb_params+1] = np.log10(lower_bounds[:nb_params+1]) + upper_bounds[:nb_params+1] = np.log10(upper_bounds[:nb_params+1]) else: for lvl in range(self.lvl): if lvl == 0: # Initialize theta_ini for level 0 - theta_ini = np.hstack( - (self.options["sigma0"], self.options["theta0"][0]) - ) # Variance + initial theta values - lower_bounds = np.hstack( - ( - self.options["sigma_bounds"][0], - np.full(self.nx, self.options["theta_bounds"][0]), - ) - ) - upper_bounds = np.hstack( - ( - self.options["sigma_bounds"][1], - np.full(self.nx, self.options["theta_bounds"][1]), - ) - ) + theta_ini = np.hstack((self.options["sigma0"], + self.options["theta0"])) # Variance + initial theta values + lower_bounds = np.hstack((self.options["sigma_bounds"][0], + np.full(self.nx, self.options["theta_bounds"][0]))) + upper_bounds = np.hstack((self.options["sigma_bounds"][1], + np.full(self.nx, self.options["theta_bounds"][1]))) # Apply log10 to theta_ini and bounds nb_params = len(self.options["theta0"]) - theta_ini[: nb_params + 1] = np.log10(theta_ini[: nb_params + 1]) - lower_bounds[: nb_params + 1] = np.log10( - lower_bounds[: nb_params + 1] - ) - upper_bounds[: nb_params + 1] = np.log10( - upper_bounds[: nb_params + 1] - ) + theta_ini[:nb_params+1] = np.log10(theta_ini[:nb_params+1]) + lower_bounds[:nb_params+1] = np.log10(lower_bounds[:nb_params+1]) + upper_bounds[:nb_params+1] = np.log10(upper_bounds[:nb_params+1]) elif lvl > 0: # For additional levels, append to theta_ini, lower_bounds, and upper_bounds - thetat = np.hstack( - (self.options["sigma0"], self.options["theta0"][0]) - ) - lower_boundst = np.hstack( - ( - self.options["sigma_bounds"][0], - np.full(self.nx, self.options["theta_bounds"][0]), - ) - ) - upper_boundst = np.hstack( - ( - self.options["sigma_bounds"][1], - np.full(self.nx, self.options["theta_bounds"][1]), - ) - ) + thetat = np.hstack((self.options["sigma0"], + self.options["theta0"])) + lower_boundst = np.hstack((self.options["sigma_bounds"][0], + np.full(self.nx, self.options["theta_bounds"][0]))) + upper_boundst = np.hstack((self.options["sigma_bounds"][1], + np.full(self.nx, self.options["theta_bounds"][1]))) # Apply log10 to the newly added values thetat = np.log10(thetat) lower_boundst = np.log10(lower_boundst) upper_boundst = np.log10(upper_boundst) # Append to theta_ini, lower_bounds, and upper_bounds - theta_ini = np.hstack([theta_ini, thetat, self.options["rho0"]]) + theta_ini = np.hstack([theta_ini, thetat,self.options["rho0"]]) lower_bounds = np.hstack([lower_bounds, lower_boundst]) upper_bounds = np.hstack([upper_bounds, upper_boundst]) # Finally, append the rho bounds - lower_bounds = np.hstack( - [lower_bounds, self.options["rho_bounds"][0]] - ) - upper_bounds = np.hstack( - [upper_bounds, self.options["rho_bounds"][1]] - ) + lower_bounds = np.hstack([lower_bounds, self.options["rho_bounds"][0]]) + upper_bounds = np.hstack([upper_bounds, self.options["rho_bounds"][1]]) theta_ini = theta_ini[:].T x_opt = theta_ini @@ -168,9 +154,9 @@ def train(self): if self.options["hyper_opt"] == "Cobyla": if self.options["n_start"] > 1: sampling = LHS( - xlimits=np.stack((lower_bounds, upper_bounds), axis=1), - criterion="ese", - random_state=0, + xlimits = np.stack((lower_bounds, upper_bounds), axis=1), + criterion = "ese", + random_state = 0, ) theta_lhs_loops = sampling(self.options["n_start"]) theta0 = np.vstack((theta_ini, theta_lhs_loops)) @@ -185,13 +171,15 @@ def train(self): optimal_theta_res_loop = optimize.minimize( 
self.neg_log_likelihood_scipy, theta0[j, :], - method="COBYLA", - constraints=[{"fun": con, "type": "ineq"} for con in constraints], - options={ - "rhobeg": 0.1, - "tol": 1e-4, - "maxiter": 200, - }, + method = "COBYLA", + constraints = [ + {"fun": con, "type": "ineq"} for con in constraints + ], + options = { + "rhobeg": 0.2, + "tol": 1e-6, + "maxiter": 100, + } ) x_opt_iter = optimal_theta_res_loop.x @@ -213,7 +201,7 @@ def train(self): opt.set_lower_bounds(lower_bounds) # Lower bounds for each dimension opt.set_upper_bounds(upper_bounds) # Upper bounds for each dimension opt.set_min_objective(self.neg_log_likelihood_nlopt) - opt.set_maxeval(5000) + opt.set_maxeval(1000) opt.set_xtol_rel(1e-6) x0 = np.copy(theta_ini) x_opt = opt.optimize(x0) @@ -221,10 +209,12 @@ def train(self): raise ValueError( f"The optimizer {self.options['hyper_opt']} is not available" ) + x_opt[0] = 10**(x_opt[0]) # Apply 10** to Sigma 0 + x_opt[1:self.nx+1] = 10**(x_opt[1:self.nx+1]) # Apply 10** to length scales 0 + x_opt[self.nx+1 :: self.nx+2] = 10**(x_opt[self.nx+1 :: self.nx+2]) # Apply 10** to sigmas gamma - x_opt[0:2] = 10 ** (x_opt[0:2]) - x_opt[2::3] = 10 ** (x_opt[2::3]) - x_opt[3::3] = 10 ** (x_opt[3::3]) + for i in np.arange(self.nx+2, x_opt.shape[0]-1, self.nx+2): + x_opt[i:i+self.nx] = 10**x_opt[i:i+self.nx] self.optimal_theta = x_opt def eta(self, j, jp, rho): @@ -253,10 +243,13 @@ def compute_cross_K(self, x, xp, L, Lp, param): """ cov_value = 0.0 - param0 = param[0:2] - sigmas_gamma = param[2::3] - ls_gamma = param[3::3] - rho_values = param[4::3] + sigma_0 = param[0] + l_0 = param[1:self.nx+1] + #param0 = param[0 : self.nx+1] + sigmas_gamma = param[self.nx+1 :: self.nx+2] + l_s = [param[i:i+self.nx].tolist() for i in np.arange(self.nx+2, param.shape[0]-1, self.nx+2)] + #ls_gamma = param[3::3] + rho_values = param[2 + 2*self.nx::self.nx+2] # Sum of j=0 until l_^prime for j in range(Lp + 1): @@ -265,12 +258,11 @@ def compute_cross_K(self, x, xp, L, Lp, param): if j == 0: # Cov(γ_j(x), γ_j(x')) using the kernel for K_00 - cov_gamma_j = self._compute_K(x, xp, param0) + cov_gamma_j = self._compute_K(x, xp, [sigma_0,l_0]) else: # Cov(γ_j(x), γ_j(x')) using the kernel - cov_gamma_j = self._compute_K( - x, xp, [sigmas_gamma[j - 1], ls_gamma[j - 1]] - ) + cov_gamma_j = self._compute_K(x, xp, + [sigmas_gamma[j-1], l_s[j-1]]) # Add to the value of the covariance cov_value += eta_j_l * eta_j_lp * cov_gamma_j @@ -290,50 +282,49 @@ def predict_all_levels(self, x): covariances: (list, np.array) Returns the conditional covariance matrixes per level. 
""" - self.K = self.compute_blockwise_K(self.optimal_theta) means = [] covariances = [] + x = (x - self.X_offset) / self.X_scale + if self.lvl == 1: - k_XX = self._compute_K(self.X[0], self.X[0], self.optimal_theta[0:2]) - k_xX = self._compute_K(x, self.X[0], self.optimal_theta[0:2]) - k_xx = self._compute_K(x, x, self.optimal_theta[0:2]) - # To be adapted using the Cholesky decomposition - k_XX_inv = np.linalg.inv( - k_XX + self.options["nugget"] * np.eye(k_XX.shape[0]) - ) - means.append(np.dot(k_xX, np.matmul(k_XX_inv, self.y))) - covariances.append( - k_xx - np.matmul(k_xX, np.matmul(k_XX_inv, k_xX.transpose())) - ) + sigma = self.optimal_theta[0] # Apply 10** to Sigma 0 + l_s = [self.optimal_theta[1:self.nx+1]] # Apply 10** to length scales 0 + + k_XX = self._compute_K(self.X_norma_all[0], self.X_norma_all[0],[sigma, l_s[0]]) + k_xX = self._compute_K(x, self.X_norma_all[0], [sigma, l_s[0]]) + k_xx = self._compute_K(x,x,[sigma, l_s[0]]) + + L = np.linalg.cholesky(k_XX + self.options["nugget"] * np.eye(k_XX.shape[0])) + + beta1 = solve_triangular(L, k_xX.T, lower=True) + alpha1 = solve_triangular(L, self.y_norma_all, lower=True) + means.append(self.y_std * np.dot(beta1.T, alpha1)+ self.y_mean) + covariances.append(k_xx - np.dot(beta1.T, beta1)) + # k_XX_inv = np.linalg.inv(k_XX + self.options["nugget"]*np.eye(k_XX.shape[0])) + # means.append( np.dot(k_xX, np.matmul(k_XX_inv, self.y))) + # covariances.append(k_xx - np.matmul(k_xX, + # np.matmul(k_XX_inv, + # k_xX.transpose()))) else: - L = np.linalg.cholesky( - self.K + self.options["nugget"] * np.eye(self.K.shape[0]) - ) + self.K = self.compute_blockwise_K(self.optimal_theta) + L = np.linalg.cholesky(self.K + self.options["nugget"] * np.eye(self.K.shape[0])) k_xX = [] for ind in range(self.lvl): - k_xx = self.compute_cross_K(x, x, ind, ind, self.optimal_theta) + k_xx = self.compute_cross_K(x, x, ind, ind,self.optimal_theta) for j in range(self.lvl): if ind >= j: - k_xX.append( - self.compute_cross_K( - self.X[j], x, ind, j, self.optimal_theta - ) - ) + k_xX.append(self.compute_cross_K(self.X_norma_all[j], x, ind, j,self.optimal_theta)) else: - k_xX.append( - self.compute_cross_K( - self.X[j], x, j, ind, self.optimal_theta - ) - ) + k_xX.append(self.compute_cross_K(self.X_norma_all[j], x, j, ind,self.optimal_theta)) beta1 = solve_triangular(L, np.vstack(k_xX), lower=True) - alpha1 = solve_triangular(L, self.y, lower=True) - means.append(np.dot(beta1.T, alpha1)) + alpha1 = solve_triangular(L, self.y_norma_all, lower=True) + means.append(self.y_std * np.dot(beta1.T, alpha1)+ self.y_mean) covariances.append(k_xx - np.dot(beta1.T, beta1)) k_xX.clear() - return means, covariances + return means,covariances def _predict(self, x): """ @@ -351,20 +342,21 @@ def _predict(self, x): """ means, covariances = self.predict_all_levels(x) - return means[self.lvl - 1], covariances[self.lvl - 1] + return means[self.lvl-1], covariances[self.lvl-1] def neg_log_likelihood(self, param, grad=None): if self.lvl == 1: - self.K = self._compute_K(self.X[0], self.X[0], param[0:2]) + sigma = param[0] + l_s = [param[1: self.nx+1]] + self.K = self._compute_K(self.X[0], self.X[0], [sigma,l_s]) else: self.K = self.compute_blockwise_K(param) - L = np.linalg.cholesky( - self.K + self.options["nugget"] * np.eye(self.K.shape[0]) - ) - beta = solve_triangular(L, self.y, lower=True) - NMLL = 1 / 2 * (2 * np.sum(np.log(np.diag(L))) + np.dot(beta.T, beta)) - (nmll,) = NMLL[0] + reg_term = self.options['lambda'] * np.sum(np.power(param,2)) + L = np.linalg.cholesky(self.K + 
self.options["nugget"]*np.eye(self.K.shape[0])) + beta = solve_triangular(L, self.y_norma_all, lower=True) + NMLL = np.sum(np.log(np.diag(L))) + np.dot(beta.T,beta) + reg_term + nmll, = NMLL[0] return nmll def neg_log_likelihood_scipy(self, param): @@ -372,9 +364,12 @@ def neg_log_likelihood_scipy(self, param): Likelihood for Cobyla-scipy (SMT) optimizer """ param = np.array(param, copy=True) - param[0:2] = 10 ** (param[0:2]) - param[2::3] = 10 ** (param[2::3]) - param[3::3] = 10 ** (param[3::3]) + param[0] = 10**(param[0]) # Apply 10** to Sigma 0 + param[1:self.nx+1] = 10**(param[1:self.nx+1]) # Apply 10** to length scales 0 + param[self.nx+1 :: self.nx+2] = 10**(param[self.nx+1 :: self.nx+2]) # Apply 10** to sigmas gamma + + for i in np.arange(self.nx+2, param.shape[0]-1, self.nx+2): + param[i:i+self.nx] = 10**param[i:i+self.nx] return self.neg_log_likelihood(param) def neg_log_likelihood_nlopt(self, param, grad=None): @@ -382,34 +377,34 @@ def neg_log_likelihood_nlopt(self, param, grad=None): Likelihood for nlopt optimizers """ param = np.array(param, copy=True) - param[0:2] = 10 ** (param[0:2]) - param[2::3] = 10 ** (param[2::3]) - param[3::3] = 10 ** (param[3::3]) + param[0] = 10**(param[0]) # Apply 10** to Sigma 0 + param[1:self.nx+1] = 10**(param[1:self.nx+1]) # Apply 10** to length scales 0 + param[self.nx+1 :: self.nx+2] = 10**(param[self.nx+1 :: self.nx+2]) # Apply 10** to sigmas gamma + + for i in np.arange(self.nx+2, param.shape[0]-1, self.nx+2): + param[i:i+self.nx] = 10**param[i:i+self.nx] return self.neg_log_likelihood(param, grad) + def compute_blockwise_K(self, param): K_block = {} n = self.y.shape[0] for jp in range(self.lvl): for j in range(self.lvl): if jp >= j: - K_block[str(jp) + str(j)] = self.compute_cross_K( - self.X[j], self.X[jp], jp, j, param - ) + K_block[str(jp)+str(j)] = self.compute_cross_K(self.X_norma_all[j], + self.X_norma_all[jp], + jp, j, param) K = np.zeros((n, n)) row_init, col_init = 0, 0 for j in range(self.lvl): col_init = row_init for jp in range(j, self.lvl): - r, c = K_block[str(jp) + str(j)].shape - K[row_init : row_init + r, col_init : col_init + c] = K_block[ - str(jp) + str(j) - ] + r, c = K_block[str(jp)+str(j)].shape + K[row_init:row_init+r, col_init:col_init+c] = K_block[str(jp)+str(j)] if j != jp: - K[col_init : col_init + c, row_init : row_init + r] = K_block[ - str(jp) + str(j) - ].T + K[col_init:col_init+c, row_init:row_init+r] = K_block[str(jp)+str(j)].T col_init += c row_init += r return K @@ -421,13 +416,10 @@ def _compute_K(self, A: np.ndarray, B: np.ndarray, param): """ # Compute pairwise componentwise L1-distances between A and B dx = differences(A, B) - d = componentwise_distance( - dx, - self.options["corr"], - self.X[0].shape[1], - power=self.options["pow_exp_power"], - ) - self.corr.theta = np.full(self.X[0].shape[1], param[1]) + d = componentwise_distance(dx, self.options["corr"], + self.X[0].shape[1], + power=self.options["pow_exp_power"]) + self.corr.theta = np.asarray(param[1]) r = self.corr(d) R = r.reshape(A.shape[0], B.shape[0]) K = param[0] * R