diff --git a/doc/_src_docs/surrogate_models.rst b/doc/_src_docs/surrogate_models.rst
index 324c4c5ea..11f77ca80 100644
--- a/doc/_src_docs/surrogate_models.rst
+++ b/doc/_src_docs/surrogate_models.rst
@@ -18,6 +18,7 @@ SMT contains the surrogate modeling methods listed below.
    surrogate_models/gekpls
    surrogate_models/genn
    surrogate_models/mgp
+   surrogate_models/sgp
 
 Usage
diff --git a/doc/_src_docs/surrogate_models.rstx b/doc/_src_docs/surrogate_models.rstx
index 43f8f8c82..6e0b640f4 100644
--- a/doc/_src_docs/surrogate_models.rstx
+++ b/doc/_src_docs/surrogate_models.rstx
@@ -18,6 +18,7 @@ SMT contains the surrogate modeling methods listed below.
    surrogate_models/gekpls
    surrogate_models/genn
    surrogate_models/mgp
+   surrogate_models/sgp
 
 Usage
diff --git a/doc/_src_docs/surrogate_models/sgp.rst b/doc/_src_docs/surrogate_models/sgp.rst
new file mode 100644
index 000000000..07ff82c14
--- /dev/null
+++ b/doc/_src_docs/surrogate_models/sgp.rst
@@ -0,0 +1,371 @@
+Sparse Gaussian Process (SGP)
+=============================
+
+Despite the versatility of Gaussian Process regression models for learning complex data, their computational complexity,
+which is :math:`\mathcal{O}(N^3)` with :math:`N` the number of training points, prevents their use on large datasets.
+This complexity results from the inversion of the covariance matrix :math:`\mathbf{K}`. We must also highlight that the memory
+cost of GPR models is :math:`\mathcal{O}(N^2)`, mainly due to the storage of the covariance matrix itself.
+
+To address these limitations, sparse GP approximation methods have emerged as efficient alternatives.
+Sparse GPs consider a set of inducing points to approximate the posterior Gaussian distribution with a low-rank representation,
+while variational inference provides a framework for approximating the posterior distribution directly.
+Thus, these methods enable accurate modeling of large datasets while preserving computational efficiency
+(typically :math:`\mathcal{O}(NM^2)` time and :math:`\mathcal{O}(NM)` memory for some chosen :math:`M \ll N` inducing points).
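A minimal usage sketch, distilled from the ``test_sgp_fitc`` example added later in this patch, showing the typical call sequence; the objective function, training-set size, noise level and number of inducing points are illustrative choices rather than values prescribed by SMT::

    import numpy as np
    from smt.surrogate_models import SGP

    # noisy observations of a toy 1D function
    rng = np.random.RandomState(0)
    xt = 2 * rng.rand(100, 1) - 1
    yt = np.sin(3 * np.pi * xt) + rng.normal(scale=0.1, size=(100, 1))

    # choose 20 inducing inputs among the training points
    Z = xt[rng.permutation(100)[:20]].copy()

    sgp = SGP(noise0=[0.01])      # FITC is the default sparse method
    sgp.set_training_values(xt, yt)
    sgp.set_inducing_inputs(Z=Z)  # omit Z to pick inducing points randomly
    sgp.train()

    x = np.linspace(-1, 1, 200).reshape(-1, 1)
    mean = sgp.predict_values(x)          # posterior mean
    variance = sgp.predict_variances(x)   # posterior variance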
+Options
+-------
+
+.. list-table:: List of options
+   :header-rows: 1
+
+   *  - Option
+      - Default
+      - Acceptable values
+      - Acceptable types
+      - Description
+   *  - categorical_kernel
+      - ...
+      - [, , , ]
+      - None
+      - The kernel to use for categorical inputs. Only for non continuous Kriging
+   *  - hierarchical_kernel
+      - MixHrcKernelType.ALG_KERNEL
+      - [, ]
+      - None
+      - The kernel to use for mixed hierarchical inputs. Only for non continuous Kriging
+   *  - nugget
+      - 1e-08
+      - None
+      - ['float']
+      - a jitter for numerical stability
+   *  - theta0
+      - [0.01]
+      - None
+      - ['list', 'ndarray']
+      - Initial hyperparameters
+   *  - theta_bounds
+      - [1e-06, 100.0]
+      - None
+      - ['list', 'ndarray']
+      - bounds for hyperparameters
+   *  - hyper_opt
+      - Cobyla
+      - ['Cobyla', 'TNC']
+      - ['str']
+      - Optimiser for hyperparameters optimisation
+   *  - eval_noise
+      - True
+      - [True, False]
+      - ['bool']
+      - Noise is always evaluated
+   *  - noise0
+      - [0.01]
+      - None
+      - ['list', 'ndarray']
+      - Gaussian noise on observed training data
+   *  - noise_bounds
+      - [2.220446049250313e-14, 10000000000.0]
+      - None
+      - ['list', 'ndarray']
+      - bounds for noise hyperparameters
+   *  - use_het_noise
+      - False
+      - [True, False]
+      - ['bool']
+      - heteroscedastic noise evaluation flag
+   *  - n_start
+      - 10
+      - None
+      - ['int']
+      - number of optimizer runs (multistart method)
+   *  - xlimits
+      - None
+      - None
+      - ['list', 'ndarray']
+      - definition of a design space of float (continuous) variables: array-like of size nx x 2 (lower, upper bounds)
+   *  - design_space
+      - None
+      - None
+      - ['BaseDesignSpace', 'list', 'ndarray']
+      - definition of the (hierarchical) design space: use `smt.utils.design_space.DesignSpace` as the main API. Also accepts list of float variable bounds
+   *  - method
+      - FITC
+      - ['FITC', 'VFE']
+      - ['str']
+      - Method used by sparse GP model
+   *  - n_inducing
+      - 10
+      - None
+      - ['int']
+      - Number of inducing inputs
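The SGP-specific options in the table above are ``method`` and ``n_inducing``; a short sketch of how one might select the approximation (the value 30 is an arbitrary illustration, the defaults being FITC and 10)::

    from smt.surrogate_models import SGP

    sgp_fitc = SGP()                            # FITC approximation, 10 inducing inputs
    sgp_vfe = SGP(method="VFE", n_inducing=30)  # VFE approximation with a larger inducing set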
diff --git a/doc/_src_docs/surrogate_models/sgp.rstx b/doc/_src_docs/surrogate_models/sgp.rstx
new file mode 100644
index 000000000..03f9c5f28
--- /dev/null
+++ b/doc/_src_docs/surrogate_models/sgp.rstx
@@ -0,0 +1,76 @@
+Sparse Gaussian Process (SGP)
+=============================
+
+Despite the versatility of Gaussian Process regression models for learning complex data, their computational complexity,
+which is :math:`\mathcal{O}(N^3)` with :math:`N` the number of training points, prevents their use on large datasets.
+This complexity results from the inversion of the covariance matrix :math:`\mathbf{K}`. We must also highlight that the memory
+cost of GPR models is :math:`\mathcal{O}(N^2)`, mainly due to the storage of the covariance matrix itself.
+
+To address these limitations, sparse GP approximation methods have emerged as efficient alternatives.
+Sparse GPs consider a set of inducing points to approximate the posterior Gaussian distribution with a low-rank representation,
+while variational inference provides a framework for approximating the posterior distribution directly.
+Thus, these methods enable accurate modeling of large datasets while preserving computational efficiency
+(typically :math:`\mathcal{O}(NM^2)` time and :math:`\mathcal{O}(NM)` memory for some chosen :math:`M \ll N` inducing points).
diff --git a/smt/surrogate_models/sgp.py b/smt/surrogate_models/sgp.py
-        # print(">>>>>>> lkh=", likelihood)
+        # print(">>>>>>> MLL=", likelihood)
         return likelihood, params
 
     def _fitc(self, X, Y, Z, theta, sigma2, nugget):
@@ -176,8 +203,11 @@ def _fitc(self, X, Y, Z, theta, sigma2, nugget):
         Ui = linalg.inv(U)
         V = Ui @ Kmn
 
+        # Assumption on the gaussian noise on training outputs
+        eta2 = np.array(self.options["noise0"])
+
         # Compute diagonal correction: nu = Knn_diag - Qnn_diag + \eta^2
-        nu = Knn - np.sum(np.square(V), 0) + np.array(self.options["noise0"])
+        nu = Knn - np.sum(np.square(V), 0) + eta2
 
         # Compute beta, the effective noise precision
         beta = 1.0 / nu
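The hunk above adds the observation-noise variance ``eta2`` to the FITC diagonal correction, i.e. ``nu = Knn_diag - Qnn_diag + eta2`` followed by ``beta = 1 / nu``. A self-contained sketch of that correction, where the ``rbf`` kernel, the data sizes and the noise variance are illustrative stand-ins rather than SMT internals::

    import numpy as np

    def rbf(A, B, theta=1.0):
        # isotropic squared-exponential kernel (illustrative choice)
        d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
        return np.exp(-theta * d2)

    rng = np.random.RandomState(0)
    X = rng.rand(50, 1)    # training inputs
    Z = X[:10]             # inducing inputs
    eta2 = 0.01            # assumed Gaussian noise variance on the outputs

    Kmm = rbf(Z, Z) + 1e-8 * np.eye(len(Z))  # jitter for numerical stability
    Kmn = rbf(Z, X)
    Knn_diag = np.ones(len(X))               # k(x, x) = 1 for this kernel

    # low-rank term Qnn = Kmn^T Kmm^{-1} Kmn; only its diagonal is needed
    L = np.linalg.cholesky(Kmm)
    V = np.linalg.solve(L, Kmn)              # Qnn_diag is the column-wise sum of V**2
    Qnn_diag = np.sum(V**2, axis=0)

    # FITC diagonal correction and effective noise precision, as in the hunk above
    nu = Knn_diag - Qnn_diag + eta2
    beta = 1.0 / nu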
diff --git a/smt/surrogate_models/tests/test_surrogate_model_examples.py b/smt/surrogate_models/tests/test_surrogate_model_examples.py
index b6adf44ed..5b892fc36 100644
--- a/smt/surrogate_models/tests/test_surrogate_model_examples.py
+++ b/smt/surrogate_models/tests/test_surrogate_model_examples.py
@@ -620,6 +620,113 @@ def fun(x):
         fig.subplots_adjust(top=0.74)
         plt.show()
 
+    def test_sgp_fitc(self):
+        import numpy as np
+        import matplotlib.pyplot as plt
+
+        from smt.surrogate_models import SGP
+
+        def f_obj(x):
+            import numpy as np
+
+            return (
+                np.sin(3 * np.pi * x)
+                + 0.3 * np.cos(9 * np.pi * x)
+                + 0.5 * np.sin(7 * np.pi * x)
+            )
+
+        # random generator for reproducibility
+        rng = np.random.RandomState(0)
+
+        # Generate training data
+        nt = 200
+        # Variance of the gaussian noise on our training data
+        eta2 = [0.01]
+        gaussian_noise = rng.normal(loc=0.0, scale=np.sqrt(eta2), size=(nt, 1))
+        xt = 2 * rng.rand(nt, 1) - 1
+        yt = f_obj(xt) + gaussian_noise
+
+        # Pick inducing points randomly in training data
+        n_inducing = 30
+        random_idx = rng.permutation(nt)[:n_inducing]
+        Z = xt[random_idx].copy()
+
+        sgp = SGP(noise0=eta2)  # Assume here we have an idea of the variance eta2
+        sgp.set_training_values(xt, yt)
+        sgp.set_inducing_inputs(Z=Z)
+        # sgp.set_inducing_inputs()  # When Z is not specified, inducing points are picked randomly in the training data
+        sgp.train()
+
+        x = np.linspace(-1, 1, nt + 1).reshape(-1, 1)
+        y = f_obj(x)
+        hat_y = sgp.predict_values(x)
+        var = sgp.predict_variances(x)
+
+        # plot prediction
+        plt.figure(figsize=(14, 6))
+        plt.plot(x, y, "C1-", label="target function")
+        plt.scatter(xt, yt, marker="o", s=10, label="observed data")
+        plt.plot(x, hat_y, "k-", label="Sparse GP")
+        plt.plot(x, hat_y - 3 * np.sqrt(var), "k--")
+        plt.plot(x, hat_y + 3 * np.sqrt(var), "k--", label="99% CI")
+        plt.plot(Z, -2.9 * np.ones_like(Z), "r|", mew=2, label="inducing points")
+        plt.ylim([-3, 3])
+        plt.legend(loc=0)
+        plt.show()
+
+    def test_sgp_vfe(self):
+        import numpy as np
+        import matplotlib.pyplot as plt
+
+        from smt.surrogate_models import SGP
+
+        def f_obj(x):
+            import numpy as np
+
+            return (
+                np.sin(3 * np.pi * x)
+                + 0.3 * np.cos(9 * np.pi * x)
+                + 0.5 * np.sin(7 * np.pi * x)
+            )
+
+        # random generator for reproducibility
+        rng = np.random.RandomState(42)
+
+        # Generate training data
+        nt = 200
+        # Variance of the gaussian noise on our training data
+        eta2 = [0.01]
+        gaussian_noise = rng.normal(loc=0.0, scale=np.sqrt(eta2), size=(nt, 1))
+        xt = 2 * rng.rand(nt, 1) - 1
+        yt = f_obj(xt) + gaussian_noise
+
+        # Pick inducing points randomly in training data
+        n_inducing = 30
+        random_idx = rng.permutation(nt)[:n_inducing]
+        Z = xt[random_idx].copy()
+
+        sgp = SGP(noise0=eta2, method="VFE")
+        sgp.set_training_values(xt, yt)
+        sgp.set_inducing_inputs(Z=Z)
+        sgp.train()
+
+        x = np.linspace(-1, 1, nt + 1).reshape(-1, 1)
+        y = f_obj(x)
+        hat_y = sgp.predict_values(x)
+        var = sgp.predict_variances(x)
+
+        # plot prediction
+        plt.figure(figsize=(14, 6))
+        plt.plot(x, y, "C1-", label="target function")
+        plt.scatter(xt, yt, marker="o", s=10, label="observed data")
+        plt.plot(x, hat_y, "k-", label="Sparse GP")
+        plt.plot(x, hat_y - 3 * np.sqrt(var), "k--")
+        plt.plot(x, hat_y + 3 * np.sqrt(var), "k--", label="99% CI")
+        plt.plot(Z, -2.9 * np.ones_like(Z), "r|", mew=2, label="inducing points")
+        plt.ylim([-3, 3])
+        plt.legend(loc=0)
+        plt.show()
+
 
 if __name__ == "__main__":
     unittest.main()
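Not part of the patch, but a possible way to compare the two approximations numerically rather than visually, reusing only the API exercised by the tests above; the grid size, seed and inducing-set size are arbitrary::

    import numpy as np
    from smt.surrogate_models import SGP

    def f_obj(x):
        return np.sin(3 * np.pi * x) + 0.3 * np.cos(9 * np.pi * x) + 0.5 * np.sin(7 * np.pi * x)

    rng = np.random.RandomState(0)
    nt, eta2 = 200, [0.01]
    xt = 2 * rng.rand(nt, 1) - 1
    yt = f_obj(xt) + rng.normal(scale=np.sqrt(eta2), size=(nt, 1))
    Z = xt[rng.permutation(nt)[:30]].copy()

    x = np.linspace(-1, 1, 201).reshape(-1, 1)
    for method in ["FITC", "VFE"]:
        sgp = SGP(noise0=eta2, method=method)
        sgp.set_training_values(xt, yt)
        sgp.set_inducing_inputs(Z=Z)
        sgp.train()
        rmse = float(np.sqrt(np.mean((sgp.predict_values(x) - f_obj(x)) ** 2)))
        print(f"{method}: RMSE = {rmse:.4f}")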