From 56c9d4ecefc74ca5cabff309f52bcb492b28279b Mon Sep 17 00:00:00 2001
From: vcarlier <105044741+vcarlier@users.noreply.github.com>
Date: Thu, 7 Sep 2023 15:39:40 +0200
Subject: [PATCH] Add kernels for various linear algebra operations (#310)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add kernels for:
- inner product of two `StencilVector` objects belonging to the same
space (still called `dot` for now);
- `axpy` operation `y = a * x + y` of `StencilVectorSpace`, where
`(x, y)` are `StencilVector` objects and `a` is scalar;
- Matrix-vector product of `StencilMatrix` (called `dot`).

---------

Co-authored-by: Yaman Güçlü <yaman.guclu@gmail.com>
Co-authored-by: tomcaruso <tomarthur,caruso@gmail,com>
---
 psydac/api/tests/test_equation.py          |   2 +-
 psydac/linalg/basic.py                     |  33 +
 psydac/linalg/block.py                     |  29 +
 psydac/linalg/kernels/axpy_kernels.py      |  61 ++
 psydac/linalg/kernels/inner_kernels.py     | 100 +++
 psydac/linalg/kernels/matvec_kernels.py    | 210 ++++++
 psydac/linalg/solvers.py                   | 260 +++----
 psydac/linalg/stencil.py                   | 767 +++++++++++----------
 psydac/linalg/tests/test_block.py          |  81 ++-
 psydac/linalg/tests/test_stencil_matrix.py |   6 +-
 psydac/linalg/tests/test_stencil_vector.py |  35 +-
 psydac/polar/dense.py                      |   4 +
 12 files changed, 1027 insertions(+), 561 deletions(-)
 create mode 100644 psydac/linalg/kernels/axpy_kernels.py
 create mode 100644 psydac/linalg/kernels/inner_kernels.py
 create mode 100644 psydac/linalg/kernels/matvec_kernels.py

diff --git a/psydac/api/tests/test_equation.py b/psydac/api/tests/test_equation.py
index 9fe171665..eebfc6ecd 100644
--- a/psydac/api/tests/test_equation.py
+++ b/psydac/api/tests/test_equation.py
@@ -58,4 +58,4 @@ def test_field_and_constant(backend):
     xh = equation_h.solve(c=c_value, f=fh)
 
     # Verify that solution is equal to c_value
-    assert np.allclose(xh.coeffs.toarray(), c_value, rtol=1e-10, atol=1e-16)
+    assert np.allclose(xh.coeffs.toarray(), c_value, rtol=1e-9, atol=1e-16)
diff --git a/psydac/linalg/basic.py b/psydac/linalg/basic.py
index 70e4281c1..6a545c601 100644
--- a/psydac/linalg/basic.py
+++ b/psydac/linalg/basic.py
@@ -52,6 +52,25 @@ def dot(self, a, b):
 
         """
 
+    @abstractmethod
+    def axpy(self, a, x, y):
+        """
+        Increment the vector y with the a-scaled vector x, i.e. y = a * x + y,
+        provided that x and y belong to the same vector space V (self).
+        The scalar value a may be real or complex, depending on the field of V.
+
+        Parameters
+        ----------
+        a : scalar
+            The scaling coefficient needed for the operation.
+
+        x : Vector
+            The vector which is not modified by this function.
+
+        y : Vector
+            The vector modified by this function (incremented by a * x).
+        """
+
 #===============================================================================
 class Vector(ABC):
     """
@@ -76,6 +95,20 @@ def dot(self, other):
         assert self.space is other.space
         return self.space.dot(self, other)
 
+    def mul_iadd(self, a, x):
+        """
+        Compute self += a * x, where x is another vector of the same space.
+
+        Parameters
+        ----------
+        a : scalar
+            Rescaling coefficient, which can be cast to the correct dtype.
+
+        x : Vector
+            Vector belonging to the same space as self.
+        """
+        self.space.axpy(a, x, self)
+
     #-------------------------------------
     # Deferred methods
     #-------------------------------------
diff --git a/psydac/linalg/block.py b/psydac/linalg/block.py
index 702aa866e..fb2737d56 100644
--- a/psydac/linalg/block.py
+++ b/psydac/linalg/block.py
@@ -85,6 +85,35 @@ def zeros(self):
         """
         return BlockVector(self, [Vi.zeros() for Vi in self._spaces])
 
+    #...
+    def axpy(self, a, x, y):
+        """
+        Increment the vector y with the a-scaled vector x, i.e. y = a * x + y,
+        provided that x and y belong to the same vector space V (self).
+        The scalar value a may be real or complex, depending on the field of V.
+
+        Parameters
+        ----------
+        a : scalar
+            The scaling coefficient needed for the operation.
+
+        x : BlockVector
+            The vector which is not modified by this function.
+
+        y : BlockVector
+            The vector modified by this function (incremented by a * x).
+        """
+        assert isinstance(x, BlockVector)
+        assert isinstance(y, BlockVector)
+        assert x.space is self
+        assert y.space is self
+
+        for Vi, xi, yi in zip(self.spaces, x.blocks, y.blocks):
+            Vi.axpy(a, xi, yi)
+
+        # Only y is written to: update its sync flag and leave x untouched.
+        y._sync = x._sync and y._sync
+
     #--------------------------------------
     # Other properties/methods
     #--------------------------------------
diff --git a/psydac/linalg/kernels/axpy_kernels.py b/psydac/linalg/kernels/axpy_kernels.py
new file mode 100644
index 000000000..9761aa9a1
--- /dev/null
+++ b/psydac/linalg/kernels/axpy_kernels.py
@@ -0,0 +1,61 @@
+from pyccel.decorators import template
+
+#========================================================================================================
+@template(name='Tarray', types=['float[:]', 'complex[:]'])
+@template(name='T', types=['float', 'complex'])
+def axpy_1d(alpha: 'T', x: "Tarray", y: "Tarray"):
+    """
+    Kernel for computing y = alpha * x + y.
+
+    Parameters
+    ----------
+    alpha : float | complex
+        Scaling coefficient.
+
+    x, y : 1D Numpy arrays of (float | complex) data
+        Data of the vectors.
+    """
+    n1, = x.shape
+    for i1 in range(n1):
+        y[i1] += alpha * x[i1]
+
+#========================================================================================================
+@template(name='Tarray', types=['float[:,:]', 'complex[:,:]'])
+@template(name='T', types=['float', 'complex'])
+def axpy_2d(alpha: 'T', x: "Tarray", y: "Tarray"):
+    """
+    Kernel for computing y = alpha * x + y.
+
+    Parameters
+    ----------
+    alpha : float | complex
+        Scaling coefficient.
+
+    x, y : 2D Numpy arrays of (float | complex) data
+        Data of the vectors.
+    """
+    n1, n2 = x.shape
+    for i1 in range(n1):
+        for i2 in range(n2):
+            y[i1, i2] += alpha * x[i1, i2]
+
+#========================================================================================================
+@template(name='Tarray', types=['float[:,:,:]', 'complex[:,:,:]'])
+@template(name='T', types=['float', 'complex'])
+def axpy_3d(alpha: 'T', x: "Tarray", y: "Tarray"):
+    """
+    Kernel for computing y = alpha * x + y.
+
+    Parameters
+    ----------
+    alpha : float | complex
+        Scaling coefficient.
+
+    x, y : 3D Numpy arrays of (float | complex) data
+        Data of the vectors.
+    """
+    n1, n2, n3 = x.shape
+    for i1 in range(n1):
+        for i2 in range(n2):
+            for i3 in range(n3):
+                y[i1, i2, i3] += alpha * x[i1, i2, i3]
diff --git a/psydac/linalg/kernels/inner_kernels.py b/psydac/linalg/kernels/inner_kernels.py
new file mode 100644
index 000000000..efb5a6ccb
--- /dev/null
+++ b/psydac/linalg/kernels/inner_kernels.py
@@ -0,0 +1,100 @@
+from pyccel.decorators import template
+
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
+#!!!!!!!!!!!!!!!!!!! WARNING !!!!!!!!!!!!!!!!!!!#
+#!!!!!!! Conjugate on the first argument !!!!!!!#
+#!!!!!!!!!! This will need an update !!!!!!!!!!!#
+#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
+
+#==============================================================================
+@template(name='T', types=['float[:]', 'complex[:]'])
+def inner_1d(v1: 'T', v2: 'T', nghost0: 'int64'):
+    """
+    Kernel for computing the inner product (case of two 1D vectors).
+
+    Parameters
+    ----------
+    v1, v2 : 1D NumPy array
+        Data of the vectors from which we are computing the inner product.
+
+    nghost0 : int
+        Number of ghost cells of the arrays along the index 0.
+
+    Returns
+    -------
+    res : scalar
+        Scalar (real or complex) containing the result of the inner product.
+    """
+    shape0, = v1.shape
+
+    res = v1[0] - v1[0]
+    for i0 in range(nghost0, shape0 - nghost0):
+        res += v1[i0].conjugate() * v2[i0]
+
+    return res
+
+#==============================================================================
+@template(name='T', types=['float[:,:]', 'complex[:,:]'])
+def inner_2d(v1: 'T', v2: 'T', nghost0: 'int64', nghost1: 'int64'):
+    """
+    Kernel for computing the inner product (case of two 2D vectors).
+
+    Parameters
+    ----------
+    v1, v2 : 2D NumPy array
+        Data of the vectors from which we are computing the inner product.
+
+    nghost0 : int
+        Number of ghost cells of the arrays along the index 0.
+
+    nghost1 : int
+        Number of ghost cells of the arrays along the index 1.
+
+    Returns
+    -------
+    res : scalar
+        Scalar (real or complex) containing the result of the inner product.
+    """
+    shape0, shape1 = v1.shape
+
+    res = v1[0, 0] - v1[0, 0]
+    for i0 in range(nghost0, shape0 - nghost0):
+        for i1 in range(nghost1, shape1 - nghost1):
+            res += v1[i0, i1].conjugate() * v2[i0, i1]
+
+    return res
+
+#==============================================================================
+@template(name='T', types=['float[:,:,:]', 'complex[:,:,:]'])
+def inner_3d(v1: 'T', v2: 'T', nghost0: 'int64', nghost1: 'int64', nghost2: 'int64'):
+    """
+    Kernel for computing the inner product (case of two 3D vectors).
+
+    Parameters
+    ----------
+    v1, v2 : 3D NumPy array
+        Data of the vectors from which we are computing the inner product.
+
+    nghost0 : int
+        Number of ghost cells of the arrays along the index 0.
+
+    nghost1 : int
+        Number of ghost cells of the arrays along the index 1.
+
+    nghost2 : int
+        Number of ghost cells of the arrays along the index 2.
+
+    Returns
+    -------
+    res : scalar
+        Scalar (real or complex) containing the result of the inner product.
+    """
+    shape0, shape1, shape2 = v1.shape
+
+    res = v1[0, 0, 0] - v1[0, 0, 0]
+    for i0 in range(nghost0, shape0 - nghost0):
+        for i1 in range(nghost1, shape1 - nghost1):
+            for i2 in range(nghost2, shape2 - nghost2):
+                res += v1[i0, i1, i2].conjugate() * v2[i0, i1, i2]
+
+    return res
diff --git a/psydac/linalg/kernels/matvec_kernels.py b/psydac/linalg/kernels/matvec_kernels.py
new file mode 100644
index 000000000..758ce0b8d
--- /dev/null
+++ b/psydac/linalg/kernels/matvec_kernels.py
@@ -0,0 +1,210 @@
+from pyccel.decorators import template
+
+
+@template(name='T', types=['float[:]', 'complex[:]'])
+@template(name='Tarray', types=['float[:,:]', 'complex[:,:]'])
+def matvec_1d(mat00:'Tarray', x0:'T', out0:'T', starts: 'int64[:]', nrows: 'int64[:]', nrows_extra: 'int64[:]',
+                  dm:'int64[:]', cm:'int64[:]', pad_imp:'int64[:]', ndiags:'int64[:]', gpads: 'int64[:]'):
+
+    nrows1   = nrows[0]
+    dstart1  = starts[0]
+    dshift1  = dm[0]
+    cshift1  = cm[0]
+    ndiags1  = ndiags[0]
+    dpads1   = gpads[0]
+    pad_imp1 = pad_imp[0]
+
+    pxm1 = dpads1 * cshift1
+
+    start_impact1 = dstart1 % dshift1
+
+    v00 = mat00[0, 0] - mat00[0, 0] + x0[0] - x0[0]
+
+    for i1 in range(nrows1):
+        v00 *= 0
+        x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+        for k1 in range(ndiags1):
+            v00 += mat00[pxm1 + i1, k1] * x0[k1 + x_min1]
+        out0[pxm1 + i1] = v00
+
+    if 0 < nrows_extra[0]:
+        pxm1          += nrows1
+        start_impact1 += nrows1
+        for i1 in range(nrows_extra[0]):
+            v00 *= 0
+            x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+            for k1 in range(ndiags1 - i1 - 1):
+                v00 += mat00[pxm1 + i1, k1] * x0[x_min1 + k1]
+            out0[pxm1 + i1] = v00
+
+
+@template(name='T', types=['float[:,:]', 'complex[:,:]'])
+@template(name='Tarray', types=['float[:,:,:,:]', 'complex[:,:,:,:]'])
+def matvec_2d(mat00:'Tarray', x0:'T', out0:'T', starts:'int64[:]', nrows:'int64[:]', nrows_extra:'int64[:]',
+                  dm:'int64[:]', cm:'int64[:]', pad_imp:'int64[:]', ndiags:'int64[:]', gpads: 'int64[:]'):
+
+    nrows1   = nrows[0]
+    nrows2   = nrows[1]
+    dstart1  = starts[0]
+    dstart2  = starts[1]
+    dshift1  = dm[0]
+    dshift2  = dm[1]
+    cshift1  = cm[0]
+    cshift2  = cm[1]
+    ndiags1  = ndiags[0]
+    ndiags2  = ndiags[1]
+    dpads1   = gpads[0]
+    dpads2   = gpads[1]
+    pad_imp1 = pad_imp[0]
+    pad_imp2 = pad_imp[1]
+
+    pxm1 = dpads1 * cshift1
+    pxm2 = dpads2 * cshift2
+
+    start_impact1 = dstart1 % dshift1
+    start_impact2 = dstart2 % dshift2
+
+    v00 = mat00[0, 0, 0, 0] - mat00[0, 0, 0, 0] + x0[0, 0] - x0[0, 0]
+
+    for i1 in range(nrows1):
+        for i2 in range(nrows2):
+            v00 *= 0
+            x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+            x_min2 = pad_imp2 + (i2 + start_impact2) // cshift2 * dshift2
+            for k1 in range(ndiags1):
+                for k2 in range(ndiags2):
+                    v00 += mat00[pxm1 + i1, pxm2 + i2, k1, k2] * x0[k1 + x_min1, k2 + x_min2]
+            out0[pxm1 + i1, pxm2 + i2] = v00
+
+    if 0 < nrows_extra[0]:
+        pxm1          += nrows1
+        start_impact1 += nrows1
+        for i1 in range(nrows_extra[0]):
+            for i2 in range(nrows2):
+                v00 *= 0
+                x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+                x_min2 = pad_imp2 + (i2 + start_impact2) // cshift2 * dshift2
+                for k1 in range(ndiags1 - i1 - 1):
+                    for k2 in range(ndiags2):
+                        v00 += mat00[pxm1 + i1, pxm2 + i2, k1, k2] * x0[x_min1 + k1, x_min2 + k2]
+                out0[pxm1 + i1,  pxm2 + i2] = v00
+
+    if 0 < nrows_extra[1]:
+        pxm1           = dpads1  * cshift1
+        start_impact1  = dstart1 % dshift1
+        pxm2          += nrows2
+        start_impact2 += nrows2
+        for i1 in range(nrows1 + nrows_extra[0]):
+            for i2 in range(nrows_extra[1]):
+                v00 *= 0
+                x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+                x_min2 = pad_imp2 + (i2 + start_impact2) // cshift2 * dshift2
+                for k1 in range(ndiags1 - max(0, i1 + 1 - nrows1)):
+                    for k2 in range(ndiags2 - i2 - 1):
+                        v00 += mat00[pxm1 + i1, pxm2 + i2, k1, k2] * x0[x_min1 + k1, x_min2 + k2]
+                out0[pxm1 + i1, pxm2 + i2] = v00
+
+
+@template(name='T', types=['float[:,:,:]', 'complex[:,:,:]'])
+@template(name='Tarray', types=['float[:,:,:,:,:,:]', 'complex[:,:,:,:,:,:]'])
+def matvec_3d(mat00:'Tarray', x0:'T', out0:'T', starts:'int64[:]', nrows:'int64[:]', nrows_extra:'int64[:]',
+                  dm:'int64[:]', cm:'int64[:]', pad_imp:'int64[:]', ndiags:'int64[:]', gpads: 'int64[:]'):
+
+    nrows1   = nrows[0]
+    nrows2   = nrows[1]
+    nrows3   = nrows[2]
+    dstart1  = starts[0]
+    dstart2  = starts[1]
+    dstart3  = starts[2]
+    dshift1  = dm[0]
+    dshift2  = dm[1]
+    dshift3  = dm[2]
+    cshift1  = cm[0]
+    cshift2  = cm[1]
+    cshift3  = cm[2]
+    ndiags1  = ndiags[0]
+    ndiags2  = ndiags[1]
+    ndiags3  = ndiags[2]
+    dpads1   = gpads[0]
+    dpads2   = gpads[1]
+    dpads3   = gpads[2]
+    pad_imp1 = pad_imp[0]
+    pad_imp2 = pad_imp[1]
+    pad_imp3 = pad_imp[2]
+
+    pxm1 = dpads1 * cshift1
+    pxm2 = dpads2 * cshift2
+    pxm3 = dpads3 * cshift3
+
+    start_impact1 = dstart1 % dshift1
+    start_impact2 = dstart2 % dshift2
+    start_impact3 = dstart3 % dshift3
+
+    v00 = mat00[0, 0, 0, 0, 0, 0] - mat00[0, 0, 0, 0, 0, 0] + x0[0, 0, 0] - x0[0, 0, 0]
+
+    for i1 in range(nrows1):
+        for i2 in range(nrows2):
+            for i3 in range(nrows3):
+                v00 *= 0
+                x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+                x_min2 = pad_imp2 + (i2 + start_impact2) // cshift2 * dshift2
+                x_min3 = pad_imp3 + (i3 + start_impact3) // cshift3 * dshift3
+                for k1 in range(ndiags1):
+                    for k2 in range(ndiags2):
+                        for k3 in range(ndiags3):
+                            v00 += mat00[pxm1 + i1, pxm2 + i2, pxm3 + i3, k1, k2, k3] * x0[k1 + x_min1, k2 + x_min2, k3 + x_min3]
+                out0[pxm1 + i1, pxm2 + i2, pxm3 + i3] = v00
+
+    if 0 < nrows_extra[0]:
+        pxm1 += nrows1
+        start_impact1 += nrows1
+        for i1 in range(nrows_extra[0]):
+            for i2 in range(nrows2):
+                for i3 in range(nrows3):
+                    v00 *= 0
+                    x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+                    x_min2 = pad_imp2 + (i2 + start_impact2) // cshift2 * dshift2
+                    x_min3 = pad_imp3 + (i3 + start_impact3) // cshift3 * dshift3
+                    for k1 in range(ndiags1 - i1 - 1):
+                        for k2 in range(ndiags2):
+                            for k3 in range(ndiags3):
+                                v00 += mat00[pxm1 + i1, pxm2 + i2, pxm3 + i3, k1, k2, k3] *  x0[x_min1 + k1, x_min2 + k2, x_min3 + k3]
+                    out0[pxm1 + i1,  pxm2 + i2,  pxm3 + i3] = v00
+
+    if 0 < nrows_extra[1]:
+        pxm1           = dpads1  * cshift1
+        start_impact1  = dstart1 % dshift1
+        pxm2          += nrows2
+        start_impact2 += nrows2
+        for i1 in range(nrows1 + nrows_extra[0]):
+            for i2 in range(nrows_extra[1]):
+                for i3 in range(nrows3):
+                    v00 *= 0
+                    x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+                    x_min2 = pad_imp2 + (i2 + start_impact2) // cshift2 * dshift2
+                    x_min3 = pad_imp3 + (i3 + start_impact3) // cshift3 * dshift3
+                    for k1 in range(ndiags1 - max(0, i1 + 1 - nrows1)):
+                        for k2 in range(ndiags2 - i2 - 1):
+                            for k3 in range(ndiags3):
+                                v00 += mat00[pxm1 + i1, pxm2 + i2, pxm3 + i3, k1, k2, k3] * x0[x_min1 + k1, x_min2 + k2, x_min3 + k3]
+                    out0[pxm1 + i1, pxm2 + i2, pxm3 + i3] = v00
+
+    if 0 < nrows_extra[2]:
+        pxm1           = dpads1  * cshift1
+        pxm2           = dpads2  * cshift2
+        start_impact1  = dstart1 % dshift1
+        start_impact2  = dstart2 % dshift2
+        pxm3          += nrows3
+        start_impact3 += nrows3
+        for i1 in range(nrows1 + nrows_extra[0]):
+            for i2 in range(nrows2 + nrows_extra[1]):
+                for i3 in range(nrows_extra[2]):
+                    v00 *= 0
+                    x_min1 = pad_imp1 + (i1 + start_impact1) // cshift1 * dshift1
+                    x_min2 = pad_imp2 + (i2 + start_impact2) // cshift2 * dshift2
+                    x_min3 = pad_imp3 + (i3 + start_impact3) // cshift3 * dshift3
+                    for k1 in range(ndiags1 - max(0, i1 + 1 - nrows1)):
+                        for k2 in range(ndiags2 - max(0, i2 + 1 - nrows2)):
+                            for k3 in range(ndiags3 - i3 - 1):
+                                v00 += mat00[pxm1 + i1, pxm2 + i2, pxm3 + i3, k1, k2, k3] * x0[x_min1 + k1, x_min2 + k2, x_min3 + k3]
+                    out0[pxm1 + i1, pxm2 + i2, pxm3 + i3] = v00
diff --git a/psydac/linalg/solvers.py b/psydac/linalg/solvers.py
index 58a7d6741..35ea01366 100644
--- a/psydac/linalg/solvers.py
+++ b/psydac/linalg/solvers.py
@@ -125,7 +125,7 @@ def __init__(self, A, *, x0=None, tol=1e-6, maxiter=1000, verbose=False):
         self._solver = 'cg'
         self._options = {"x0":x0, "tol":tol, "maxiter":maxiter, "verbose":verbose}
         self._check_options(**self._options)
-        self._tmps = {key: domain.zeros() for key in ("v", "r", "p", "lp", "lv")}
+        self._tmps = {key: domain.zeros() for key in ("v", "r", "p")}
         self._info = None
 
     def _check_options(self, **kwargs):
@@ -205,9 +205,6 @@ def solve(self, b, out=None):
         v = self._tmps["v"]
         r = self._tmps["r"]
         p = self._tmps["p"]
-        # Not strictly needed by the conjugate gradient, but necessary to avoid temporaries
-        lp = self._tmps["lp"]
-        lv = self._tmps["lv"]
 
         # First values
         A.dot(x, out=v)
@@ -233,12 +230,10 @@ def solve(self, b, out=None):
                 break
             A.dot(p, out=v)
             l   = am / v.dot(p)
-            p.copy(out=lp)
-            lp *= l
-            x  += lp # this was x += l*p
-            v.copy(out=lv)
-            lv *= l
-            r  -= lv # this was r -= l*v
+
+            x.mul_iadd(l, p)  # this is x += l*p
+            r.mul_iadd(-l, v) # this is r -= l*v
+
             am1 = r.dot(r).real
             p  *= (am1/am)
             p  += r
@@ -313,8 +308,8 @@ def __init__(self, A, *, pc='jacobi', x0=None, tol=1e-6, maxiter=1000, verbose=F
         self._solver = 'pcg'
         self._options = {"x0":x0, "pc":pc, "tol":tol, "maxiter":maxiter, "verbose":verbose}
         self._check_options(**self._options)
-        tmps_codomain = {key: codomain.zeros() for key in ("p", "s", "lp")}
-        tmps_domain = {key: domain.zeros() for key in ("v", "r", "lv")}
+        tmps_codomain = {key: codomain.zeros() for key in ("p", "s")}
+        tmps_domain = {key: domain.zeros() for key in ("v", "r")}
         self._tmps = {**tmps_codomain, **tmps_domain}
         self._info = None
 
@@ -409,9 +404,6 @@ def solve(self, b, out=None):
         r = self._tmps["r"]
         p = self._tmps["p"]
         s = self._tmps["s"]
-        # Not strictly needed by the conjugate gradient, but necessary to avoid temporaries
-        lp = self._tmps["lp"]
-        lv = self._tmps["lv"]
 
         # First values
         A.dot(x, out=v)
@@ -441,19 +433,19 @@ def solve(self, b, out=None):
 
             v  = A.dot(p, out=v)
             l  = am / v.dot(p)
-            p.copy(out=lp)
-            lp *= l
-            x  += lp # this was x += l*p
-            v.copy(out=lv)
-            lv *= l
-            r  -= lv # this was r -= l*v
+
+            x.mul_iadd(l, p) # this is x += l*p
+            r.mul_iadd(-l, v) # this is r -= l*v
 
             nrmr_sqr = r.dot(r).real
             psolve(r, out=s)
 
             am1 = s.dot(r)
-            p  *= (am1/am)
-            p  += s
+
+            # we are computing p = (am1 / am) * p + s by using axpy on s and exchanging the arrays
+            s.mul_iadd((am1/am), p)
+            s, p = p, s
+
             am  = am1
 
             if verbose:
@@ -640,8 +632,6 @@ def solve(self, b, out=None):
             #-----------------------
             A.dot(p, out=v)
             Ah.dot(ps, out=vs)
-            #v  = A.dot(p , out=v) # overwriting v, then saving in v. Necessary?
-            #vs = At.dot(ps, out=vs) # same story
             #-----------------------
 
             # c := (rs, r)
@@ -654,32 +644,29 @@ def solve(self, b, out=None):
             # SOLUTION UPDATE
             #-----------------------
             # x := x + a*p
-            p *= a
-            x += p
+            x.mul_iadd(a, p)
             #-----------------------
 
             # r := r - a*v
-            v *= a
-            r -= v
+            r.mul_iadd(-a, v)
 
             # rs := rs - conj(a)*vs
-            vs *= a.conj()
-            rs -= vs
+            rs.mul_iadd(-a.conjugate(), vs)
+
+            # ||r||_2 := (r, r)
+            res_sqr = r.dot(r).real
 
             # b := (rs, r)_{m+1} / (rs, r)_m
             b = rs.dot(r) / c
 
             # p := r + b*p
-            p *= (b/a) # *= (b/a) why a? or update description
+            p *= b
             p += r
 
             # ps := rs + conj(b)*ps
             ps *= b.conj()
             ps += rs
 
-            # ||r||_2 := (r, r)
-            res_sqr = r.dot(r).real
-
             if verbose:
                 print( template.format(m, sqrt(res_sqr)) )
 
@@ -746,7 +733,7 @@ def __init__(self, A, *, x0=None, tol=1e-6, maxiter=1000, verbose=False):
         self._solver = 'bicgstab'
         self._options = {"x0": x0, "tol": tol, "maxiter": maxiter, "verbose": verbose}
         self._check_options(**self._options)
-        self._tmps = {key: domain.zeros() for key in ("v", "r", "p", "vs", "r0", "s")}
+        self._tmps = {key: domain.zeros() for key in ("v", "r", "p", "vr", "r0")}
         self._info = None
 
     def _check_options(self, **kwargs):
@@ -834,9 +821,8 @@ def solve(self, b, out=None):
         v = self._tmps["v"]
         r = self._tmps["r"]
         p = self._tmps["p"]
-        vs = self._tmps["vs"]
+        vr = self._tmps["vr"]
         r0 = self._tmps["r0"]
-        s = self._tmps["s"]
 
         # First values
         A.dot(x, out=v)
@@ -845,11 +831,9 @@ def solve(self, b, out=None):
         #r = b - A.dot(x)
         r.copy(out=p)
         v *= 0.0
-        vs *= 0.0
+        vr *= 0.0
 
         r.copy(out=r0)
-        r.copy(out=s)
-        s *= 0.0
 
         res_sqr = r.dot(r).real
         tol_sqr = tol ** 2
@@ -880,34 +864,25 @@ def solve(self, b, out=None):
             # a := (r0, r) / (r0, v)
             a = c / (r0.dot(v))
 
-            # s := r - a*v
-            s *= 0
-            v *= a
-            s += r
-            s -= v
+            # r := r - a*v
+            r.mul_iadd(-a, v)
 
-            # vs :=  A*s
-            vs = A.dot(s, out=vs)
+            # vr :=  A*r
+            vr = A.dot(r, out=vr)
 
-            # w := (s, A*s) / (A*s, A*s)
-            w = s.dot(vs) / vs.dot(vs)
+            # w := (r, A*r) / (A*r, A*r)
+            w = r.dot(vr) / vr.dot(vr)
 
             # -----------------------
             # SOLUTION UPDATE
             # -----------------------
-            # x := x + a*p +w*s
-            p *= a
-            s *= w
-            x += p
-            x += s
+            # x := x + a*p +w*r
+            x.mul_iadd(a, p)
+            x.mul_iadd(w, r)
             # -----------------------
 
-            # r := s - w*vs
-            vs *= w
-            s *= 1 / w
-            r *= 0
-            r += s
-            r -= vs
+            # r := r - w*A*r
+            r.mul_iadd(-w, vr)
 
             # ||r||_2 := (r, r)
             res_sqr = r.dot(r).real
@@ -919,10 +894,9 @@ def solve(self, b, out=None):
             b = r0.dot(r) * a / (c * w)
 
             # p := r + b*p- b*w*v
-            v *= (b * w / a)
-            p *= (b / a)
-            p -= v
+            p *= b
             p += r
+            p.mul_iadd(-b * w, v)
 
             if verbose:
                 print(template.format(m, sqrt(res_sqr)))
@@ -1001,8 +975,7 @@ def __init__(self, A, *, x0=None, tol=1e-6, maxiter=1000, verbose=False):
         self._solver = 'minres'
         self._options = {"x0":x0, "tol":tol, "maxiter":maxiter, "verbose":verbose}
         self._check_options(**self._options)
-        self._tmps = {key: domain.zeros() for key in ("res1", "res2", "w", "w2", "yc",
-                      "v", "resc", "res2c", "ycc", "res1c", "wc", "w2c")}
+        self._tmps = {key: domain.zeros() for key in ("res_old", "res_new", "w_new", "w_work", "w_old", "v", "y")}
         self._info = None
 
     def _check_options(self, **kwargs):
@@ -1092,40 +1065,30 @@ def solve(self, b, out=None):
 
         # Extract local storage
         v = self._tmps["v"]
-        w = self._tmps["w"]
-        w2 = self._tmps["w2"]
-        res1 = self._tmps["res1"]
-        res2 = self._tmps["res2"]
-        # auxiliary to minimzize temps, optimal solution until proven wrong
-        wc = self._tmps["wc"]
-        w2c = self._tmps["w2c"]
-        yc = self._tmps["yc"]
-        ycc = self._tmps["ycc"]
-        resc = self._tmps["resc"]
-        res1c = self._tmps["res1c"]
-        res2c = self._tmps["res2c"]
+        y = self._tmps["y"]
+        w_new = self._tmps["w_new"]
+        w_work = self._tmps["w_work"]
+        w_old = self._tmps["w_old"]
+        res_old = self._tmps["res_old"]
+        res_new = self._tmps["res_new"]
 
         istop = 0
         itn   = 0
-        Anorm = 0
-        Acond = 0
         rnorm = 0
-        ynorm = 0
 
         eps = np.finfo(b.dtype).eps
 
-        A.dot(x, out=res1)
-        res1 -= b
-        res1 *= -1.0
-        y  = res1
+        A.dot(x, out=y)
+        y -= b
+        y *= -1.0
+        y.copy(out=res_old)
 
-        beta = sqrt(res1.dot(res1))
+        beta = sqrt(res_old.dot(res_old))
 
         # Initialize other quantities
         oldb    = 0
         dbar    = 0
         epsln   = 0
-        qrnorm  = beta
         phibar  = beta
         rhs1    = beta
         rhs2    = 0
@@ -1134,11 +1097,10 @@ def solve(self, b, out=None):
         gmin    = np.finfo(b.dtype).max
         cs      = -1
         sn      = 0
-        b.copy(out=w)
-        w *= 0.0
-        b.copy(out=w2)
-        w2 *= 0.0
-        res1.copy(out=res2)
+        w_new  *= 0.0
+        w_work *= 0.0
+        w_old *= 0.0
+        res_old.copy(out=res_new)
 
         if verbose:
             print( "MINRES solver:" )
@@ -1153,28 +1115,20 @@ def solve(self, b, out=None):
             s = 1.0/beta
             y.copy(out=v)
             v *= s
-            A.dot(v, out=yc)
-            y = yc
+            A.dot(v, out=y)
 
             if itn >= 2:
-                res1 *= (beta/oldb)
-                y -= res1
+                y.mul_iadd(-(beta/oldb), res_old)
 
             alfa = v.dot(y)
-            res1 = res2
-
-            res2.copy(out=resc)
-            resc *= (alfa/beta)
-            y.copy(out=ycc)
-            ycc -= resc
-            y = ycc
-            res1.copy(out=res1c)
-            res1 = res1c
-            y.copy(out=res2c)
-            res2 = res2c
+            y.mul_iadd(-(alfa/beta), res_new)
+
+            # We put res_new in res_old and y in res_new
+            res_new, res_old = res_old, res_new
+            y.copy(out=res_new)
 
             oldb = beta
-            beta = sqrt(y.dot(y))
+            beta = sqrt(res_new.dot(res_new))
             tnorm2 += alfa**2 + oldb**2 + beta**2
 
             # Apply previous rotation Qk-1 to get
@@ -1187,7 +1141,6 @@ def solve(self, b, out=None):
             epsln  = sn * beta                  # epsln2 = 0         epslnk+1
             dbar   = - cs * beta                # dbar 2 = beta2     dbar k+1
             root   = sqrt(gbar**2 + dbar**2)
-            Arnorm = phibar * root
 
             # Compute the next plane rotation Qk
 
@@ -1199,24 +1152,17 @@ def solve(self, b, out=None):
             phibar = sn * phibar                 # phibark+1
 
             # Update  x.
-
             denom = 1.0/gamma
-            w1    = w2
-            w2    = w
-
-            w1.copy(out=yc)
-            yc *= oldeps
-            w2.copy(out=w2c)
-            w2.copy(out=wc)
-            w = wc
-            w2 = w2c
-            w *= delta
-            w += yc
-            w -= v
-            w *= -denom
-            w.copy(out=yc)
-            yc *= phi
-            x += yc
+
+            # We put w_old in w_work and w_new in w_old
+            w_work, w_old = w_old, w_work
+            w_new.copy(out=w_old)
+
+            w_new *= delta
+            w_new.mul_iadd(oldeps, w_work)
+            w_new -= v
+            w_new *= -denom
+            x.mul_iadd(phi, w_new)
 
             # Go round again.
 
@@ -1230,12 +1176,6 @@ def solve(self, b, out=None):
 
             Anorm = sqrt(tnorm2)
             ynorm = sqrt(x.dot(x))
-            epsa  = Anorm * eps
-            epsx  = Anorm * ynorm * eps
-            epsr  = Anorm * ynorm * tol
-            diag  = gbar
-
-            if diag == 0:diag = epsa
 
             rnorm  = phibar
             if ynorm == 0 or Anorm == 0:test1 = inf
@@ -1360,8 +1300,8 @@ def __init__(self, A, *, x0=None, tol=None, atol=None, btol=None, maxiter=1000,
         self._check_options(**self._options)
         self._info = None
         self._successful = None
-        tmps_domain = {key: domain.zeros() for key in ("uh", "uc")}
-        tmps_codomain = {key: codomain.zeros() for key in ("v", "vh", "h", "hbar")}
+        tmps_domain = {key: domain.zeros() for key in ("u", "u_work")}
+        tmps_codomain = {key: codomain.zeros() for key in ("v", "v_work", "h", "hbar")}
         self._tmps = {**tmps_codomain, **tmps_domain}
 
     def get_success(self):
@@ -1463,13 +1403,13 @@ def solve(self, b, out=None):
         x = x0.copy(out=out)
 
         # Extract local storage
+        u = self._tmps["u"]
         v = self._tmps["v"]
         h = self._tmps["h"]
         hbar = self._tmps["hbar"]
         # Not strictly needed by the LSMR, but necessary to avoid temporaries
-        uh = self._tmps["uh"]
-        vh = self._tmps["vh"]
-        uc = self._tmps["uc"]
+        u_work = self._tmps["u_work"]
+        v_work = self._tmps["v_work"]
 
         if atol is None:atol = 1e-6
         if btol is None:btol = 1e-6
@@ -1477,17 +1417,15 @@ def solve(self, b, out=None):
             atol = tol
             btol = tol
 
-        u = b
+        b.copy(out=u)
         normb = sqrt(b.dot(b).real)
 
-        A.dot(x, out=uh)
-        u -= uh
+        A.dot(x, out=u_work)
+        u -= u_work
         beta = sqrt(u.dot(u).real)
 
         if beta > 0:
-            u.copy(out = uc)
-            uc *= (1 / beta)
-            u = uc
+            u *= (1 / beta)
             At.dot(u, out=v)
             alpha = sqrt(v.dot(v).real)
         else:
@@ -1525,9 +1463,6 @@ def solve(self, b, out=None):
         normA2  = alpha * alpha
         maxrbar = 0
         minrbar = 1e+100
-        normA   = sqrt(normA2)
-        condA   = 1
-        normx   = 0
 
         # Items for use in stopping rules, normb set earlier
         istop = 0
@@ -1536,7 +1471,6 @@ def solve(self, b, out=None):
         normr = beta
 
         # Reverse the order here from the original matlab code because
-        normar = alpha * beta
 
         if verbose:
             print( "LSMR solver:" )
@@ -1554,15 +1488,15 @@ def solve(self, b, out=None):
             #        alpha*v  =  A'*u  -  beta*v.
 
             u *= -alpha
-            A.dot(v, out=uh)
-            u += uh
+            A.dot(v, out=u_work)
+            u += u_work
             beta = sqrt(u.dot(u).real)
 
             if beta > 0:
                 u     *= (1 / beta)
                 v     *= -beta
-                At.dot(u, out=vh)
-                v     += vh
+                At.dot(u, out=v_work)
+                v     += v_work
                 alpha = sqrt(v.dot(v).real)
                 if alpha > 0:v *= (1 / alpha)
 
@@ -1594,9 +1528,7 @@ def solve(self, b, out=None):
             hbar *= - (thetabar * rho / (rhoold * rhobarold))
             hbar += h
 
-            hbar.copy(out=uh)
-            uh *= (zeta / (rho * rhobar))
-            x += uh
+            x.mul_iadd((zeta / (rho * rhobar)), hbar)
 
             h    *= - (thetanew / rho)
             h    += v
@@ -1743,7 +1675,7 @@ def __init__(self, A, *, x0=None, tol=1e-6, maxiter=100, verbose=False):
         self._solver = 'gmres'
         self._options = {"x0":x0, "tol":tol, "maxiter":maxiter, "verbose":verbose}
         self._check_options(**self._options) 
-        self._tmps = {key: domain.zeros() for key in ("r", "p", "v", "lv")}
+        self._tmps = {key: domain.zeros() for key in ("r", "p")}
 
         # Initialize upper Hessenberg matrix
         self._H = np.zeros((self._options["maxiter"] + 1, self._options["maxiter"]), dtype=A.dtype)
@@ -1824,7 +1756,7 @@ def solve(self, b, out=None):
         
         # Extract local storage
         r = self._tmps["r"]
-        v = self._tmps["v"]
+        p = self._tmps["p"]
 
         # Internal objects of GMRES
         self._H[:,:] = 0.
@@ -1859,7 +1791,7 @@ def solve(self, b, out=None):
                 break
 
             # run Arnoldi
-            self.arnoldi(k)
+            self.arnoldi(k, p)
 
             # make the last diagonal entry in H equal to 0, so that H becomes upper triangular
             self.apply_givens_rotation(k, sn, cn)
@@ -1878,9 +1810,7 @@ def solve(self, b, out=None):
         y = self.solve_triangular(self._H[:k, :k], beta[:k]) # system of upper triangular matrix
 
         for i in range(k):
-            self._Q[i].copy(out=v)
-            v *= y[i]
-            x += v
+            x.mul_iadd(y[i], self._Q[i])
 
         # Convergence information
         self._info = {'niter': k+1, 'success': am < tol, 'res_norm': am }
@@ -1900,19 +1830,13 @@ def solve_triangular(self, T, d):
         
         return y
 
-    def arnoldi(self, k):
+    def arnoldi(self, k, p):
         h = self._H[:k+2, k]
-
-        p = self._tmps["p"]
         self._A.dot( self._Q[k] , out=p) # Krylov vector
 
-        lv = self._tmps["lv"]
-
         for i in range(k + 1): # Modified Gram-Schmidt, keeping Hessenberg matrix
             h[i] = p.dot(self._Q[i])
-            self._Q[i].copy(out=lv)
-            lv *= h[i]
-            p -= lv
+            p.mul_iadd(-h[i], self._Q[i])
         
         h[k+1] = sqrt(p.dot(p).real)
         p /= h[k+1] # Normalize vector
diff --git a/psydac/linalg/stencil.py b/psydac/linalg/stencil.py
index c2c55d3d6..a083e6db0 100644
--- a/psydac/linalg/stencil.py
+++ b/psydac/linalg/stencil.py
@@ -11,56 +11,83 @@
 from scipy.sparse import coo_matrix
 from mpi4py       import MPI
 
-from psydac.linalg.basic   import VectorSpace, Vector, LinearOperator
-from psydac.ddm.cart       import find_mpi_type, CartDecomposition, InterfaceCartDecomposition
-from psydac.ddm.utilities  import get_data_exchanger
-from psydac.linalg.kernels.transpose_kernels import transpose_1d, transpose_2d, transpose_3d, interface_transpose_1d, interface_transpose_2d, interface_transpose_3d
-from psydac.linalg.kernels.stencil2coo_kernels import stencil2coo_1d_F, stencil2coo_1d_C,stencil2coo_2d_C, stencil2coo_2d_F, stencil2coo_3d_C,stencil2coo_3d_F
-__all__ = ('StencilVectorSpace','StencilVector','StencilMatrix', 'StencilInterfaceMatrix')
+from psydac.linalg.basic  import VectorSpace, Vector, LinearOperator
+from psydac.ddm.cart      import find_mpi_type, CartDecomposition, InterfaceCartDecomposition
+from psydac.ddm.utilities import get_data_exchanger
+from psydac.api.settings  import PSYDAC_BACKENDS
+
+from .kernels.axpy_kernels        import axpy_1d, axpy_2d, axpy_3d
+from .kernels.inner_kernels       import inner_1d, inner_2d, inner_3d
+from .kernels.matvec_kernels      import matvec_1d, matvec_2d, matvec_3d
+from .kernels.transpose_kernels   import transpose_1d, transpose_2d, transpose_3d
+from .kernels.transpose_kernels   import interface_transpose_1d, interface_transpose_2d, interface_transpose_3d
+from .kernels.stencil2coo_kernels import stencil2coo_1d_F, stencil2coo_2d_F, stencil2coo_3d_F
+from .kernels.stencil2coo_kernels import stencil2coo_1d_C, stencil2coo_2d_C, stencil2coo_3d_C
+
+
+__all__ = (
+    'StencilVectorSpace',
+    'StencilVector',
+    'StencilMatrix',
+    'StencilInterfaceMatrix'
+)
+
+#===============================================================================
+# Dictionary used to select correct kernel functions based on dimensionality
+kernels = {
+    'axpy'  : (None,   axpy_1d,   axpy_2d,   axpy_3d),
+    'inner' : (None,  inner_1d,  inner_2d,  inner_3d),
+    'matvec': (None, matvec_1d, matvec_2d, matvec_3d),
+    'transpose': (None, transpose_1d, transpose_2d, transpose_3d),
+    'interface_transpose': (None, interface_transpose_1d, interface_transpose_2d, interface_transpose_3d),
+    'stencil2coo': {'F': (None, stencil2coo_1d_F, stencil2coo_2d_F, stencil2coo_3d_F),
+                    'C': (None, stencil2coo_1d_C, stencil2coo_2d_C, stencil2coo_3d_C)}
+}
 
 #===============================================================================
 def compute_diag_len(pads, shifts_domain, shifts_codomain, return_padding=False):
-    """ Compute the diagonal length and the padding of the stencil matrix for each direction,
-        using the shifts of the domain and the codomain.
+    """
+    Compute the diagonal length and the padding of the stencil matrix for each direction,
+    using the shifts of the domain and the codomain.
 
-        Parameters
-        ----------
-        pads : tuple-like (int)
-         Padding along each direction
+    Parameters
+    ----------
+    pads : tuple-like (int)
+        Padding along each direction.
 
-        shifts_domain : tuple_like (int)
-         Shifts of the domain along each direction
+    shifts_domain : tuple_like (int)
+        Shifts of the domain along each direction.
 
-        shifts_codomain : tuple_like (int)
-         Shifts of the codomain along each direction
+    shifts_codomain : tuple_like (int)
+        Shifts of the codomain along each direction.
 
-        return_padding : bool
-            Return the new padding if True
-    
-        Returns
-        -------
-        n : (int)
-         Diagonal length of the stencil matrix
+    return_padding : bool
+        Return the new padding if True.
+
+    Returns
+    -------
+    n : (int)
+        Diagonal length of the stencil matrix.
 
-        ep : (int)
-          Padding that constitutes the starting index of the non zero elements 
+    ep : (int)
+        Padding that constitutes the starting index of the non zero elements.
     """
-    n = ((np.ceil((pads+1)/shifts_codomain)-1)*shifts_domain).astype('int')
+    n  = ((np.ceil((pads+1)/shifts_codomain)-1)*shifts_domain).astype('int')
     ep = -np.minimum(0, n-pads)
-    n = n+ep + pads+1
+    n  = n + ep + pads + 1
     if return_padding:
-        return n.astype('int'), (ep).astype('int')
+        return n.astype('int'), ep.astype('int')
     else:
         return n.astype('int')
 
 #===============================================================================
-class StencilVectorSpace( VectorSpace ):
+class StencilVectorSpace(VectorSpace):
     """
     Vector space for n-dimensional stencil format. Two different initializations
     are possible:
 
-    - serial  : StencilVectorSpace( npts, pads, periods, shifts=None, starts=None, ends=None, dtype=float )
-    - parallel: StencilVectorSpace( cart, dtype=float )
+    - serial  : StencilVectorSpace(npts, pads, periods, shifts=None, starts=None, ends=None, dtype=float)
+    - parallel: StencilVectorSpace(cart, dtype=float)
 
     Parameters
     ----------
@@ -91,21 +118,21 @@ class StencilVectorSpace( VectorSpace ):
 
     """
 
-    def __init__( self, cart, dtype=float ):
+    def __init__(self, cart, dtype=float):
 
-        assert isinstance( cart, (CartDecomposition, InterfaceCartDecomposition) )
+        assert isinstance(cart, (CartDecomposition, InterfaceCartDecomposition))
 
         # Sequential attributes
-        self._parallel   = cart.is_parallel
-        self._cart       = cart
-        self._ndim       = cart._ndims
-        self._npts       = cart.npts
-        self._pads       = cart.pads
-        self._periods    = cart.periods
-        self._shifts     = cart.shifts
-        self._dtype      = dtype
-        self._starts     = cart.starts
-        self._ends       = cart.ends
+        self._parallel = cart.is_parallel
+        self._cart     = cart
+        self._ndim     = cart._ndims
+        self._npts     = cart.npts
+        self._pads     = cart.pads
+        self._periods  = cart.periods
+        self._shifts   = cart.shifts
+        self._dtype    = dtype
+        self._starts   = cart.starts
+        self._ends     = cart.ends
 
         # The shape of the allocated numpy array
         self._shape         = cart.shape
@@ -115,36 +142,67 @@ def __init__( self, cart, dtype=float ):
 
         # The dictionary follows the structure {(axis, ext): StencilVectorSpace()}
         # where axis and ext represent the boundary shared by two patches
-        self._interfaces    = {}
+        self._interfaces = {}
         self._interfaces_readonly = MappingProxyType(self._interfaces)
 
-
         # Parallel attributes
         if cart.is_parallel and not cart.is_comm_null:
-            self._mpi_type      = find_mpi_type(dtype)
+            self._mpi_type = find_mpi_type(dtype)
             if isinstance(cart, InterfaceCartDecomposition):
                 # TODO : Check if this line really change the ._shape
                 self._shape = cart.get_interface_communication_infos(cart.axis)['gbuf_recv_shape'][0]
             else:
-                self._synchronizer = get_data_exchanger( cart, dtype , assembly=True, blocking=False)
+                self._synchronizer = get_data_exchanger(cart, dtype , assembly=True, blocking=False)
+
+        # Select kernel for AXPY operation
+        if self._ndim in [1, 2, 3]:
+            self._axpy_func = kernels['axpy'][self._ndim]
+        else:
+            self._axpy_func = self._axpy_python
+            self._axpy_work = self.zeros()  # work array
+
+        # Select kernel for inner product
+        if self._ndim in [1, 2, 3]:
+            self._inner_func = kernels['inner'][self._ndim]
+        else:
+            self._inner_func = self._inner_python
+
+        # Constant arguments for inner product: total number of ghost cells
+        self._inner_consts = tuple(np.int64(p * s) for p, s in zip(self._pads, self._shifts))
+
+        # TODO [YG, 06.09.2023]: print warning if pure Python functions are used
+
+    #--------------------------------------
+    # Pure Python methods for backup
+    #--------------------------------------
+    def _axpy_python(self, a, x, y):
+        # x and y are the raw NumPy buffers handed over by `axpy` (main data
+        # or interface data), so ndarray.copy(out=...) is not available and
+        # the StencilVector work array does not fit every buffer shape.
+        y += a * x     # y <- a * x + y (in place; allocates one temporary)
+
+    @staticmethod
+    def _inner_python(v1, v2, *nghost):  # one ghost width per dimension, as unpacked by `dot`
+        index = tuple(slice(ng, -ng or None) for ng in nghost)  # ng == 0: keep the full axis
+        return np.vdot(v1[index].flat, v2[index].flat)
 
     #--------------------------------------
     # Abstract interface
     #--------------------------------------
     @property
-    def dimension( self ):
+    def dimension(self):
         """ The dimension of a vector space V is the cardinality
             (i.e. the number of vectors) of a basis of V over its base field.
         """
-        return np.prod( self._npts )
+        return np.prod(self._npts)
 
     # ...
     @property
-    def dtype( self ):
+    def dtype(self):
         return self._dtype
 
     # ...
-    def zeros( self ):
+    def zeros(self):
         """
         Get a copy of the null element of the StencilVectorSpace V.
 
@@ -154,99 +212,138 @@ def zeros( self ):
             A new vector object with all components equal to zero.
 
         """
-        return StencilVector( self )
+        return StencilVector(self)
+
+    # ...
+    def axpy(self, a, x, y):
+        """
+        Increment the vector y with the a-scaled vector x, i.e. y = a * x + y,
+        provided that x and y belong to the same vector space V (self).
+        The scalar value a may be real or complex, depending on the field of V.
+
+        Parameters
+        ----------
+        a : scalar
+            The scaling coefficient; a complex value raises TypeError if V is real.
+
+        x : StencilVector
+            The vector which is not modified by this function.
+
+        y : StencilVector
+            The vector modified in place by this function (incremented by a * x).
+        """
+        assert isinstance(x, StencilVector)
+        assert isinstance(y, StencilVector)
+        assert x._space is self
+        assert y._space is self
+
+        if self.dtype == complex:
+            a = complex(a)
+        else:
+            if isinstance(a, complex):
+                raise TypeError('A complex scalar was given in a real case')
+            else:
+                a = float(a)
+
+        self._axpy_func(a, x._data, y._data)
+
+        for axis, ext in self.interfaces:
+            self._axpy_func(a, x._interface_data[axis, ext], y._interface_data[axis, ext])
+        # Only y was modified: its ghost regions are valid iff both operands' were
+        y._sync = x._sync and y._sync
 
     #--------------------------------------
     # Other properties/methods
     #--------------------------------------
     @property
-    def mpi_type( self ):
+    def mpi_type(self):
         return self._mpi_type
 
     @property
-    def shape( self ):
+    def shape(self):
         return self._shape
 
     @property
-    def parallel( self ):
+    def parallel(self):
         return self._parallel
 
     # ...
     @property
-    def cart( self ):
+    def cart(self):
         return self._cart
 
     # ...
     @property
-    def npts( self ):
+    def npts(self):
         return self._npts
 
     # ...
     @property
-    def starts( self ):
+    def starts(self):
         return self._starts
 
     # ...
     @property
-    def ends( self ):
+    def ends(self):
         return self._ends
 
     # ...
     @property
-    def parent_starts( self ):
+    def parent_starts(self):
         return self._parent_starts
 
     # ...
     @property
-    def parent_ends( self ):
+    def parent_ends(self):
         return self._parent_ends
 
     # ...
     @property
-    def pads( self ):
+    def pads(self):
         return self._pads
 
     # ...
     @property
-    def periods( self ):
+    def periods(self):
         return self._periods
 
     # ...
     @property
-    def shifts( self ):
+    def shifts(self):
         return self._shifts
 
     # ...
     @property
-    def ndim( self ):
+    def ndim(self):
         return self._ndim
 
     @property
-    def interfaces( self ):
+    def interfaces(self):
         return self._interfaces_readonly
 
     def set_interface(self, axis, ext, cart):
-
-        """ Set the interface space along a given axis and extremity.
+        """
+        Set the interface space along a given axis and extremity.
 
         Parameters
         ----------
-         axis : int
-          The axis of the new Interface space.
+        axis : int
+            The axis of the new Interface space.
 
-         ext: int
-          The extremity of the new Interface space.
-          the values must be 1 or -1.
+        ext: {-1, 1}
+            The extremity of the new Interface space.
 
-         cart: CartDecomposition
-          The cart of the new space.
+        cart: CartDecomposition
+            The cart of the new space.
         """
 
-        assert int(ext) in [-1,1]
+        assert int(ext) in [-1, 1]
+        assert isinstance(cart, (CartDecomposition, InterfaceCartDecomposition))
+
+        if cart.is_comm_null:
+            return
 
         # Create the interface space in the parallel case using the new cart
-        assert isinstance(cart, (CartDecomposition, InterfaceCartDecomposition))
-        if cart.is_comm_null: return
         if isinstance(cart, InterfaceCartDecomposition):
             # Case where the patches that share the interface are owned by different intra-communicators
             space = StencilVectorSpace(cart, dtype=self.dtype)
@@ -279,7 +376,7 @@ def set_interface(self, axis, ext, cart):
             self._interfaces[axis, ext] = space
 
 #===============================================================================
-class StencilVector( Vector ):
+class StencilVector(Vector):
     """
     Vector in n-dimensional stencil format.
 
@@ -296,7 +393,7 @@ def __init__(self, V):
         self._space          = V
         self._sizes          = V.shape
         self._ndim           = len(V.npts)
-        self._data           = np.zeros( V.shape, dtype=V.dtype )
+        self._data           = np.zeros(V.shape, dtype=V.dtype)
         self._dot_send_data  = np.zeros((1,), dtype=V.dtype)
         self._dot_recv_data  = np.zeros((1,), dtype=V.dtype)
         self._interface_data = {}
@@ -304,15 +401,16 @@ def __init__(self, V):
 
         # allocate data for the boundary that shares an interface
         for axis, ext in V.interfaces:
-            self._interface_data[axis, ext] = np.zeros( V.interfaces[axis, ext].shape, dtype=V.dtype )
+            self._interface_data[axis, ext] = np.zeros(V.interfaces[axis, ext].shape, dtype=V.dtype)
 
         #prepare communications
         if V.cart.is_parallel and not V.cart.is_comm_null and isinstance(V.cart, CartDecomposition):
             self._requests = V._synchronizer.prepare_communications(self._data)
 
         # TODO: distinguish between different directions
-        self._sync  = False
+        self._sync = False
 
+    #...
     def __del__(self):
         # Release memory of persistent MPI communication channels
         if self._requests:
@@ -342,33 +440,32 @@ def dot(self, v):
         Parameters
         ----------
         v : StencilVector
-            Vector of the same space than self needed for the scalar product
+            Vector of the same space as self, needed for the scalar product.
 
         Returns
         -------
         null: self._space.dtype
-            Scalar containing scalar product of v and self
+            Scalar containing scalar product of v and self.
 
         """
 
         assert isinstance(v, StencilVector)
         assert v._space is self._space
 
+        inner_func = self._space._inner_func
+        inner_args = (self._data, v._data, *self._space._inner_consts)
+
         if self._space.parallel:
-            self._dot_send_data[0] = self._dot(self._data, v._data , self._space.pads, self._space.shifts)
+            # Sometimes in the parallel case, we can get an empty vector that breaks our kernel
+            self._dot_send_data[0] = 0 if self._data.shape[0] == 0 else inner_func(*inner_args)
             self._space.cart.global_comm.Allreduce((self._dot_send_data, self._space.mpi_type),
                                                    (self._dot_recv_data, self._space.mpi_type),
                                                    op=MPI.SUM )
             return self._dot_recv_data[0]
         else:
-            return self._dot(self._data, v._data , self._space.pads, self._space.shifts)
+            return inner_func(*inner_args)
 
     #...
-    @staticmethod
-    def _dot(v1, v2, pads, shifts):
-        index = tuple( slice(m*p,-m*p) for p,m in zip(pads, shifts))
-        return np.vdot(v1[index].flat, v2[index].flat)
-
     def conjugate(self, out=None):
         if out is not None:
             assert isinstance(out, StencilVector)
@@ -590,6 +687,7 @@ def _toarray_parallel_with_pads(self, order='C'):
         # Step 4: return flattened array
         return out.flatten( order=order)
 
+    #...
     def topetsc(self):
         """ Convert to petsc data structure.
         """
@@ -637,8 +735,8 @@ def update_ghost_regions(self):
         if self.space.parallel:
             if not self.space.cart.is_comm_null:
                 # PARALLEL CASE: fill in ghost regions with data from neighbors
-                self.space._synchronizer.start_update_ghost_regions( self._data, self._requests )
-                self.space._synchronizer.end_update_ghost_regions( self._data, self._requests )
+                self.space._synchronizer.start_update_ghost_regions(self._data, self._requests)
+                self.space._synchronizer.  end_update_ghost_regions(self._data, self._requests)
         else:
             # SERIAL CASE: fill in ghost regions along periodic directions, otherwise set to zero
             self._update_ghost_regions_serial()
@@ -647,14 +745,15 @@ def update_ghost_regions(self):
         if self.space.parallel:
 
             for axis, ext in self.space.interfaces:
-                V      = self.space.interfaces[axis, ext]
-                if isinstance(V.cart, InterfaceCartDecomposition):continue
+                V = self.space.interfaces[axis, ext]
+                if isinstance(V.cart, InterfaceCartDecomposition):
+                    continue
                 slices = [slice(s, e+2*m*p+1) for s,e,m,p in zip(V.starts, V.ends, V.shifts, V.pads)]
                 self._interface_data[axis, ext][...] = self._data[tuple(slices)]
         else:
 
             for axis, ext in self.space.interfaces:
-                V      = self.space.interfaces[axis, ext]
+                V = self.space.interfaces[axis, ext]
                 slices = [slice(s, e+2*m*p+1) for s,e,m,p in zip(V.starts, V.ends, V.shifts, V.pads)]
                 self._interface_data[axis, ext][...] = self._data[tuple(slices)]
 
@@ -667,29 +766,29 @@ def _update_ghost_regions_serial(self):
         ndim = self._space.ndim
         for direction in range(ndim):
             periodic = self._space.periods[direction]
-            p        = self._space.pads   [direction]*self._space.shifts[direction]
+            p        = self._space.pads   [direction] * self._space.shifts[direction]
 
-            idx_front = [slice(None)]*direction
-            idx_back  = [slice(None)]*(ndim-direction-1)
+            idx_front = [slice(None)] * direction
+            idx_back  = [slice(None)] * (ndim-direction-1)
 
             if periodic:
                 # Copy data from left to right
-                idx_from = tuple( idx_front + [slice( p, 2*p)] + idx_back )
-                idx_to   = tuple( idx_front + [slice(-p,None)] + idx_back )
+                idx_from = tuple(idx_front + [slice( p, 2*p)] + idx_back)
+                idx_to   = tuple(idx_front + [slice(-p,None)] + idx_back)
                 self._data[idx_to] = self._data[idx_from]
 
                 # Copy data from right to left
-                idx_from = tuple( idx_front + [slice(-2*p,-p)] + idx_back )
-                idx_to   = tuple( idx_front + [slice(None, p)] + idx_back )
+                idx_from = tuple(idx_front + [slice(-2*p,-p)] + idx_back)
+                idx_to   = tuple(idx_front + [slice(None, p)] + idx_back)
                 self._data[idx_to] = self._data[idx_from]
 
             else:
                 # Set left ghost region to zero
-                idx_ghost = tuple( idx_front + [slice(None, p)] + idx_back )
+                idx_ghost = tuple(idx_front + [slice(None, p)] + idx_back)
                 self._data[idx_ghost] = 0
 
                 # Set right ghost region to zero
-                idx_ghost = tuple( idx_front + [slice(-p,None)] + idx_back )
+                idx_ghost = tuple(idx_front + [slice(-p,None)] + idx_back)
                 self._data[idx_ghost] = 0
 
     # ...
@@ -700,22 +799,22 @@ def exchange_assembly_data(self):
 
         if self.space.parallel and not self.space.cart.is_comm_null:
             # PARALLEL CASE: fill in ghost regions with data from neighbors
-            self.space._synchronizer.start_exchange_assembly_data( self._data )
-            self.space._synchronizer.end_exchange_assembly_data( self._data )
+            self.space._synchronizer.start_exchange_assembly_data(self._data)
+            self.space._synchronizer.  end_exchange_assembly_data(self._data)
         else:
             # SERIAL CASE: fill in ghost regions along periodic directions, otherwise set to zero
             self._exchange_assembly_data_serial()
 
         ndim     = self._space.ndim
         for direction in range(ndim):
-            idx_front = [slice(None)]*direction
-            idx_back  = [slice(None)]*(ndim-direction-1)
+            idx_front = [slice(None)] * direction
+            idx_back  = [slice(None)] * (ndim-direction-1)
 
-            p        = self._space.pads   [direction]
+            p        = self._space.pads  [direction]
             m        = self._space.shifts[direction]
-            idx_from = tuple( idx_front + [ slice(-m*p,None) if (-m*p+p)!=0 else slice(-m*p,None)] + idx_back )
+            idx_from = tuple(idx_front + [slice(-m*p,None) if (-m*p+p)!=0 else slice(-m*p,None)] + idx_back)
             self._data[idx_from] = 0.
-            idx_from = tuple( idx_front + [ slice(0,m*p)] + idx_back )
+            idx_from = tuple(idx_front + [slice(0,m*p)] + idx_back)
             self._data[idx_from] = 0.
 
     # ...
@@ -726,15 +825,15 @@ def _exchange_assembly_data_serial(self):
 
             periodic = self._space.periods[direction]
             p        = self._space.pads   [direction]
-            m        = self._space.shifts[direction]
+            m        = self._space.shifts [direction]
 
             if periodic:
-                idx_front = [slice(None)]*direction
-                idx_back  = [slice(None)]*(ndim-direction-1)
+                idx_front = [slice(None)] * direction
+                idx_back  = [slice(None)] * (ndim-direction-1)
 
                 # Copy data from left to right
-                idx_to   = tuple( idx_front + [slice( m*p, m*p+p)] + idx_back )
-                idx_from = tuple( idx_front + [ slice(-m*p,-m*p+p) if (-m*p+p)!=0 else slice(-m*p,None)] + idx_back )
+                idx_to   = tuple(idx_front + [slice( m*p, m*p+p)] + idx_back)
+                idx_from = tuple(idx_front + [slice(-m*p,-m*p+p) if (-m*p+p)!=0 else slice(-m*p,None)] + idx_back)
                 self._data[idx_to] += self._data[idx_from]
 
     #--------------------------------------
@@ -743,8 +842,7 @@ def _exchange_assembly_data_serial(self):
     def _getindex(self, key):
 
         # TODO: check if we should ignore padding elements
-
-        if not isinstance( key, tuple ):
+        if not isinstance(key, tuple):
             key = (key,)
         index = []
         for (i,s,p,m) in zip(key, self.starts, self.pads,self.space.shifts):
@@ -758,7 +856,7 @@ def _getindex(self, key):
         return tuple(index)
 
 #===============================================================================
-class StencilMatrix( LinearOperator ):
+class StencilMatrix(LinearOperator):
     """
     Matrix in n-dimensional stencil format.
 
@@ -774,12 +872,11 @@ class StencilMatrix( LinearOperator ):
 
     W : psydac.linalg.stencil.StencilVectorSpace
         Codomain of the new linear operator.
-
     """
-    def __init__( self, V, W, pads=None , backend=None):
+    def __init__(self, V, W, pads=None, backend=None):
 
-        assert isinstance( V, StencilVectorSpace )
-        assert isinstance( W, StencilVectorSpace )
+        assert isinstance(V, StencilVectorSpace)
+        assert isinstance(W, StencilVectorSpace)
         assert W.pads == V.pads
         if not W.dtype==V.dtype:
             raise NotImplementedError("The domain and the codomain should have the same data type.")
@@ -791,10 +888,10 @@ def __init__( self, V, W, pads=None , backend=None):
         self._pads     = pads or tuple(V.pads)
         dims           = list(W.shape)
         diags          = [compute_diag_len(p, md, mc) for p,md,mc in zip(self._pads, V.shifts, W.shifts)]
-        self._data     = np.zeros( dims+diags, dtype=W.dtype )
+        self._data     = np.zeros(dims+diags, dtype=W.dtype)
         self._domain   = V
         self._codomain = W
-        self._ndim     = len( dims )
+        self._ndim     = len(dims)
         self._backend  = backend
         self._is_T     = False
         self._diag_indices = None
@@ -815,53 +912,52 @@ def __init__( self, V, W, pads=None , backend=None):
         self._sync = False
 
         # Prepare the arguments for the dot product method
-        nd  = [(ej-sj+2*gp*mj-mj*p-gp)//mj*mi+1 for sj,ej,mj,mi,p,gp in zip(V.starts, V.ends, V.shifts, W.shifts, self._pads, V.pads)]
-        nc  = [ei-si+1 for si,ei,mj,p in zip(W.starts, W.ends, V.shifts, self._pads)]
+        nd = [(ej-sj+2*gp*mj-mj*p-gp)//mj*mi+1 for sj,ej,mj,mi,p,gp in zip(V.starts, V.ends, V.shifts, W.shifts, self._pads, V.pads)]
+        nc = [ei-si+1 for si,ei,mj,p in zip(W.starts, W.ends, V.shifts, self._pads)]
 
         # Number of rows in matrix (along each dimension)
-        nrows        = [min(ni,nj) for ni,nj  in zip(nc, nd)]
-        nrows_extra  = [max(0,ni-nj) for ni,nj in zip(nc, nd)]
-
-        args                 = {}
-        args['starts']       = tuple(V.starts)
-        args['nrows']        = tuple(nrows)
-        args['nrows_extra']  = tuple(nrows_extra)
-        args['gpads']        = tuple(V.pads)
-        args['pads']         = tuple(self._pads)
-        args['dm']           = tuple(V.shifts)
-        args['cm']           = tuple(W.shifts)
+        nrows       = [min(ni, nj)   for ni,nj in zip(nc, nd)]
+        nrows_extra = [max(0, ni-nj) for ni,nj in zip(nc, nd)]
+
+        args                = {}
+        args['starts']      = tuple(V.starts)
+        args['nrows']       = tuple(nrows)
+        args['nrows_extra'] = tuple(nrows_extra)
+        args['gpads']       = tuple(V.pads)
+        args['pads']        = tuple(self._pads)
+        args['dm']          = tuple(V.shifts)
+        args['cm']          = tuple(W.shifts)
+        ndiags, _           = list(zip(*[compute_diag_len(p,mj,mi, return_padding=True) for p,mi,mj in zip(self._pads, W.shifts, V.shifts)]))
+        args['pad_imp']     = [gp*m+gp+1-n-s%m+p-gp for gp,m,n,s,p in zip(V.pads, V.shifts, ndiags, V.starts, self._pads)]
+        args['ndiags']      = ndiags
 
         self._dotargs_null = args
-        self._args         = args.copy()
-        self._func         = self._dot
+        self._dot          = kernels['matvec'][self._ndim]
 
         self._transpose_args = self._prepare_transpose_args()
-        self._transpose_func      = eval(f'transpose_{self._ndim}d')
-        if backend is None:
-            backend = PSYDAC_BACKENDS.get(os.environ.get('PSYDAC_BACKEND'))
+        self._transpose_func = kernels['transpose'][self._ndim]
 
-        if backend:
-            self.set_backend(backend)
+        self.set_backend(backend)
 
     #--------------------------------------
     # Abstract interface
     #--------------------------------------
     @property
-    def domain( self ):
+    def domain(self):
         return self._domain
 
     # ...
     @property
-    def codomain( self ):
+    def codomain(self):
         return self._codomain
 
     # ...
     @property
-    def dtype( self ):
+    def dtype(self):
         return self._domain.dtype
 
     # ...
-    def dot( self, v, out=None):
+    def dot(self, v, out=None):
         """
         Return the matrix/vector product between self and v.
         This function optimized this product.
@@ -869,20 +965,18 @@ def dot( self, v, out=None):
         Parameters
         ----------
         v   : StencilVector
-            Vector of the domain of self needed for the Matrix/Vector product
+            Vector of the domain of self needed for the Matrix/Vector product.
 
         out : StencilVector
-            Vector of the codomain of self
+            Vector of the codomain of self.
 
         Returns
         -------
         out : StencilVector
-            Vector of the codomain of self, contain the result of the product
-
+            Vector of the codomain of self, containing the result of the product.
         """
 
-
-        assert isinstance( v, StencilVector )
+        assert isinstance(v, StencilVector)
         assert v.space is self.domain
 
         if out is not None:
@@ -901,6 +995,7 @@ def dot( self, v, out=None):
         out.ghost_regions_in_sync = False
         return out
 
+    # ...
     def vdot( self, v, out=None):
         """
         Return the matrix/vector product between the conjugate of self and v.
@@ -918,14 +1013,13 @@ def vdot( self, v, out=None):
         -------
         out : StencilVector
             Vector of the codomain of self, contain the result of the product
-
         """
 
-        assert isinstance( v, StencilVector )
+        assert isinstance(v, StencilVector)
         assert v.space is self.domain
 
         if out is not None:
-            assert isinstance( out, StencilVector )
+            assert isinstance(out, StencilVector)
             assert out.space is self.codomain
         else:
             out = StencilVector( self.codomain )
@@ -942,49 +1036,6 @@ def vdot( self, v, out=None):
         out.ghost_regions_in_sync = False
         return out
 
-    # ...
-    @staticmethod
-    def _dot(mat, x, out, starts, nrows, nrows_extra, gpads, pads, dm, cm):
-
-        # Index for k=i-j
-        ndim = len(x.shape)
-        kk   = [slice(None)]*ndim
-
-        # pads are <= gpads
-        diff = [gp-p for gp,p in zip(gpads, pads)]
-
-        ndiags, _ = list(zip(*[compute_diag_len(p,mj,mi, return_padding=True) for p,mi,mj in zip(pads,cm,dm)]))
-
-        bb = [p*m+p+1-n-s%m for p,m,n,s in zip(gpads, dm, ndiags, starts)]
-
-        for xx in np.ndindex( *nrows ):
-
-            ii    = tuple( mi*pi + x for mi,pi,x in zip(cm, gpads, xx) )
-            jj    = tuple( slice(b-d+(x+s%mj)//mi*mj,b-d+(x+s%mj)//mi*mj+n) for x,mi,mj,b,s,n,d in zip(xx,cm,dm,bb,starts,ndiags,diff) )
-            ii_kk = tuple( list(ii) + kk )
-            out[ii] = np.dot( mat[ii_kk].flat, x[jj].flat )
-
-        new_nrows = list(nrows).copy()
-
-        for d,er in enumerate(nrows_extra):
-
-            rows = new_nrows.copy()
-            del rows[d]
-
-            for n in range(er):
-                for xx in np.ndindex(*rows):
-                    xx = list(xx)
-                    xx.insert(d, nrows[d]+n)
-
-                    ii     = tuple(mi*pi + x for mi,pi,x in zip(cm, gpads, xx))
-                    ee     = [max(x-l+1,0) for x,l in zip(xx, nrows)]
-                    jj     = tuple( slice(b-d+(x+s%mj)//mi*mj, b-d+(x+s%mj)//mi*mj+n-e) for x,mi,mj,d,e,b,s,n in zip(xx, cm, dm, diff, ee,bb,starts, ndiags) )
-                    kk     = [slice(None,n-e) for n,e in zip(ndiags, ee)]
-                    ii_kk  = tuple( list(ii) + kk )
-                    out[ii] = np.dot( mat[ii_kk].flat, x[jj].flat )
-
-            new_nrows[d] += er
-            
     # ...
     def transpose(self, conjugate=False):
         """ Create new StencilMatrix Mt, where domain and codomain are swapped
@@ -1008,7 +1059,7 @@ def transpose(self, conjugate=False):
         return Mt
 
     # ...
-    def toarray( self, **kwargs ):
+    def toarray(self, **kwargs):
         """ Convert to Numpy 2D array. """
 
         order     = kwargs.pop('order', 'C')
@@ -1022,7 +1073,7 @@ def toarray( self, **kwargs ):
         return coo.toarray()
 
     # ...
-    def tosparse( self, **kwargs ):
+    def tosparse(self, **kwargs):
         """ Convert to any Scipy sparse matrix format. """
 
         order     = kwargs.pop('order', 'C')
@@ -1042,8 +1093,8 @@ def __neg__(self):
         return self.__mul__(-1)
 
     # ...
-    def __mul__( self, a ):
-        w = StencilMatrix( self._domain, self._codomain, self._pads, self._backend )
+    def __mul__(self, a):
+        w = StencilMatrix(self._domain, self._codomain, self._pads, self._backend)
         w._data = self._data * a
         w._func = self._func
         w._args = self._args
@@ -1116,12 +1167,12 @@ def conj(self, out=None):
 
     # ...
     @property
-    def pads( self ):
+    def pads(self):
         return self._pads
 
     # ...
     @property
-    def backend( self ):
+    def backend(self):
         return self._backend
 
     # ...
@@ -1135,11 +1186,11 @@ def __setitem__(self, key, value):
         self._data[index] = value
 
     #...
-    def max( self ):
+    def max(self):
         return self._data.max()
 
     #...
-    def copy( self ):
+    def copy(self):
         M = StencilMatrix( self.domain, self.codomain, self._pads, self._backend )
         M._data[:] = self._data[:]
         M._func    = self._func
@@ -1178,7 +1229,7 @@ def __isub__(self, m):
             return LinearOperator.__sub__(self, m)
 
     #...
-    def __abs__( self ):
+    def __abs__(self):
         w = StencilMatrix( self._domain, self._codomain, self._pads, self._backend )
         w._data = abs(self._data)
         w._func = self._func
@@ -1187,7 +1238,7 @@ def __abs__( self ):
         return w
 
     #...
-    def remove_spurious_entries( self ):
+    def remove_spurious_entries(self):
         """
         If any dimension is NOT periodic, make sure that the corresponding
         periodic corners are set to zero.
@@ -1226,7 +1277,7 @@ def remove_spurious_entries( self ):
                     self[index] = 0
 
     # ...
-    def update_ghost_regions( self ):
+    def update_ghost_regions(self):
         """
         Update ghost regions before performing non-local access to matrix
         elements (e.g. in matrix transposition).
@@ -1247,7 +1298,7 @@ def update_ghost_regions( self ):
         self._sync = True
 
     # ...
-    def exchange_assembly_data( self ):
+    def exchange_assembly_data(self):
         """
         Exchange assembly data.
         """
@@ -1275,7 +1326,7 @@ def exchange_assembly_data( self ):
             self._data[idx_from] = 0.
 
     # ...
-    def _exchange_assembly_data_serial( self ):
+    def _exchange_assembly_data_serial(self):
 
         ndim     = self._codomain.ndim
         for direction in range(ndim):
@@ -1316,19 +1367,18 @@ def diagonal(self):
         return self._data[self._diag_indices].reshape(nrows)
 
     # ...
-    def topetsc( self ):
-        """ Convert to petsc data structure.
+    def topetsc(self):
+        """ Convert to PETSc data structure.
         """
         from psydac.linalg.topetsc import mat_topetsc
-        mat = mat_topetsc( self )
+        mat = mat_topetsc(self)
         return mat
 
     #--------------------------------------
     # Private methods
     #--------------------------------------
 
-    # ...
-    def _getindex( self, key ):
+    def _getindex(self, key):
 
         nd = self._ndim
         ii = key[:nd]
@@ -1347,7 +1397,7 @@ def _getindex( self, key ):
 
     # ...
     @staticmethod
-    def _shift_index( index, shift ):
+    def _shift_index(index, shift):
         if isinstance( index, slice ):
             start = None if index.start is None else index.start + shift
             stop  = None if index.stop  is None else index.stop  + shift
@@ -1355,7 +1405,7 @@ def _shift_index( index, shift ):
         else:
             return index + shift
 
-    def tocoo_local( self, order='C' ):
+    def tocoo_local(self, order='C'):
 
         # Shortcuts
         sc = self._codomain.starts
@@ -1410,7 +1460,7 @@ def tocoo_local( self, order='C' ):
         return M
 
     #...
-    def _tocoo_no_pads( self , order='C'):
+    def _tocoo_no_pads(self , order='C'):
 
         # Shortcuts
         nr    = self._codomain.npts
@@ -1441,8 +1491,7 @@ def _tocoo_no_pads( self , order='C'):
         cpads = [np.int64(i) for i in cpads]
         pp = [np.int64(i) for i in pp]
 
-        func        = 'stencil2coo_{dim}d_{order}'.format(dim=nd, order=order)
-        stencil2coo = eval(func)
+        stencil2coo = kernels['stencil2coo'][order][nd]
 
         ind = stencil2coo(self._data, data, rows, cols, *nrl, *ncl, *ss, *nr, *nc, *dm, *cm, *cpads, *pp)
         M = coo_matrix(
@@ -1452,7 +1501,7 @@ def _tocoo_no_pads( self , order='C'):
         return M
 
     #...
-    def _tocoo_parallel_with_pads( self , order='C'):
+    def _tocoo_parallel_with_pads(self , order='C'):
 
         # If necessary, update ghost regions
         if not self.ghost_regions_in_sync:
@@ -1539,21 +1588,21 @@ def _tocoo_parallel_with_pads( self , order='C'):
 
     # ...
     @property
-    def ghost_regions_in_sync( self ):
+    def ghost_regions_in_sync(self):
         return self._sync
 
     # ...
     # NOTE: this property must be set collectively
     @ghost_regions_in_sync.setter
-    def ghost_regions_in_sync( self, value ):
-        assert isinstance( value, bool )
+    def ghost_regions_in_sync(self, value):
+        assert isinstance(value, bool)
         self._sync = value
 
     # ...
-    def _update_ghost_regions_serial( self ):
+    def _update_ghost_regions_serial(self):
 
-        ndim     = self._codomain.ndim
-        for direction in range( self._codomain.ndim ):
+        ndim = self._codomain.ndim
+        for direction in range(self._codomain.ndim):
 
             periodic = self._codomain.periods[direction]
             p        = self._codomain.pads   [direction]
@@ -1564,23 +1613,23 @@ def _update_ghost_regions_serial( self ):
             if periodic:
 
                 # Copy data from left to right
-                idx_from = tuple( idx_front + [slice( p, 2*p)] + idx_back )
-                idx_to   = tuple( idx_front + [slice(-p,None)] + idx_back )
+                idx_from = tuple(idx_front + [slice( p, 2*p)] + idx_back)
+                idx_to   = tuple(idx_front + [slice(-p,None)] + idx_back)
                 self._data[idx_to] = self._data[idx_from]
 
                 # Copy data from right to left
-                idx_from = tuple( idx_front + [slice(-2*p,-p)] + idx_back )
-                idx_to   = tuple( idx_front + [slice(None, p)] + idx_back )
+                idx_from = tuple(idx_front + [slice(-2*p,-p)] + idx_back)
+                idx_to   = tuple(idx_front + [slice(None, p)] + idx_back)
                 self._data[idx_to] = self._data[idx_from]
 
             else:
 
                 # Set left ghost region to zero
-                idx_ghost = tuple( idx_front + [slice(None, p)] + idx_back )
+                idx_ghost = tuple(idx_front + [slice(None, p)] + idx_back)
                 self._data[idx_ghost] = 0
 
                 # Set right ghost region to zero
-                idx_ghost = tuple( idx_front + [slice(-p,None)] + idx_back )
+                idx_ghost = tuple(idx_front + [slice(-p,None)] + idx_back)
                 self._data[idx_ghost] = 0
 
     # ...
@@ -1593,21 +1642,21 @@ def _prepare_transpose_args(self):
         eec   = W.ends
         ssd   = V.starts
         eed   = V.ends
-        pads    = self._pads
+        pads  = self._pads
         gpads = V.pads
 
         dm    = V.shifts
         cm    = W.shifts
 
         # Number of rows in the transposed matrix (along each dimension)
-        nrows       = [e-s+1 for s, e in zip(ssd, eed)]
-        ncols       = [e-s+2*m*p+1 for s, e, m, p in zip(ssc, eec, cm, gpads)]
+        nrows = [e-s+1 for s, e in zip(ssd, eed)]
+        ncols = [e-s+2*m*p+1 for s, e, m, p in zip(ssc, eec, cm, gpads)]
 
         pp = pads
         ndiags, starts = list(zip(*[compute_diag_len(p, mi, mj, return_padding=True) for p, mi, mj in zip(pp, cm, dm)]))
         ndiagsT, _     = list(zip(*[compute_diag_len(p, mj, mi, return_padding=True) for p, mi, mj in zip(pp, cm, dm)]))
 
-        diff   = [gp-p for gp, p in zip(gpads, pp)]
+        diff = [gp-p for gp, p in zip(gpads, pp)]
 
         sl   = [(s if mi > mj else 0) + (s % mi + mi//mj if mi < mj else 0)+(s if mi == mj else 0)\
                  for s, p, mi, mj in zip(starts, pp, cm, dm)]
@@ -1632,20 +1681,23 @@ def _prepare_transpose_args(self):
         args['cm']  = np.int64(cm)
         args['nd']  = np.int64(ndiags)
         args['ndT'] = np.int64(ndiagsT)
-        args['si'] = np.int64(si)
-        args['sk'] = np.int64(sk)
-        args['sl'] = np.int64(sl)
+        args['si']  = np.int64(si)
+        args['sk']  = np.int64(sk)
+        args['sl']  = np.int64(sl)
 
         return args
 
     # ...
     def set_backend(self, backend):
         from psydac.api.ast.linalg import LinearOperatorDot
-        self._backend         = backend
-        self._args            = self._dotargs_null.copy()
+        self._backend = backend
+        self._args    = self._dotargs_null.copy()
 
         if self._backend is None:
-            self._func           = self._dot
+            for key, arg in self._args.items():
+                self._args[key] = np.int64(arg)
+            self._func = self._dot
+            self._args.pop('pads')
         else:
             if self.domain.parallel:
                 comm = self.codomain.cart.comm
@@ -1730,6 +1782,8 @@ def set_backend(self, backend):
                 self._args.pop('dm')
                 self._args.pop('cm')
 
+            self._args.pop('pad_imp')
+            self._args.pop('ndiags')
             self._func = dot.func
 
 #===============================================================================
@@ -1737,8 +1791,8 @@ def set_backend(self, backend):
 # - Check if StencilMatrix should be subclassed
 # - Reimplement magic methods (some are simply copied from StencilMatrix)
 def flip_axis(index, n):
-    s = n-index.start-1
-    e = n-index.stop-1 if n>index.stop else None
+    s = n - index.start-1
+    e = n - index.stop-1 if n > index.stop else None
     return slice(s,e,-1)
 
 class StencilInterfaceMatrix(LinearOperator):
@@ -1783,10 +1837,10 @@ class StencilInterfaceMatrix(LinearOperator):
           Padding of the linear operator.
 
     """
-    def __init__( self, V, W, s_d, s_c, d_axis, c_axis, d_ext, c_ext, *, flip=None, pads=None, backend=None ):
+    def __init__(self, V, W, s_d, s_c, d_axis, c_axis, d_ext, c_ext, *, flip=None, pads=None, backend=None):
 
-        assert isinstance( V, StencilVectorSpace )
-        assert isinstance( W, StencilVectorSpace )
+        assert isinstance(V, StencilVectorSpace)
+        assert isinstance(W, StencilVectorSpace)
         assert W.pads == V.pads
 
         Vin = V.interfaces[d_axis, d_ext]
@@ -1795,17 +1849,17 @@ def __init__( self, V, W, s_d, s_c, d_axis, c_axis, d_ext, c_ext, *, flip=None,
             for p,vp in zip(pads, Vin.pads):
                 assert p<=vp
 
-        self._pads     = pads or tuple(Vin.pads)
-        dims           = list(W.shape)
+        self._pads = pads or tuple(Vin.pads)
+        dims       = list(W.shape)
 
         if W.parent_ends[c_axis] is not None:
             diff = min(1, W.parent_ends[c_axis]-W.ends[c_axis])
         else:
             diff = 0
 
-        dims[c_axis]   = W.pads[c_axis] + 1-diff + 2*W.shifts[c_axis]*W.pads[c_axis]
-        diags          = [compute_diag_len(p, md, mc) for p,md,mc in zip(self._pads, Vin.shifts, W.shifts)]
-        self._data     = np.zeros( dims+diags, dtype=W.dtype )
+        dims[c_axis] = W.pads[c_axis] + 1-diff + 2*W.shifts[c_axis]*W.pads[c_axis]
+        diags        = [compute_diag_len(p, md, mc) for p,md,mc in zip(self._pads, Vin.shifts, W.shifts)]
+        self._data   = np.zeros(dims + diags, dtype=W.dtype)
 
         # Parallel attributes
         if W.parallel and not isinstance(W.cart, InterfaceCartDecomposition):
@@ -1831,13 +1885,13 @@ def __init__( self, V, W, s_d, s_c, d_axis, c_axis, d_ext, c_ext, *, flip=None,
         self._codomain_ext   = c_ext
         self._domain_start   = s_d
         self._codomain_start = s_c
-        self._ndim        = len( dims )
-        self._backend     = None
+        self._ndim           = len(dims)
+        self._backend        = None
 
 
         # Prepare the arguments for the dot product method
-        nd  = [(ej-sj+2*gp*mj-mj*p-gp)//mj*mi+1 for sj,ej,mj,mi,p,gp in zip(Vin.starts, Vin.ends, Vin.shifts, W.shifts, self._pads, Vin.pads)]
-        nc  = [ei-si+1 for si,ei,mj,p in zip(W.starts, W.ends, Vin.shifts, self._pads)]
+        nd = [(ej-sj+2*gp*mj-mj*p-gp)//mj*mi+1 for sj,ej,mj,mi,p,gp in zip(Vin.starts, Vin.ends, Vin.shifts, W.shifts, self._pads, Vin.pads)]
+        nc = [ei-si+1 for si,ei,mj,p in zip(W.starts, W.ends, Vin.shifts, self._pads)]
 
         # Number of rows in matrix (along each dimension)
         nrows         = [min(ni,nj) for ni,nj  in zip(nc, nd)]
@@ -1846,26 +1900,26 @@ def __init__( self, V, W, s_d, s_c, d_axis, c_axis, d_ext, c_ext, *, flip=None,
         nrows[c_axis] = W.pads[c_axis] + 1-diff-nrows_extra[c_axis]
 
 
-        args                 = {}
-        args['starts']       = tuple(Vin.starts)
-        args['nrows']        = tuple(nrows)
-        args['nrows_extra']  = tuple(nrows_extra)
-        args['gpads']        = tuple(Vin.pads)
-        args['pads']         = tuple(self._pads)
-        args['dm']           = tuple(Vin.shifts)
-        args['cm']           = tuple(W.shifts)
-        args['c_axis']       = c_axis
-        args['d_start']      = self._domain_start
-        args['c_start']      = self._codomain_start
-        args['flip']         = self._flip
-        args['permutation']  = self._permutation
+        args                = {}
+        args['starts']      = tuple(Vin.starts)
+        args['nrows']       = tuple(nrows)
+        args['nrows_extra'] = tuple(nrows_extra)
+        args['gpads']       = tuple(Vin.pads)
+        args['pads']        = tuple(self._pads)
+        args['dm']          = tuple(Vin.shifts)
+        args['cm']          = tuple(W.shifts)
+        args['c_axis']      = c_axis
+        args['d_start']     = self._domain_start
+        args['c_start']     = self._codomain_start
+        args['flip']        = self._flip
+        args['permutation'] = self._permutation
 
         self._dotargs_null = args
         self._args         = args.copy()
         self._func         = self._dot
 
         self._transpose_args = self._prepare_transpose_args()
-        self._transpose_func = eval(f'interface_transpose_{self._ndim}d')
+        self._transpose_func = kernels['interface_transpose'][self._ndim]
 
         if backend is None:
             backend = PSYDAC_BACKENDS.get(os.environ.get('PSYDAC_BACKEND'))
@@ -1880,29 +1934,29 @@ def __init__( self, V, W, s_d, s_c, d_axis, c_axis, d_ext, c_ext, *, flip=None,
     # Abstract interface
     #--------------------------------------
     @property
-    def domain( self ):
+    def domain(self):
         return self._domain
 
     # ...
     @property
-    def codomain( self ):
+    def codomain(self):
         return self._codomain
 
     # ...
     @property
-    def dtype( self ):
+    def dtype(self):
         return self.domain.dtype
 
     # ...
-    def dot( self, v, out=None ):
+    def dot(self, v, out=None):
 
-        assert isinstance( v, StencilVector )
+        assert isinstance(v, StencilVector)
         assert v.space is self.domain
 
         # Necessary if vector space is distributed across processes
 
         if out is not None:
-            assert isinstance( out, StencilVector )
+            assert isinstance(out, StencilVector)
             assert out.space is self.codomain
             out[(slice(None,None),)*v.space.ndim] = 0.
         else:
@@ -1922,10 +1976,10 @@ def dot( self, v, out=None ):
     def _dot(mat, v, out, starts, nrows, nrows_extra, gpads, pads, dm, cm, c_axis, d_start, c_start, flip, permutation):
 
         # Index for k=i-j
-        nrows      = list(nrows)
-        ndim       = len(v.shape)
-        kk         = [slice(None)]*ndim
-        diff       = [xp-p for xp,p in zip(gpads, pads)]
+        nrows     = list(nrows)
+        ndim      = len(v.shape)
+        kk        = [slice(None)]*ndim
+        diff      = [xp-p for xp,p in zip(gpads, pads)]
 
         ndiags, _ = list(zip(*[compute_diag_len(p,mj,mi, return_padding=True) for p,mi,mj in zip(pads,cm,dm)]))
         bb        = [p*m+p+1-n-s%m for p,m,n,s in zip(gpads, dm, ndiags, starts)]
@@ -2003,8 +2057,8 @@ def _prepare_transpose_args(self):
         dim   = self._codomain_axis
 
         # Number of rows in the transposed matrix (along each dimension)
-        nrows       = [e-s+1 for s,e in zip(ssd, eed)]
-        ncols       = [e-s+1+2*m*p for s, e, m, p in zip(ssc, eec, cm, gpads)]
+        nrows = [e-s+1 for s,e in zip(ssd, eed)]
+        ncols = [e-s+1+2*m*p for s, e, m, p in zip(ssc, eec, cm, gpads)]
 
         pp = pads
         ndiags, starts = list(zip(*[compute_diag_len(p,mi,mj, return_padding=True) for p, mi, mj in zip(pp, cm, dm)]))
@@ -2040,8 +2094,7 @@ def _prepare_transpose_args(self):
         nrows[dim] = pads[dim] + 1 - diff_r
         ncols[dim] = pads[dim] + 1 - diff_c + 2*cm[dim]*pads[dim]
 
-
-        args={}
+        args = {}
         args['n']   = np.int64(nrows)
         args['nc']  = np.int64(ncols)
         args['gp']  = np.int64(gpads)
@@ -2050,12 +2103,14 @@ def _prepare_transpose_args(self):
         args['cm']  = np.int64(cm)
         args['nd']  = np.int64(ndiags)
         args['ndT'] = np.int64(ndiagsT)
-        args['si'] = np.int64(si)
-        args['sk'] = np.int64(sk)
-        args['sl'] = np.int64(sl)
+        args['si']  = np.int64(si)
+        args['sk']  = np.int64(sk)
+        args['sl']  = np.int64(sl)
+
         return args
+
     # ...
-    def toarray( self, **kwargs ):
+    def toarray(self, **kwargs):
 
         order     = kwargs.pop('order', 'C')
         with_pads = kwargs.pop('with_pads', False)
@@ -2068,7 +2123,7 @@ def toarray( self, **kwargs ):
         return coo.toarray()
 
     # ...
-    def tosparse( self, **kwargs ):
+    def tosparse(self, **kwargs):
 
         order     = kwargs.pop('order', 'C')
         with_pads = kwargs.pop('with_pads', False)
@@ -2081,7 +2136,7 @@ def tosparse( self, **kwargs ):
         return coo
 
     #...
-    def copy( self ):
+    def copy(self):
         M = StencilInterfaceMatrix( self._domain, self._codomain,
                                     self._domain_start, self._codomain_start,
                                     self._domain_axis, self._codomain_axis,
@@ -2096,7 +2151,7 @@ def __neg__(self):
         return self.__mul__(-1)
 
     #...
-    def __mul__( self, a ):
+    def __mul__(self, a):
         w = self.copy()
         w._data *= a
         w._sync = self._sync
@@ -2128,52 +2183,52 @@ def __isub__(self, m):
 
     # ...
     @property
-    def domain_axis( self ):
+    def domain_axis(self):
         return self._domain_axis
 
     # ...
     @property
-    def codomain_axis( self ):
+    def codomain_axis(self):
         return self._codomain_axis
 
     # ...
     @property
-    def domain_ext( self ):
+    def domain_ext(self):
         return self._domain_ext
 
     # ...
     @property
-    def codomain_ext( self ):
+    def codomain_ext(self):
         return self._codomain_ext
 
     # ...
     @property
-    def domain_start( self ):
+    def domain_start(self):
         return self._domain_start
 
     # ...
     @property
-    def codomain_start( self ):
+    def codomain_start(self):
         return self._codomain_start
 
     # ...
     @property
-    def dim( self ):
+    def dim(self):
         return self._ndim
 
     # ...
     @property
-    def flip( self ):
+    def flip(self):
         return self._flip
 
     # ...
     @property
-    def permutation( self ):
+    def permutation(self):
         return self._permutation
 
     # ...
     @property
-    def pads( self ):
+    def pads(self):
         return self._pads
 
     # ...
@@ -2187,18 +2242,18 @@ def __setitem__(self, key, value):
         self._data[index] = value
 
     #...
-    def max( self ):
+    def max(self):
         return self._data.max()
 
     # ...
     @property
-    def backend( self ):
+    def backend(self):
         return self._backend
 
     #--------------------------------------
     # Private methods
     #--------------------------------------
-    def _getindex( self, key ):
+    def _getindex(self, key):
 
         nd = self._ndim
         ii = key[:nd]
@@ -2206,20 +2261,20 @@ def _getindex( self, key ):
 
         index = []
 
-        for i,s,p in zip( ii, self._codomain.starts, self._codomain.pads ):
-            x = self._shift_index( i, p-s )
-            index.append( x )
+        for i,s,p in zip(ii, self._codomain.starts, self._codomain.pads):
+            x = self._shift_index(i, p-s)
+            index.append(x)
 
-        for k,p in zip( kk, self._pads ):
-            l = self._shift_index( k, p )
-            index.append( l )
+        for k,p in zip(kk, self._pads):
+            l = self._shift_index(k, p)
+            index.append(l)
 
         return tuple(index)
 
     # ...
     @staticmethod
-    def _shift_index( index, shift ):
-        if isinstance( index, slice ):
+    def _shift_index(index, shift):
+        if isinstance(index, slice):
             start = None if index.start is None else index.start + shift
             stop  = None if index.stop  is None else index.stop  + shift
             return slice(start, stop, index.step)
@@ -2227,7 +2282,7 @@ def _shift_index( index, shift ):
             return index + shift
 
     #...
-    def _tocoo_no_pads( self ):
+    def _tocoo_no_pads(self):
         # Shortcuts
         nr  = self.codomain.npts
         nc  = self.domain.npts
@@ -2273,12 +2328,12 @@ def _tocoo_no_pads( self ):
 
                 jj = [jj[i] for i in permutation]
 
-                I = ravel_multi_index( ii, dims=nr, order='C' )
-                J = ravel_multi_index( jj, dims=nc, order='C' )
+                I = ravel_multi_index(ii, dims=nr, order='C')
+                J = ravel_multi_index(jj, dims=nc, order='C')
 
-                rows.append( I )
-                cols.append( J )
-                data.append( value )
+                rows.append(I)
+                cols.append(J)
+                data.append(value)
 
         M = coo_matrix(
                     (data,(rows,cols)),
@@ -2289,55 +2344,55 @@ def _tocoo_no_pads( self ):
 
     # ...
     @property
-    def ghost_regions_in_sync( self ):
+    def ghost_regions_in_sync(self):
         return self._sync
 
     # ...
     # NOTE: this property must be set collectively
     @ghost_regions_in_sync.setter
-    def ghost_regions_in_sync( self, value ):
-        assert isinstance( value, bool )
+    def ghost_regions_in_sync(self, value):
+        assert isinstance(value, bool)
         self._sync = value
 
     # ...
-    def _update_ghost_regions_serial( self, direction: int ):
+    def _update_ghost_regions_serial(self, direction: int):
 
         if direction is None:
-            for d in range( self._codomain.ndim ):
-                self._update_ghost_regions_serial( d )
+            for d in range(self._codomain.ndim):
+                self._update_ghost_regions_serial(d)
             return
 
         ndim     = self._codomain.ndim
         periodic = self._codomain.periods[direction]
         p        = self._codomain.pads   [direction]
 
-        idx_front = [slice(None)]*direction
-        idx_back  = [slice(None)]*(ndim-direction-1 + ndim)
+        idx_front = [slice(None)] * direction
+        idx_back  = [slice(None)] * (ndim-direction-1)
 
         if periodic:
 
             # Copy data from left to right
-            idx_from = tuple( idx_front + [slice( p, 2*p)] + idx_back )
-            idx_to   = tuple( idx_front + [slice(-p,None)] + idx_back )
+            idx_from = tuple(idx_front + [slice( p, 2*p)] + idx_back)
+            idx_to   = tuple(idx_front + [slice(-p,None)] + idx_back)
             self._data[idx_to] = self._data[idx_from]
 
             # Copy data from right to left
-            idx_from = tuple( idx_front + [slice(-2*p,-p)] + idx_back )
-            idx_to   = tuple( idx_front + [slice(None, p)] + idx_back )
+            idx_from = tuple(idx_front + [slice(-2*p,-p)] + idx_back)
+            idx_to   = tuple(idx_front + [slice(None, p)] + idx_back)
             self._data[idx_to] = self._data[idx_from]
 
         else:
 
             # Set left ghost region to zero
-            idx_ghost = tuple( idx_front + [slice(None, p)] + idx_back )
+            idx_ghost = tuple(idx_front + [slice(None, p)] + idx_back)
             self._data[idx_ghost] = 0
 
             # Set right ghost region to zero
-            idx_ghost = tuple( idx_front + [slice(-p,None)] + idx_back )
+            idx_ghost = tuple(idx_front + [slice(-p,None)] + idx_back)
             self._data[idx_ghost] = 0
 
     # ...
-    def exchange_assembly_data( self ):
+    def exchange_assembly_data(self):
         """
         Exchange assembly data.
         """
@@ -2346,38 +2401,41 @@ def exchange_assembly_data( self ):
 
         if self._codomain.parallel:
             # PARALLEL CASE: fill in ghost regions with data from neighbors
-            self._synchronizer.start_exchange_assembly_data( self._data )
-            self._synchronizer.end_exchange_assembly_data( self._data )
+            self._synchronizer.start_exchange_assembly_data(self._data)
+            self._synchronizer.  end_exchange_assembly_data(self._data)
         else:
             # SERIAL CASE: fill in ghost regions along periodic directions, otherwise set to zero
             self._exchange_assembly_data_serial()
 
     # ...
-    def _exchange_assembly_data_serial( self ):
+    def _exchange_assembly_data_serial(self):
 
-        ndim     = self._codomain.ndim
+        ndim = self._codomain.ndim
         for direction in range(ndim):
-            if direction == self._codomain_axis:continue
+            if direction == self._codomain_axis:
+                continue
             periodic = self._codomain.periods[direction]
             p        = self._codomain.pads   [direction]
-            m        = self._codomain.shifts[direction]
+            m        = self._codomain.shifts [direction]
 
             if periodic:
-                idx_front = [slice(None)]*direction
-                idx_back  = [slice(None)]*(ndim-direction-1)
+                idx_front = [slice(None)] * direction
+                idx_back  = [slice(None)] * (ndim-direction-1)
 
                 # Copy data from left to right
-                idx_to   = tuple( idx_front + [slice( m*p, m*p+p)] + idx_back )
-                idx_from = tuple( idx_front + [ slice(-m*p,-m*p+p) if (-m*p+p)!=0 else slice(-m*p,None)] + idx_back )
+                idx_to   = tuple(idx_front + [slice( m*p, m*p+p)] + idx_back)
+                idx_from = tuple(idx_front + [slice(-m*p,-m*p+p) if (-m*p+p)!=0 else slice(-m*p, None)] + idx_back)
                 self._data[idx_to] += self._data[idx_from]
 
+    # ...
     def set_backend(self, backend):
         from psydac.api.ast.linalg import LinearOperatorDot
-        self._backend         = backend
-        self._args            = self._dotargs_null.copy()
+
+        self._backend = backend
+        self._args    = self._dotargs_null.copy()
 
         if self._backend is None:
-            self._func           = self._dot
+            self._func = self._dot
         else:
             if self.domain.parallel:
 
@@ -2439,10 +2497,10 @@ def set_backend(self, backend):
                         self._args['s00_{i}'.format(i=i+1)] = np.int64(starts[i])
 
                     for i in range(len(nrows)):
-                        self._args['n00_{i}'.format(i=i+1)] =  np.int64(nrows[i])
+                        self._args['n00_{i}'.format(i=i+1)] = np.int64(nrows[i])
 
                     for i in range(len(nrows)):
-                        self._args['ne00_{i}'.format(i=i+1)] =  np.int64(nrows_extra[i])
+                        self._args['ne00_{i}'.format(i=i+1)] = np.int64(nrows_extra[i])
 
             else:
                 dot = LinearOperatorDot(self._ndim,
@@ -2469,5 +2527,4 @@ def set_backend(self, backend):
             self._func = dot.func
 
 #===============================================================================
-from psydac.api.settings   import PSYDAC_BACKENDS
 del VectorSpace, Vector
diff --git a/psydac/linalg/tests/test_block.py b/psydac/linalg/tests/test_block.py
index c556ba4a4..c67f49262 100644
--- a/psydac/linalg/tests/test_block.py
+++ b/psydac/linalg/tests/test_block.py
@@ -299,30 +299,31 @@ def test_2D_block_diagonal_solver_serial_init( dtype, n1, n2, p1, p2, P1, P2  ):
     assert L3.blocks == (M1, M2)
     assert L3.n_blocks == 2
 #===============================================================================
+@pytest.mark.parametrize( 'dtype', [float, complex] )
 @pytest.mark.parametrize( 'ndim', [1, 2, 3] )
 @pytest.mark.parametrize( 'p', [1, 2] )
 @pytest.mark.parametrize( 'P1', [True, False] )
 @pytest.mark.parametrize( 'P2', [True, False] )
 @pytest.mark.parametrize( 'P3', [True] )
-def test_block_serial_dimension( ndim, p, P1, P2, P3 ):
+def test_block_serial_dimension( ndim, p, P1, P2, P3, dtype ):
 
-    if ndim==1:
-        npts=[12]
-        ps=[p]
-        Ps=[P1]
-        shifts=[1]
+    if ndim == 1:
+        npts = [12]
+        ps = [p]
+        Ps = [P1]
+        shifts = [1]
 
-    elif ndim==2:
-        npts=[12,15]
-        ps=[p,p]
-        Ps=[P1,P2]
-        shifts=[1,1]
+    elif ndim == 2:
+        npts =[12, 15]
+        ps = [p, p]
+        Ps = [P1, P2]
+        shifts = [1, 1]
 
     else:
-        npts=[12,15,9]
-        ps=[p,p,p]
-        Ps=[P1,P2,P3]
-        shifts=[1,1,1]
+        npts = [12, 15, 9]
+        ps = [p, p, p]
+        Ps = [P1, P2, P3]
+        shifts = [1, 1, 1]
 
     # set seed for reproducibility
     D = DomainDecomposition(npts, periods=Ps)
@@ -333,7 +334,11 @@ def test_block_serial_dimension( ndim, p, P1, P2, P3 ):
     cart = CartDecomposition(D, npts, global_starts, global_ends, pads=ps, shifts=shifts)
 
     # Create vector spaces, stencil matrices, and stencil vectors
-    V = StencilVectorSpace( cart)
+    V = StencilVectorSpace( cart, dtype=dtype)
+    if dtype==complex:
+        cst=1j
+    else:
+        cst=1
 
     x1 = StencilVector( V )
     x2 = StencilVector( V )
@@ -345,22 +350,22 @@ def test_block_serial_dimension( ndim, p, P1, P2, P3 ):
 
     # Fill in vector with random values, then update ghost regions
     if ndim==1:
-        x1[:] = 2.0*np.random.random((npts[0]+2*p))
-        x2[:] = 5.0*np.random.random((npts[0]+2*p))
-        y1[:] = 2.0*np.random.random((npts[0]+2*p))
-        y2[:] = 3.0*np.random.random((npts[0]+2*p))
+        x1[:] = cst*2.0*np.random.random((npts[0]+2*p))
+        x2[:] = cst*5.0*np.random.random((npts[0]+2*p))
+        y1[:] = cst*2.0*np.random.random((npts[0]+2*p))
+        y2[:] = cst*3.0*np.random.random((npts[0]+2*p))
 
     elif ndim==2:
-        x1[:,:] = 2.0*np.random.random((npts[0]+2*p,npts[1]+2*p))
-        x2[:,:] = 5.0*np.random.random((npts[0]+2*p,npts[1]+2*p))
-        y1[:,:] = 2.0*np.random.random((npts[0]+2*p,npts[1]+2*p))
-        y2[:,:] = 3.0*np.random.random((npts[0]+2*p,npts[1]+2*p))
+        x1[:,:] = cst*2.0*np.random.random((npts[0]+2*p,npts[1]+2*p))
+        x2[:,:] = cst*5.0*np.random.random((npts[0]+2*p,npts[1]+2*p))
+        y1[:,:] = cst*2.0*np.random.random((npts[0]+2*p,npts[1]+2*p))
+        y2[:,:] = cst*3.0*np.random.random((npts[0]+2*p,npts[1]+2*p))
 
     else:
-        x1[:,:,:] = 2.0*np.random.random((npts[0]+2*p,npts[1]+2*p,npts[2]+2*p))
-        x2[:,:,:] = 5.0*np.random.random((npts[0]+2*p,npts[1]+2*p,npts[2]+2*p))
-        y1[:,:,:] = 2.0*np.random.random((npts[0]+2*p,npts[1]+2*p,npts[2]+2*p))
-        y2[:,:,:] = 3.0*np.random.random((npts[0]+2*p,npts[1]+2*p,npts[2]+2*p))
+        x1[:,:,:] = cst*2.0*np.random.random((npts[0]+2*p,npts[1]+2*p,npts[2]+2*p))
+        x2[:,:,:] = cst*5.0*np.random.random((npts[0]+2*p,npts[1]+2*p,npts[2]+2*p))
+        y1[:,:,:] = cst*2.0*np.random.random((npts[0]+2*p,npts[1]+2*p,npts[2]+2*p))
+        y2[:,:,:] = cst*3.0*np.random.random((npts[0]+2*p,npts[1]+2*p,npts[2]+2*p))
 
     x1.update_ghost_regions()
     x2.update_ghost_regions()
@@ -380,11 +385,18 @@ def test_block_serial_dimension( ndim, p, P1, P2, P3 ):
     Y[0] = y1
     Y[1] = y2
 
-    exact_dot=x1.dot(y1)+x2.dot(y2)
+    # Test dot product
+    exact_dot = x1.dot(y1)+x2.dot(y2)
 
-    assert X.dtype == float
+    assert X.dtype == dtype
     assert np.allclose(X.dot(Y), exact_dot,  rtol=1e-14, atol=1e-14 )
 
+    # Test axpy operation: X <- X + a * Y (in place)
+    axpy_exact = X + np.pi * cst * Y
+    X.mul_iadd(np.pi * cst, Y)
+    assert np.allclose(X[0]._data, axpy_exact[0]._data,  rtol=1e-10, atol=1e-10 )
+    assert np.allclose(X[1]._data, axpy_exact[1]._data,  rtol=1e-10, atol=1e-10 )
+
     M1 = StencilMatrix(V, V)
     M2 = StencilMatrix(V, V)
     M3 = StencilMatrix(V, V)
@@ -421,7 +433,7 @@ def test_block_serial_dimension( ndim, p, P1, P2, P3 ):
     Y[0]=M1.dot(x1)+M2.dot(x2)
     Y[1]=M3.dot(x1)
 
-    assert M.dtype == float
+    assert M.dtype == dtype
     assert np.allclose((M.dot(X)).toarray(), Y.toarray(),  rtol=1e-14, atol=1e-14 )
 #===============================================================================
 @pytest.mark.parametrize( 'dtype', [float, complex] )
@@ -1415,6 +1427,13 @@ def test_block_matrix_operator_parallel_dot_backend( dtype, n1, n2, p1, p2, P1,
     y1 = M1.dot(x1) + M2.dot(x2)
     y2 = M3.dot(x1) + M4.dot(x2)
 
+    # Test axpy method in parallel
+    z3 = X + 5 * factor * Y
+    X.mul_iadd(5 * factor, Y)
+
+    # Check the in-place axpy result against the explicitly computed reference
+    assert np.allclose(X[0]._data, z3[0]._data)
+
     # Check data in 1D array
     assert np.allclose( Y.blocks[0].toarray(), y1.toarray(), rtol=1e-13, atol=1e-13 )
     assert np.allclose( Y.blocks[1].toarray(), y2.toarray(), rtol=1e-13, atol=1e-13 )
diff --git a/psydac/linalg/tests/test_stencil_matrix.py b/psydac/linalg/tests/test_stencil_matrix.py
index 0fd702796..521bdd587 100644
--- a/psydac/linalg/tests/test_stencil_matrix.py
+++ b/psydac/linalg/tests/test_stencil_matrix.py
@@ -730,7 +730,7 @@ def test_stencil_matrix_2d_serial_dot_1(dtype, n1, n2, p1, p2, s1, s2, P1, P2):
 
     # Check data in 1D array
     assert y.dtype==dtype
-    assert np.allclose(ya, ya_exact, rtol=1e-13, atol=1e-13)
+    assert np.allclose(ya, ya_exact, rtol=1e-12, atol=1e-12)
 
 # TODO: verify for s>1
 # ===============================================================================
@@ -1208,9 +1208,6 @@ def test_stencil_matrix_2d_serial_vdot(dtype, n1, n2, p1, p2, s1, s2, P1, P2):
 
     # Exact result using Numpy dot product
     ya_exact = np.dot(np.conjugate(Ma), xa)
-
-    print(ya)
-    print(ya_exact)
     # Check data in 1D array
     assert y.dtype == dtype
     assert np.allclose(ya, ya_exact, rtol=1e-13, atol=1e-13)
@@ -2179,7 +2176,6 @@ def test_stencil_matrix_1d_parallel_dot(dtype, n1, p1, sh1, P1):
     ya_exact = Ms.dot(xa)
 
     # Check data in 1D array
-    print(ya-ya_exact)
     assert np.allclose(ya, ya_exact, rtol=1e-14, atol=1e-14)
 
 # ===============================================================================
diff --git a/psydac/linalg/tests/test_stencil_vector.py b/psydac/linalg/tests/test_stencil_vector.py
index 9bc51e8bf..03da52906 100644
--- a/psydac/linalg/tests/test_stencil_vector.py
+++ b/psydac/linalg/tests/test_stencil_vector.py
@@ -298,11 +298,21 @@ def test_stencil_vector_2d_serial_dot(dtype, n1, n2, p1, p2, s1, s2, P1=True, P2
     else:
         z_exact = np.dot(x.toarray(), y.toarray())
 
-    # Test exact value and symetry of the scalar product
+    # Compute the exact axpy solution
+    if dtype == complex:
+        cst = 5j
+    else:
+        cst = 5
+
+    z3 = x + cst * y
+    x.mul_iadd(cst, y)
+
+    # Test exact value and symmetry of the scalar product, then the axpy result
     assert z1.dtype == dtype
     assert z2.dtype == dtype
     assert z1 == z_exact
     assert z2 == z_exact.conjugate()
+    assert np.allclose(x._data, z3._data)
 
 # ===============================================================================
 @pytest.mark.parametrize('dtype', [float, complex])
@@ -690,6 +700,17 @@ def test_stencil_vector_2d_parallel_dot(dtype, n1, n2, p1, p2, s1, s2, P1=True,
         res_ex1 = comm.allreduce(np.dot(x.toarray(), y.toarray()))
         res_ex2 = res_ex1
 
+    # Compute the exact axpy solution
+    if dtype == complex:
+        cst = 5j
+    else:
+        cst = 5
+
+    z3 = x + cst * y
+    x.mul_iadd(cst, y)
+
+    # Test the axpy result, then the exact value and symmetry of the scalar product
+    assert np.allclose(x._data, z3._data)
     assert res1 == res_ex1
     assert res2 == res_ex2
 
@@ -748,6 +769,18 @@ def test_stencil_vector_3d_parallel_dot(dtype, n1, n2, n3, p1, p2, p3, s1, s2, s
         res_ex1 = comm.allreduce(np.dot(x.toarray(), y.toarray()))
         res_ex2 = res_ex1
 
+    # Compute the exact axpy solution
+    if dtype == complex:
+        cst = 5j
+    else:
+        cst = 5
+
+    z3 = x + cst * y
+    x.mul_iadd(cst, y)
+
+    # Test the axpy result, then the exact value and symmetry of the scalar product
+    assert np.allclose(x._data, z3._data)
+
     assert res1 == res_ex1
     assert res2 == res_ex2
 
diff --git a/psydac/polar/dense.py b/psydac/polar/dense.py
index edf7c908b..2e209f95e 100644
--- a/psydac/polar/dense.py
+++ b/psydac/polar/dense.py
@@ -137,6 +137,10 @@ def zeros(self):
         data = np.zeros(self.ncoeff, dtype=self.dtype)
         return DenseVector(self, data)
 
+    # ...
+    def axpy(self, a, x, y):
+        y += a * x
+
     #-------------------------------------
     # Other properties/methods
     #-------------------------------------