From 5431e0e14dbe5c1b53f35ccad4dfa15ad0b6dd67 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Wed, 20 Mar 2024 23:51:38 -0500
Subject: [PATCH 01/14] Improve docstring for kernel and device_func.

---
 numba_dpex/experimental/decorators.py | 212 +++++++++++++++++++++++---
 1 file changed, 187 insertions(+), 25 deletions(-)

diff --git a/numba_dpex/experimental/decorators.py b/numba_dpex/experimental/decorators.py
index 89ff2d7ff4..085361aa5d 100644
--- a/numba_dpex/experimental/decorators.py
+++ b/numba_dpex/experimental/decorators.py
@@ -43,16 +43,116 @@ def _parse_func_or_sig(signature_or_function):
     return pyfunc, sigs
 
 
-def kernel(func_or_sig=None, **options):
-    """A decorator to define a kernel function.
+def kernel(function_or_signature=None, **options):
+    """A decorator to compile a function written using :py:mod:`numba_dpex.kernel_api`.
+
+    The ``kernel`` decorator triggers the compilation of a function written
+    using the data-parallel kernel programming API exposed by
+    :py:mod:`numba_dpex.kernel_api`. Such a function is conceptually
+    equivalent to a kernel function written in the C++ SYCL eDSL. The
+    decorator will compile the function based on the types of the arguments
+    to a SPIR-V binary that can be executed either on OpenCL CPU, GPU
+    devices or Intel Level Zero GPU devices.
+
+    Any function to be compilable using the kernel decorator should
+    adhere to the following semantic rules:
+
+    - The first argument to the function should be either an instance of the
+      :class:`numba_dpex.kernel_api.Item` class or an instance of the
+      :class:`numba_dpex.kernel_api.NdItem`.
+
+    - The function should not return any value.
+
+    - The function should have at least one array type argument that can
+      either be an instance of ``dpnp.ndarray`` or an instance of
+      ``dpctl.tensor.usm_ndarray``.
+
+
+    Args:
+        signature_or_function (optional): An optional signature or list of
+            signatures for which a function is to be compiled. Passing in a
+            signature "specializes" the decorated function and no other versions
+            of the function will be compiled. A function can also be
+            directly passed instead of a signature and the signature will get
+            inferred from the function. The actual compilation happens on every
+            invocation of the :func:`numba_dpex.experimental.call_kernel`
+            function where the decorated function is passed in as an argument
+            along with the argument values for the decorated function.
+        options (optional):
+            - **debug** (bool): Whether the compilation should happen in debug
+              mode. *(Default = False)*
+            - **inline_threshold** (int): Specifies the level of inlining that
+              the compiler should attempt. *(Default = 2)*
+    Returns:
+        An instance of
+        :class:`numba_dpex.kernel_api_impl.spirv.dispatcher.KernelDispatcher`.
+        The ``KernelDispatcher`` object compiles the decorated function when
+        passed in to :func:`numba_dpex.experimental.call_kernel`.
+
+    Examples:
+
+    1. Decorate a function and pass it to ``call_kernel`` for compilation and
+       execution.
+
+    .. code-block:: python
+
+        import dpnp
+        import numba_dpex as dpex
+        from numba_dpex import kernel_api as kapi
+
+
+        # Data parallel kernel implementing vector sum
+        @dpex.kernel
+        def vecadd(item: kapi.Item, a, b, c):
+            i = item.get_id(0)
+            c[i] = a[i] + b[i]
+
+
+        N = 1024
+        a = dpnp.ones(N)
+        b = dpnp.ones_like(a)
+        c = dpnp.zeros_like(a)
+        dpex.call_kernel(vecadd, kapi.Range(N), a, b, c)
+
+    2. Specializes a kernel and then compiles it directly before executing it
+       via ``call_kernel``. The kernel is specialized to expect a 1-D
+       ``dpnp.ndarray`` with either ``float32`` type elements or ``int64`` type
+       elements.
+
+    .. code-block:: python
+
+        import dpnp
+        import numba_dpex as dpex
+        from numba_dpex import kernel_api as kapi
+        from numba_dpex import DpnpNdArray, float32, int64
+        from numba_dpex.core.types.kernel_api.index_space_ids import ItemType
+
+        i64arrty = DpnpNdArray(ndim=1, dtype=int64, layout="C")
+        f32arrty = DpnpNdArray(ndim=1, dtype=float32, layout="C")
+        item_ty = ItemType(ndim=1)
+
+        specialized_kernel = dpex.kernel(
+            [
+                (item_ty, i64arrty, i64arrty, i64arrty),
+                (item_ty, f32arrty, f32arrty, f32arrty),
+            ]
+        )
+
 
-    A kernel function is conceptually equivalent to a SYCL kernel function, and
-    gets compiled into either an OpenCL or a LevelZero SPIR-V binary kernel.
-    A kernel decorated Python function has the following restrictions:
+        def vecadd(item: kapi.Item, a, b, c):
+            i = item.get_id(0)
+            c[i] = a[i] + b[i]
+
+
+        # Compile all specializations for vecadd
+        precompiled_kernels = specialized_kernel(vecadd)
+        N = 1024
+        a = dpnp.ones(N, dtype=dpnp.int64)
+        b = dpnp.ones_like(a)
+        c = dpnp.zeros_like(a)
+        # Call a specific pre-compiled version of vecadd
+        dpex.call_kernel(precompiled_kernels, kapi.Range(N), a, b, c)
 
-        * The function can not return any value.
-        * All array arguments passed to a kernel should adhere to compute
-          follows data programming model.
     """
 
     # dispatcher is a type:
@@ -67,10 +167,10 @@ def kernel(func_or_sig=None, **options):
         )
     options["_compilation_mode"] = CompilationMode.KERNEL
 
-    # FIXME: The options need to be evaluated and checked here like it is
+    # TODO: The options need to be evaluated and checked here like it is
     # done in numba.core.decorators.jit
 
-    func, sigs = _parse_func_or_sig(func_or_sig)
+    func, sigs = _parse_func_or_sig(function_or_signature)
     for sig in sigs:
         if isinstance(sig, str):
             raise NotImplementedError(
@@ -104,22 +204,84 @@ def _kernel_dispatcher(pyfunc):
     return _kernel_dispatcher(func)
 
 
-def device_func(func_or_sig=None, **options):
-    """Generates a function with a device-only calling convention, e.g.,
-    spir_func for SPIR-V based devices.
+def device_func(function_or_signature=None, **options):
+    """Compiles a device-callable function that can be only invoked from a kernel.
+
+    The decorator is used to  express auxiliary device-only functions that can
+    be called from a kernel or another device function, but are not callable
+    from the host. This decorator :func:`numba_dpex.experimental.device_func`
+    has no direct analogue in SYCL and primarily is provided to help programmers
+    make their kapi applications modular.
+
+    A ``device_func`` decorated function does not require the first argument to
+    be a :class:`numba_dpex.kernel_api.Item` object or a
+    :class:`numba_dpex.kernel_api.NdItem` object, and unlike a ``kernel``
+    decorated function is allowed to return any value.
+    All :py:mod:`numba_dpex.kernel_api` functionality can be used in a
+    ``device_func`` decorated function.
+
+    The decorator is also used to compile overloads in the ``DpexKernelTarget``.
+
+    A ``device_func`` decorated function is not compiled down to device binary
+    and instead is compiled down to LLVM IR. Final compilation to binary happens
+    when the function is invoked from a ``kernel`` decorated function. The
+    compilation happens this was to allow a ``device_func`` decorated function
+    to be internally linked into the kernel module at the LLVM level, leading to
+    more optimization opportunities.
+
+    Args:
+        signature_or_function (optional): An optional signature or list of
+            signatures for which a function is to be compiled. Passing in a
+            signature "specializes" the decorated function and no other versions
+            of the function will be compiled. A function can also be
+            directly passed instead of a signature and the signature will get
+            inferred from the function. The actual compilation happens on every
+            invocation of the decorated function from another ``device_func`` or
+            ``kernel`` decorated function.
+        options (optional):
+            - **debug** (bool): Whether the compilation should happen in debug
+              mode. *(Default = False)*
+            - **inline_threshold** (int): Specifies the level of inlining that
+              the compiler should attempt. *(Default = 2)*
 
-    The decorator is used to compile overloads in the DpexKernelTarget and
-    users should use the decorator to define functions that are only callable
-    from inside another device_func or a kernel.
+    Returns:
+        An instance of
+        :class:`numba_dpex.kernel_api_impl.spirv.dispatcher.KernelDispatcher`.
+        The ``KernelDispatcher`` object compiles the decorated function when
+        it is called from another function.
 
-    A device_func is not compiled down to device binary IR and instead left as
-    LLVM IR. It is done so that the function can be inlined fully into the
-    kernel module from where it is used at the LLVM level, leading to more
-    optimization opportunities.
 
-    Returns:
-        KernelDispatcher: A KernelDispatcher instance with the
-        _compilation_mode option set to DEVICE_FUNC.
+    Example:
+
+    .. code-block:: python
+
+        import dpnp
+
+        from numba_dpex import experimental as dpex_exp
+        from numba_dpex import kernel_api as kapi
+
+
+        @dpex_exp.device_func
+        def increment_value(nd_item: NdItem, a):
+            i = nd_item.get_global_id(0)
+
+            a[i] += 1
+            group_barrier(nd_item.get_group(), MemoryScope.DEVICE)
+
+            if i == 0:
+                for idx in range(1, a.size):
+                    a[0] += a[idx]
+
+
+        @dpex_exp.kernel
+        def another_kernel(nd_item: NdItem, a):
+            increment_value(nd_item, a)
+
+
+        N = 16
+        b = dpnp.ones(N, dtype=dpnp.int32)
+
+        dpex_exp.call_kernel(another_kernel, dpex.NdRange((N,), (N,)), b)
     """
     dispatcher = resolve_dispatcher_from_str(DPEX_KERNEL_EXP_TARGET_NAME)
 
@@ -132,7 +294,7 @@ def device_func(func_or_sig=None, **options):
         )
     options["_compilation_mode"] = CompilationMode.DEVICE_FUNC
 
-    func, sigs = _parse_func_or_sig(func_or_sig)
+    func, sigs = _parse_func_or_sig(function_or_signature)
     for sig in sigs:
         if isinstance(sig, str):
             raise NotImplementedError(
@@ -156,7 +318,7 @@ def _kernel_dispatcher(pyfunc):
     if func is None:
         return _kernel_dispatcher
 
-    return _kernel_dispatcher(func_or_sig)
+    return _kernel_dispatcher(function_or_signature)
 
 
 jit_registry[target_registry[DPEX_KERNEL_EXP_TARGET_NAME]] = device_func

From 4c700e7db02b5bcaa20871ad5af2637be618cdc0 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 21 Mar 2024 14:55:06 -0500
Subject: [PATCH 02/14] Update kernel tests to use top-level module

---
 numba_dpex/__init__.py                        |  12 +-
 .../{experimental => core}/decorators.py      |  40 +++-
 .../launcher.py => core/kernel_launcher.py}   |   0
 numba_dpex/decorators.py                      | 171 ------------------
 numba_dpex/dpnp_iface/_intrinsic.py           |   2 +-
 numba_dpex/experimental/__init__.py           |   6 -
 .../tests/kernel_tests/test_async_kernel.py   |  12 +-
 .../tests/kernel_tests/test_atomic_fence.py   |   5 +-
 .../kernel_tests/test_atomic_fetch_phi.py     |  32 ++--
 .../test_atomic_load_store_cmp_exchg.py       |  19 +-
 .../tests/kernel_tests/test_atomic_ref.py     |  20 +-
 .../tests/kernel_tests/test_barriers.py       |  11 +-
 .../kernel_tests/test_compiler_warnings.py    |   4 +-
 .../test_complex_array_kernel_arg.py          |   2 +-
 .../kernel_tests/test_dpnp_ndarray_args.py    |   2 +-
 .../kernel_tests/test_dump_kernel_llvm.py     |   2 +-
 .../kernel_tests/test_exec_queue_inference.py |  10 +-
 numba_dpex/tests/kernel_tests/test_func.py    |   2 +-
 .../test_func_qualname_disambiguation.py      |   2 +-
 .../kernel_tests/test_func_specialization.py  |   2 +-
 .../kernel_tests/test_index_space_ids.py      |   3 +-
 .../test_inline_threshold_config.py           |   7 +-
 .../kernel_tests/test_invalid_kernel_args.py  |   2 +-
 .../kernel_tests/test_kernel_dispatcher.py    |  18 +-
 .../test_kernel_has_return_value_error.py     |   2 +-
 .../test_kernel_specialization.py             |  20 +-
 .../kernel_tests/test_local_accessors.py      |  11 +-
 .../tests/kernel_tests/test_math_functions.py |   2 +-
 .../test_memory_enum_compilation.py           |  14 +-
 numba_dpex/tests/kernel_tests/test_print.py   |   2 +-
 .../tests/kernel_tests/test_private_array.py  |  10 +-
 .../kernel_tests/test_scalar_arg_types.py     |   2 +-
 .../test_strided_dpnp_array_in_kernel.py      |  20 +-
 ...st_supported_array_types_as_kernel_args.py |   6 +-
 .../test_target_specific_overload.py          |   6 +-
 .../kernel_tests/test_usm_ndarray_args.py     |   2 +-
 36 files changed, 165 insertions(+), 318 deletions(-)
 rename numba_dpex/{experimental => core}/decorators.py (89%)
 rename numba_dpex/{experimental/launcher.py => core/kernel_launcher.py} (100%)
 delete mode 100644 numba_dpex/decorators.py

diff --git a/numba_dpex/__init__.py b/numba_dpex/__init__.py
index f77566dec4..999a0a6dd9 100644
--- a/numba_dpex/__init__.py
+++ b/numba_dpex/__init__.py
@@ -108,8 +108,9 @@ def parse_sem_version(version_string: str) -> Tuple[int, int, int]:
 # backward compatibility
 from numba_dpex.kernel_api import NdRange, Range  # noqa E402
 
+from .core.decorators import device_func, dpjit, kernel  # noqa E402
+from .core.kernel_launcher import call_kernel, call_kernel_async  # noqa E402
 from .core.targets import dpjit_target  # noqa E402
-from .decorators import dpjit, func, kernel  # noqa E402
 from .ocl.stubs import (  # noqa E402
     GLOBAL_MEM_FENCE,
     LOCAL_MEM_FENCE,
@@ -137,4 +138,11 @@ def parse_sem_version(version_string: str) -> Tuple[int, int, int]:
 __version__ = get_versions()["version"]
 del get_versions
 
-__all__ = types.__all__ + ["Range", "NdRange", "call_kernel"]
+__all__ = types.__all__ + [
+    "call_kernel",
+    "device_func",
+    "dpjit",
+    "kernel",
+    "Range",
+    "NdRange",
+]
diff --git a/numba_dpex/experimental/decorators.py b/numba_dpex/core/decorators.py
similarity index 89%
rename from numba_dpex/experimental/decorators.py
rename to numba_dpex/core/decorators.py
index 085361aa5d..3779526b9f 100644
--- a/numba_dpex/experimental/decorators.py
+++ b/numba_dpex/core/decorators.py
@@ -1,25 +1,24 @@
-# SPDX-FileCopyrightText: 2023 - 2024 Intel Corporation
+# SPDX-FileCopyrightText: 2020 - 2024 Intel Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
 
-""" The set of experimental decorators provided by numba_dpex that are not yet
-ready to move to numba_dpex.core.
-"""
 import inspect
 from warnings import warn
 
-from numba.core import sigutils, typeinfer
+from numba.core import decorators, sigutils, typeinfer
 from numba.core.target_extension import (
     jit_registry,
     resolve_dispatcher_from_str,
     target_registry,
 )
 
+from numba_dpex.core import config
+from numba_dpex.core.pipelines.dpjit_compiler import get_compiler
+from numba_dpex.core.targets.dpjit_target import DPEX_TARGET_NAME
+from numba_dpex.experimental.target import DPEX_KERNEL_EXP_TARGET_NAME
 from numba_dpex.kernel_api_impl.spirv.dispatcher import SPIRVKernelDispatcher
 from numba_dpex.kernel_api_impl.spirv.target import CompilationMode
 
-from .target import DPEX_KERNEL_EXP_TARGET_NAME
-
 
 def _parse_func_or_sig(signature_or_function):
     # Handle signature (borrowed from numba). swapped signature and list check
@@ -321,4 +320,31 @@ def _kernel_dispatcher(pyfunc):
     return _kernel_dispatcher(function_or_signature)
 
 
+# ----------------- Experimental dpjit decorator ------------------------------#
+
+
+def dpjit(*args, **kws):
+    if "nopython" in kws and kws["nopython"] is not True:
+        warn("nopython is set for dpjit and is ignored", RuntimeWarning)
+    if "forceobj" in kws:
+        warn("forceobj is set for dpjit and is ignored", RuntimeWarning)
+        del kws["forceobj"]
+    if "pipeline_class" in kws:
+        warn("pipeline class is set for dpjit and is ignored", RuntimeWarning)
+        del kws["pipeline_class"]
+
+    use_mlir = kws.pop("use_mlir", bool(config.USE_MLIR))
+
+    kws.update({"nopython": True})
+    kws.update({"parallel": True})
+    kws.update({"pipeline_class": get_compiler(use_mlir)})
+
+    kws.update({"_target": DPEX_TARGET_NAME})
+
+    return decorators.jit(*args, **kws)
+
+
+# add it to the decorator registry, this is so e.g. @overload can look up a
+# JIT function to do the compilation work.
+jit_registry[target_registry[DPEX_TARGET_NAME]] = dpjit
 jit_registry[target_registry[DPEX_KERNEL_EXP_TARGET_NAME]] = device_func
diff --git a/numba_dpex/experimental/launcher.py b/numba_dpex/core/kernel_launcher.py
similarity index 100%
rename from numba_dpex/experimental/launcher.py
rename to numba_dpex/core/kernel_launcher.py
diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py
deleted file mode 100644
index 97e1ed8028..0000000000
--- a/numba_dpex/decorators.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2024 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import inspect
-import warnings
-
-from numba.core import decorators, sigutils
-from numba.core.target_extension import jit_registry, target_registry
-
-from numba_dpex.core import config
-from numba_dpex.core.kernel_interface.dispatcher import JitKernel
-from numba_dpex.core.kernel_interface.func import (
-    compile_func,
-    compile_func_template,
-)
-from numba_dpex.core.pipelines.dpjit_compiler import get_compiler
-from numba_dpex.core.targets.dpjit_target import DPEX_TARGET_NAME
-
-
-def kernel(
-    func_or_sig=None,
-    debug=False,
-    enable_cache=True,
-):
-    """A decorator to define a kernel function.
-
-    A kernel function is conceptually equivalent to a SYCL kernel function, and
-    gets compiled into either an OpenCL or a LevelZero SPIR-V binary kernel.
-    A kernel decorated Python function has the following restrictions:
-
-        * The function can not return any value.
-        * All array arguments passed to a kernel should adhere to compute
-          follows data programming model.
-    """
-
-    def _kernel_dispatcher(pyfunc, sigs=None):
-        return JitKernel(
-            pyfunc=pyfunc,
-            debug_flags=debug,
-            enable_cache=enable_cache,
-            specialization_sigs=sigs,
-        )
-
-    if func_or_sig is None:
-        return _kernel_dispatcher
-    elif isinstance(func_or_sig, str):
-        raise NotImplementedError(
-            "Specifying signatures as string is not yet supported by numba-dpex"
-        )
-    elif isinstance(func_or_sig, list) or sigutils.is_signature(func_or_sig):
-        # String signatures are not supported as passing usm_ndarray type as
-        # a string is not possible. Numba's sigutils relies on the type being
-        # available in Numba's `types.__dict__` and dpex types are not registered
-        # there yet.
-        if isinstance(func_or_sig, list):
-            for sig in func_or_sig:
-                if isinstance(sig, str):
-                    raise NotImplementedError(
-                        "Specifying signatures as string is not yet supported "
-                        "by numba-dpex"
-                    )
-        # Specialized signatures can either be a single signature or a list.
-        # In case only one signature is provided convert it to a list
-        if not isinstance(func_or_sig, list):
-            func_or_sig = [func_or_sig]
-
-        def _specialized_kernel_dispatcher(pyfunc):
-            return JitKernel(
-                pyfunc=pyfunc,
-                debug_flags=debug,
-                enable_cache=enable_cache,
-                specialization_sigs=func_or_sig,
-            )
-
-        return _specialized_kernel_dispatcher
-    else:
-        func = func_or_sig
-        if not inspect.isfunction(func):
-            raise ValueError(
-                "Argument passed to the kernel decorator is neither a "
-                "function object, nor a signature. If you are trying to "
-                "specialize the kernel that takes a single argument, specify "
-                "the return type as void explicitly."
-            )
-        return _kernel_dispatcher(func)
-
-
-def func(func_or_sig=None, debug=False, enable_cache=True):
-    """A decorator to define a kernel device function.
-
-    Device functions are functions that can be only invoked from a kernel
-    and not from a host function. This provides a special decorator
-    `numba_dpex.func` specifically to implement a device function.
-
-    A device function can be invoked from another device function and
-    unlike a kernel function, a device function can return a value like
-    normal functions.
-    """
-
-    def _func_autojit(pyfunc):
-        return compile_func_template(
-            pyfunc, debug=debug, enable_cache=enable_cache
-        )
-
-    if func_or_sig is None:
-        return _func_autojit
-    elif isinstance(func_or_sig, str):
-        raise NotImplementedError(
-            "Specifying signatures as string is not yet supported by numba-dpex"
-        )
-    elif isinstance(func_or_sig, list) or sigutils.is_signature(func_or_sig):
-        # String signatures are not supported as passing usm_ndarray type as
-        # a string is not possible. Numba's sigutils relies on the type being
-        # available in Numba's types.__dict__ and dpex types are not registered
-        # there yet.
-        if isinstance(func_or_sig, list):
-            for sig in func_or_sig:
-                if isinstance(sig, str):
-                    raise NotImplementedError(
-                        "Specifying signatures as string is not yet supported "
-                        "by numba-dpex"
-                    )
-        # Specialized signatures can either be a single signature or a list.
-        # In case only one signature is provided convert it to a list
-        if not isinstance(func_or_sig, list):
-            func_or_sig = [func_or_sig]
-
-        def _wrapped(pyfunc):
-            return compile_func(pyfunc, func_or_sig, debug=debug)
-
-        return _wrapped
-    else:
-        # no signature
-        func = func_or_sig
-        return _func_autojit(func)
-
-
-# ----------------- Experimental dpjit decorator ------------------------------#
-
-
-def dpjit(*args, **kws):
-    if "nopython" in kws and kws["nopython"] is not True:
-        warnings.warn(
-            "nopython is set for dpjit and is ignored", RuntimeWarning
-        )
-    if "forceobj" in kws:
-        warnings.warn(
-            "forceobj is set for dpjit and is ignored", RuntimeWarning
-        )
-        del kws["forceobj"]
-    if "pipeline_class" in kws:
-        warnings.warn(
-            "pipeline class is set for dpjit and is ignored", RuntimeWarning
-        )
-        del kws["pipeline_class"]
-
-    use_mlir = kws.pop("use_mlir", bool(config.USE_MLIR))
-
-    kws.update({"nopython": True})
-    kws.update({"parallel": True})
-    kws.update({"pipeline_class": get_compiler(use_mlir)})
-
-    kws.update({"_target": DPEX_TARGET_NAME})
-
-    return decorators.jit(*args, **kws)
-
-
-# add it to the decorator registry, this is so e.g. @overload can look up a
-# JIT function to do the compilation work.
-jit_registry[target_registry[DPEX_TARGET_NAME]] = dpjit
diff --git a/numba_dpex/dpnp_iface/_intrinsic.py b/numba_dpex/dpnp_iface/_intrinsic.py
index d37402079b..0e402331f0 100644
--- a/numba_dpex/dpnp_iface/_intrinsic.py
+++ b/numba_dpex/dpnp_iface/_intrinsic.py
@@ -281,7 +281,7 @@ def _empty_nd_impl(context, builder, arrtype, shapes, queue_ref):
         types.uint64,
         types.voidptr,
     )
-    from numba_dpex.decorators import dpjit
+    from numba_dpex.core.decorators import dpjit
 
     op = dpjit(_call_usm_allocator)
     fnop = context.typing_context.resolve_value_type(op)
diff --git a/numba_dpex/experimental/__init__.py b/numba_dpex/experimental/__init__.py
index 97134ee042..c5682cb2ad 100644
--- a/numba_dpex/experimental/__init__.py
+++ b/numba_dpex/experimental/__init__.py
@@ -20,8 +20,6 @@
     _index_space_id_overloads,
     _private_array_overloads,
 )
-from .decorators import device_func, kernel
-from .launcher import call_kernel, call_kernel_async
 from .models import *
 from .types import KernelDispatcherType
 
@@ -38,9 +36,5 @@ def dpex_dispatcher_const(context):
 
 
 __all__ = [
-    "device_func",
-    "kernel",
-    "call_kernel",
-    "call_kernel_async",
     "SPIRVKernelDispatcher",
 ]
diff --git a/numba_dpex/tests/kernel_tests/test_async_kernel.py b/numba_dpex/tests/kernel_tests/test_async_kernel.py
index 58ec040e43..f3801ec537 100644
--- a/numba_dpex/tests/kernel_tests/test_async_kernel.py
+++ b/numba_dpex/tests/kernel_tests/test_async_kernel.py
@@ -7,12 +7,12 @@
 import pytest
 from numba.core.errors import TypingError
 
-import numba_dpex.experimental as exp_dpex
+import numba_dpex as dpex
 from numba_dpex.experimental import testing
 from numba_dpex.kernel_api import Item, Range
 
 
-@exp_dpex.kernel(
+@dpex.kernel(
     release_gil=False,
     no_compile=True,
     no_cpython_wrapper=True,
@@ -31,7 +31,7 @@ def test_async_add():
 
     r = Range(size)
 
-    host_ref, event_ref = exp_dpex.call_kernel_async(
+    host_ref, event_ref = dpex.call_kernel_async(
         add,
         r,
         (),
@@ -60,7 +60,7 @@ def test_async_dependent_add_list_exception():
     # TODO: should capture ValueError, but numba captures it and generates
     # TypingError. ValueError is still readable there.
     with pytest.raises(TypingError):
-        exp_dpex.call_kernel_async(
+        dpex.call_kernel_async(
             add,
             Range(size),
             [dpctl.SyclEvent()],
@@ -78,7 +78,7 @@ def test_async_dependent_add():
 
     r = Range(size)
 
-    host_ref, event_ref = exp_dpex.call_kernel_async(
+    host_ref, event_ref = dpex.call_kernel_async(
         add,
         r,
         (),
@@ -87,7 +87,7 @@ def test_async_dependent_add():
         c,
     )
 
-    host2_ref, event2_ref = exp_dpex.call_kernel_async(
+    host2_ref, event2_ref = dpex.call_kernel_async(
         add,
         r,
         (event_ref,),
diff --git a/numba_dpex/tests/kernel_tests/test_atomic_fence.py b/numba_dpex/tests/kernel_tests/test_atomic_fence.py
index dd9196028f..cea112379f 100644
--- a/numba_dpex/tests/kernel_tests/test_atomic_fence.py
+++ b/numba_dpex/tests/kernel_tests/test_atomic_fence.py
@@ -5,7 +5,6 @@
 import dpnp
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpex_exp
 from numba_dpex.kernel_api import (
     AtomicRef,
     Item,
@@ -18,7 +17,7 @@
 def test_atomic_fence():
     """A test for atomic_fence function."""
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item: Item, a, b):
         i = item.get_id(0)
 
@@ -39,6 +38,6 @@ def _kernel(item: Item, a, b):
     a = dpnp.ones(N, dtype=dpnp.int64)
     b = dpnp.zeros(1, dtype=dpnp.int64)
 
-    dpex_exp.call_kernel(_kernel, dpex.Range(N), a, b)
+    dpex.call_kernel(_kernel, dpex.Range(N), a, b)
 
     assert a[0] == N + 1
diff --git a/numba_dpex/tests/kernel_tests/test_atomic_fetch_phi.py b/numba_dpex/tests/kernel_tests/test_atomic_fetch_phi.py
index c239c8f91a..00dbc498b3 100644
--- a/numba_dpex/tests/kernel_tests/test_atomic_fetch_phi.py
+++ b/numba_dpex/tests/kernel_tests/test_atomic_fetch_phi.py
@@ -6,7 +6,7 @@
 import pytest
 from numba.core.errors import TypingError
 
-import numba_dpex.experimental as dpex_exp
+import numba_dpex as dpex
 from numba_dpex.kernel_api import AtomicRef, Item, Range
 from numba_dpex.tests._helper import get_all_dtypes
 
@@ -43,7 +43,7 @@ def input_arrays(request):
 def test_fetch_phi_fn(input_arrays, ref_index, fetch_phi_fn):
     """A test for all fetch_phi atomic functions."""
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item: Item, a, b, ref_index):
         i = item.get_id(0)
         v = AtomicRef(b, index=ref_index)
@@ -59,9 +59,9 @@ def _kernel(item: Item, a, b, ref_index):
         # fetch_and, fetch_or, fetch_xor accept only int arguments.
         # test for TypingError when float arguments are passed.
         with pytest.raises(TypingError):
-            dpex_exp.call_kernel(_kernel, Range(10), a, b, ref_index)
+            dpex.call_kernel(_kernel, Range(10), a, b, ref_index)
     else:
-        dpex_exp.call_kernel(_kernel, Range(10), a, b, ref_index)
+        dpex.call_kernel(_kernel, Range(10), a, b, ref_index)
         # Verify that `a` accumulated at b[ref_index] by kernel
         # matches the `a` accumulated at  b[ref_index+1] using Python
         for i in range(a.size):
@@ -74,7 +74,7 @@ def _kernel(item: Item, a, b, ref_index):
 def test_fetch_phi_retval(fetch_phi_fn):
     """A test for all fetch_phi atomic functions."""
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item: Item, a, b, c):
         i = item.get_id(0)
         v = AtomicRef(b, index=i)
@@ -88,7 +88,7 @@ def _kernel(item: Item, a, b, c):
     b_copy = dpnp.copy(b)
     c_copy = dpnp.copy(c)
 
-    dpex_exp.call_kernel(_kernel, Range(10), a, b, c)
+    dpex.call_kernel(_kernel, Range(10), a, b, c)
 
     # Verify if the value returned by fetch_phi kernel
     # stored into `c` is same as the value returned
@@ -106,7 +106,7 @@ def test_fetch_phi_diff_types(fetch_phi_fn):
     AtomicRef type and value to be added are of different types.
     """
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item: Item, a, b):
         i = item.get_id(0)
         v = AtomicRef(b, index=0)
@@ -117,17 +117,17 @@ def _kernel(item: Item, a, b):
     b = dpnp.zeros(N, dtype=dpnp.int32)
 
     with pytest.raises(TypingError):
-        dpex_exp.call_kernel(_kernel, Range(10), a, b)
+        dpex.call_kernel(_kernel, Range(10), a, b)
 
 
-@dpex_exp.kernel
+@dpex.kernel
 def atomic_ref_0(item: Item, a):
     i = item.get_id(0)
     v = AtomicRef(a, index=0)
     v.fetch_add(a[i + 2])
 
 
-@dpex_exp.kernel
+@dpex.kernel
 def atomic_ref_1(item: Item, a):
     i = item.get_id(0)
     v = AtomicRef(a, index=1)
@@ -143,14 +143,14 @@ def test_spirv_compiler_flags_add():
     N = 10
     a = dpnp.ones(N, dtype=dpnp.float32)
 
-    dpex_exp.call_kernel(atomic_ref_0, Range(N - 2), a)
-    dpex_exp.call_kernel(atomic_ref_1, Range(N - 2), a)
+    dpex.call_kernel(atomic_ref_0, Range(N - 2), a)
+    dpex.call_kernel(atomic_ref_1, Range(N - 2), a)
 
     assert a[0] == N - 1
     assert a[1] == N - 1
 
 
-@dpex_exp.kernel
+@dpex.kernel
 def atomic_max_0(item: Item, a):
     i = item.get_id(0)
     v = AtomicRef(a, index=0)
@@ -158,7 +158,7 @@ def atomic_max_0(item: Item, a):
         v.fetch_max(a[i])
 
 
-@dpex_exp.kernel
+@dpex.kernel
 def atomic_max_1(item: Item, a):
     i = item.get_id(0)
     v = AtomicRef(a, index=0)
@@ -176,8 +176,8 @@ def test_spirv_compiler_flags_max():
     a = dpnp.arange(N, dtype=dpnp.float32)
     b = dpnp.arange(N, dtype=dpnp.float32)
 
-    dpex_exp.call_kernel(atomic_max_0, Range(N), a)
-    dpex_exp.call_kernel(atomic_max_1, Range(N), b)
+    dpex.call_kernel(atomic_max_0, Range(N), a)
+    dpex.call_kernel(atomic_max_1, Range(N), b)
 
     assert a[0] == N - 1
     assert b[0] == N - 1
diff --git a/numba_dpex/tests/kernel_tests/test_atomic_load_store_cmp_exchg.py b/numba_dpex/tests/kernel_tests/test_atomic_load_store_cmp_exchg.py
index 9d34d99319..77618e3965 100644
--- a/numba_dpex/tests/kernel_tests/test_atomic_load_store_cmp_exchg.py
+++ b/numba_dpex/tests/kernel_tests/test_atomic_load_store_cmp_exchg.py
@@ -8,7 +8,6 @@
 from numba.core.errors import TypingError
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpex_exp
 from numba_dpex.kernel_api import AtomicRef
 from numba_dpex.tests._helper import get_all_dtypes
 
@@ -26,7 +25,7 @@ def store_exchange_fn(request):
 def test_load_store_fn(supported_dtype):
     """A test for load/store atomic functions."""
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item, a, b):
         i = item.get_id(0)
         a_ref = AtomicRef(a, index=i)
@@ -38,7 +37,7 @@ def _kernel(item, a, b):
     a = dpnp.zeros(2 * N, dtype=supported_dtype)
     b = dpnp.arange(N, dtype=supported_dtype)
 
-    dpex_exp.call_kernel(_kernel, dpex.Range(b.size), a, b)
+    dpex.call_kernel(_kernel, dpex.Range(b.size), a, b)
     # Verify that `b[i]` loaded and stored into a[i] by kernel
     # matches the `b[i]` loaded stored into a[i] using Python
     for i in range(b.size):
@@ -54,7 +53,7 @@ def _kernel(item, a, b):
 def test_exchange_fn(supported_dtype):
     """A test for exchange atomic function."""
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item, a, b):
         i = item.get_id(0)
         v = AtomicRef(a, index=i)
@@ -67,7 +66,7 @@ def _kernel(item, a, b):
     a_copy = dpnp.copy(a_orig)
     b_copy = dpnp.copy(b_orig)
 
-    dpex_exp.call_kernel(_kernel, dpex.Range(b_orig.size), a_copy, b_copy)
+    dpex.call_kernel(_kernel, dpex.Range(b_orig.size), a_copy, b_copy)
 
     # Values in `b` have been exchanged
     # with values in `a`.
@@ -82,7 +81,7 @@ def _kernel(item, a, b):
 def test_compare_exchange_fns(supported_dtype):
     """A test for compare exchange atomic functions."""
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(b):
         b_ref = AtomicRef(b, index=1)
         b[0] = b_ref.compare_exchange(
@@ -91,13 +90,13 @@ def _kernel(b):
 
     b = dpnp.arange(4, dtype=supported_dtype)
 
-    dpex_exp.call_kernel(_kernel, dpex.Range(1), b)
+    dpex.call_kernel(_kernel, dpex.Range(1), b)
 
     # check for failure
     assert b[0] == 0
     assert b[2] == b[1]
 
-    dpex_exp.call_kernel(_kernel, dpex.Range(1), b)
+    dpex.call_kernel(_kernel, dpex.Range(1), b)
 
     # check for success
     assert b[0] == 1
@@ -109,7 +108,7 @@ def test_store_exchange_diff_types(store_exchange_fn):
     AtomicRef type and value are of different types.
     """
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item, a, b):
         i = item.get_id(0)
         v = AtomicRef(b, index=0)
@@ -120,4 +119,4 @@ def _kernel(item, a, b):
     b = dpnp.zeros(N, dtype=dpnp.int32)
 
     with pytest.raises(TypingError):
-        dpex_exp.call_kernel(_kernel, dpex.Range(10), a, b)
+        dpex.call_kernel(_kernel, dpex.Range(10), a, b)
diff --git a/numba_dpex/tests/kernel_tests/test_atomic_ref.py b/numba_dpex/tests/kernel_tests/test_atomic_ref.py
index dc93bcf620..7632cc6411 100644
--- a/numba_dpex/tests/kernel_tests/test_atomic_ref.py
+++ b/numba_dpex/tests/kernel_tests/test_atomic_ref.py
@@ -7,7 +7,7 @@
 import pytest
 from numba.core.errors import TypingError
 
-import numba_dpex.experimental as dpex_exp
+import numba_dpex as dpex
 import numba_dpex.kernel_api as kapi
 
 
@@ -17,7 +17,7 @@ def ref_type_options(request):
 
 
 def test_atomic_ref_compilation():
-    @dpex_exp.kernel
+    @dpex.kernel
     def atomic_ref_kernel(item: kapi.Item, a, b):
         i = item.get_id(0)
         v = kapi.AtomicRef(b, index=0)
@@ -26,13 +26,13 @@ def atomic_ref_kernel(item: kapi.Item, a, b):
     a = dpnp.ones(10)
     b = dpnp.zeros(10)
     try:
-        dpex_exp.call_kernel(atomic_ref_kernel, kapi.Range(10), a, b)
+        dpex.call_kernel(atomic_ref_kernel, kapi.Range(10), a, b)
     except Exception:
         pytest.fail("Unexpected execution failure")
 
 
 def test_atomic_ref_3_dim_compilation():
-    @dpex_exp.kernel
+    @dpex.kernel
     def atomic_ref_kernel(item: kapi.Item, a, b):
         i = item.get_id(0)
         v = kapi.AtomicRef(b, index=(1, 1, 1))
@@ -45,7 +45,7 @@ def atomic_ref_kernel(item: kapi.Item, a, b):
     want[1, 1, 1] = a.size
 
     try:
-        dpex_exp.call_kernel(atomic_ref_kernel, kapi.Range(a.size), a, b)
+        dpex.call_kernel(atomic_ref_kernel, kapi.Range(a.size), a, b)
     except Exception:
         pytest.fail("Unexpected execution failure")
 
@@ -58,7 +58,7 @@ def test_atomic_ref_compilation_failure():
     ref.
     """
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def atomic_ref_kernel(item: kapi.Item, a, b):
         i = item.get_id(0)
         v = kapi.AtomicRef(b, index=0, address_space=kapi.AddressSpace.LOCAL)
@@ -68,13 +68,13 @@ def atomic_ref_kernel(item: kapi.Item, a, b):
     b = dpnp.zeros(10)
 
     with pytest.raises(TypingError):
-        dpex_exp.call_kernel(atomic_ref_kernel, kapi.Range(10), a, b)
+        dpex.call_kernel(atomic_ref_kernel, kapi.Range(10), a, b)
 
 
 def test_atomic_ref_compilation_local_accessor():
     """Tests if an AtomicRef object can be constructed from a LocalAccessor"""
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def atomic_ref_slm_kernel(nditem: kapi.Item, a, slm):
         gi = nditem.get_global_id(0)
         v = kapi.AtomicRef(slm, 0)
@@ -85,9 +85,7 @@ def atomic_ref_slm_kernel(nditem: kapi.Item, a, slm):
 
     a = dpnp.zeros(32)
     slm = kapi.LocalAccessor(1, a.dtype)
-    dpex_exp.call_kernel(
-        atomic_ref_slm_kernel, kapi.NdRange((32,), (32,)), a, slm
-    )
+    dpex.call_kernel(atomic_ref_slm_kernel, kapi.NdRange((32,), (32,)), a, slm)
     want = dpnp.full_like(a, 32 * a.dtype.type(5))
     assert np.allclose(a.asnumpy(), want.asnumpy())
 
diff --git a/numba_dpex/tests/kernel_tests/test_barriers.py b/numba_dpex/tests/kernel_tests/test_barriers.py
index f6c93fb692..7d81b2eb3e 100644
--- a/numba_dpex/tests/kernel_tests/test_barriers.py
+++ b/numba_dpex/tests/kernel_tests/test_barriers.py
@@ -5,14 +5,13 @@
 import dpnp
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpex_exp
 from numba_dpex.kernel_api import MemoryScope, NdItem, group_barrier
 
 
 def test_group_barrier():
     """A test for group_barrier function."""
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(nd_item: NdItem, a):
         i = nd_item.get_global_id(0)
 
@@ -26,7 +25,7 @@ def _kernel(nd_item: NdItem, a):
     N = 16
     a = dpnp.ones(N, dtype=dpnp.int32)
 
-    dpex_exp.call_kernel(_kernel, dpex.NdRange((N,), (N,)), a)
+    dpex.call_kernel(_kernel, dpex.NdRange((N,), (N,)), a)
 
     assert a[0] == N * 2
 
@@ -34,7 +33,7 @@ def _kernel(nd_item: NdItem, a):
 def test_group_barrier_device_func():
     """A test for group_barrier function."""
 
-    @dpex_exp.device_func
+    @dpex.device_func
     def _increment_value(nd_item: NdItem, a):
         i = nd_item.get_global_id(0)
 
@@ -45,13 +44,13 @@ def _increment_value(nd_item: NdItem, a):
             for idx in range(1, a.size):
                 a[0] += a[idx]
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(nd_item: NdItem, a):
         _increment_value(nd_item, a)
 
     N = 16
     a = dpnp.ones(N, dtype=dpnp.int32)
 
-    dpex_exp.call_kernel(_kernel, dpex.NdRange((N,), (N,)), a)
+    dpex.call_kernel(_kernel, dpex.NdRange((N,), (N,)), a)
 
     assert a[0] == N * 2
diff --git a/numba_dpex/tests/kernel_tests/test_compiler_warnings.py b/numba_dpex/tests/kernel_tests/test_compiler_warnings.py
index 67f26dd8d2..c051c39dea 100644
--- a/numba_dpex/tests/kernel_tests/test_compiler_warnings.py
+++ b/numba_dpex/tests/kernel_tests/test_compiler_warnings.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from numba_dpex import experimental as dpex_exp
+from numba_dpex import kernel
 from numba_dpex.kernel_api import Item
 
 
@@ -15,4 +15,4 @@ def _kernel(item: Item, a, b, c):
 
 def test_compilation_mode_option_user_definition():
     with pytest.warns(UserWarning):
-        dpex_exp.kernel(_compilation_mode="kernel")(_kernel)
+        kernel(_compilation_mode="kernel")(_kernel)
diff --git a/numba_dpex/tests/kernel_tests/test_complex_array_kernel_arg.py b/numba_dpex/tests/kernel_tests/test_complex_array_kernel_arg.py
index 0b110595fc..283b738005 100644
--- a/numba_dpex/tests/kernel_tests/test_complex_array_kernel_arg.py
+++ b/numba_dpex/tests/kernel_tests/test_complex_array_kernel_arg.py
@@ -6,7 +6,7 @@
 import numpy
 import pytest
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex.tests._helper import get_all_dtypes
 
 N = 1024
diff --git a/numba_dpex/tests/kernel_tests/test_dpnp_ndarray_args.py b/numba_dpex/tests/kernel_tests/test_dpnp_ndarray_args.py
index 0077f4dff3..2eac38c249 100644
--- a/numba_dpex/tests/kernel_tests/test_dpnp_ndarray_args.py
+++ b/numba_dpex/tests/kernel_tests/test_dpnp_ndarray_args.py
@@ -4,7 +4,7 @@
 
 import dpnp
 
-import numba_dpex.experimental as ndpx
+import numba_dpex as ndpx
 from numba_dpex import float32
 from numba_dpex import kernel_api as kapi
 
diff --git a/numba_dpex/tests/kernel_tests/test_dump_kernel_llvm.py b/numba_dpex/tests/kernel_tests/test_dump_kernel_llvm.py
index 666852eefc..01428187b6 100644
--- a/numba_dpex/tests/kernel_tests/test_dump_kernel_llvm.py
+++ b/numba_dpex/tests/kernel_tests/test_dump_kernel_llvm.py
@@ -9,7 +9,7 @@
 
 from numba.core import types
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex import float32, usm_ndarray
 from numba_dpex.core import config
 from numba_dpex.core.descriptor import dpex_kernel_target
diff --git a/numba_dpex/tests/kernel_tests/test_exec_queue_inference.py b/numba_dpex/tests/kernel_tests/test_exec_queue_inference.py
index b5df39c427..9b6334efd6 100644
--- a/numba_dpex/tests/kernel_tests/test_exec_queue_inference.py
+++ b/numba_dpex/tests/kernel_tests/test_exec_queue_inference.py
@@ -7,12 +7,12 @@
 import pytest
 from numba.core import config
 
-import numba_dpex.experimental as exp_dpex
+import numba_dpex as dpex
 from numba_dpex import Range
 from numba_dpex.core.exceptions import ExecutionQueueInferenceError
 
 
-@exp_dpex.kernel(
+@dpex.kernel(
     release_gil=False,
     no_compile=True,
     no_cpython_wrapper=True,
@@ -38,7 +38,7 @@ def test_successful_execution_queue_inference():
     config.CAPTURED_ERRORS = "new_style"
 
     try:
-        exp_dpex.call_kernel(add, r, a, b, c)
+        dpex.call_kernel(add, r, a, b, c)
     except:
         pytest.fail("Unexpected error when calling kernel")
 
@@ -65,7 +65,7 @@ def test_execution_queue_inference_error():
     config.CAPTURED_ERRORS = "new_style"
 
     with pytest.raises(ExecutionQueueInferenceError):
-        exp_dpex.call_kernel(add, r, a, b, c)
+        dpex.call_kernel(add, r, a, b, c)
 
     config.CAPTURED_ERRORS = current_captured_error_style
 
@@ -86,6 +86,6 @@ def test_error_when_no_array_args():
     config.CAPTURED_ERRORS = "new_style"
 
     with pytest.raises(ExecutionQueueInferenceError):
-        exp_dpex.call_kernel(add, r, a, b, c)
+        dpex.call_kernel(add, r, a, b, c)
 
     config.CAPTURED_ERRORS = current_captured_error_style
diff --git a/numba_dpex/tests/kernel_tests/test_func.py b/numba_dpex/tests/kernel_tests/test_func.py
index b6c17ba327..f984fd08dc 100644
--- a/numba_dpex/tests/kernel_tests/test_func.py
+++ b/numba_dpex/tests/kernel_tests/test_func.py
@@ -5,7 +5,7 @@
 import dpnp
 import numpy
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 
 
 @dpex.device_func
diff --git a/numba_dpex/tests/kernel_tests/test_func_qualname_disambiguation.py b/numba_dpex/tests/kernel_tests/test_func_qualname_disambiguation.py
index 5b18ab367a..8bfc1a2ab4 100644
--- a/numba_dpex/tests/kernel_tests/test_func_qualname_disambiguation.py
+++ b/numba_dpex/tests/kernel_tests/test_func_qualname_disambiguation.py
@@ -5,7 +5,7 @@
 import dpnp
 import numpy as np
 
-import numba_dpex.experimental as ndpx
+import numba_dpex as ndpx
 
 
 def make_write_values_kernel(n_rows):
diff --git a/numba_dpex/tests/kernel_tests/test_func_specialization.py b/numba_dpex/tests/kernel_tests/test_func_specialization.py
index 223c9c0521..47b7ca5f3f 100644
--- a/numba_dpex/tests/kernel_tests/test_func_specialization.py
+++ b/numba_dpex/tests/kernel_tests/test_func_specialization.py
@@ -6,7 +6,7 @@
 import numpy as np
 from numba import int32, int64
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 
 i32_signature = dpex.device_func(int32(int32))
 i32i64_signature = dpex.device_func([int32(int32), int64(int64)])
diff --git a/numba_dpex/tests/kernel_tests/test_index_space_ids.py b/numba_dpex/tests/kernel_tests/test_index_space_ids.py
index eae0133406..88999a98bc 100644
--- a/numba_dpex/tests/kernel_tests/test_index_space_ids.py
+++ b/numba_dpex/tests/kernel_tests/test_index_space_ids.py
@@ -9,7 +9,6 @@
 import pytest
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpex_exp
 from numba_dpex.kernel_api import Item, NdItem, NdRange
 from numba_dpex.kernel_api import call_kernel as kapi_call_kernel
 from numba_dpex.tests._helper import has_cpu
@@ -18,7 +17,7 @@
 _GROUP_SIZE = 20
 
 
-_kernel_decorators = [(dpex_exp.call_kernel, dpex_exp.kernel)]
+_kernel_decorators = [(dpex.call_kernel, dpex.kernel)]
 # run simulator tests only with arrays allocated on cpu to avoid performance
 # issues
 if has_cpu():
diff --git a/numba_dpex/tests/kernel_tests/test_inline_threshold_config.py b/numba_dpex/tests/kernel_tests/test_inline_threshold_config.py
index 75c0430171..d4695a1b61 100644
--- a/numba_dpex/tests/kernel_tests/test_inline_threshold_config.py
+++ b/numba_dpex/tests/kernel_tests/test_inline_threshold_config.py
@@ -5,7 +5,6 @@
 from numba.core import compiler
 
 import numba_dpex as dpex
-from numba_dpex import experimental as dpex_exp
 from numba_dpex.kernel_api import Item
 
 
@@ -18,7 +17,7 @@ def test_inline_threshold_set_using_config():
     oldConfig = dpex.config.INLINE_THRESHOLD
     dpex.config.INLINE_THRESHOLD = None
 
-    disp = dpex_exp.kernel(kernel_func)
+    disp = dpex.kernel(kernel_func)
     flags = compiler.Flags()
     disp.targetdescr.options.parse_as_flags(flags, disp.targetoptions)
 
@@ -39,7 +38,7 @@ def test_inline_threshold_set_using_decorator_option():
     Test setting the inline_threshold value using the kernel decorator flag
     """
 
-    disp = dpex_exp.kernel(inline_threshold=2)(kernel_func)
+    disp = dpex.kernel(inline_threshold=2)(kernel_func)
     flags = compiler.Flags()
     disp.targetdescr.options.parse_as_flags(flags, disp.targetoptions)
 
@@ -50,7 +49,7 @@ def test_inline_threshold_set_using_decorator_supersedes_config_option():
     oldConfig = dpex.config.INLINE_THRESHOLD
     dpex.config.INLINE_THRESHOLD = None
 
-    disp = dpex_exp.kernel(inline_threshold=3)(kernel_func)
+    disp = dpex.kernel(inline_threshold=3)(kernel_func)
     flags = compiler.Flags()
     disp.targetdescr.options.parse_as_flags(flags, disp.targetoptions)
 
diff --git a/numba_dpex/tests/kernel_tests/test_invalid_kernel_args.py b/numba_dpex/tests/kernel_tests/test_invalid_kernel_args.py
index a8143f6f88..45f5adeb42 100644
--- a/numba_dpex/tests/kernel_tests/test_invalid_kernel_args.py
+++ b/numba_dpex/tests/kernel_tests/test_invalid_kernel_args.py
@@ -5,7 +5,7 @@
 import numpy
 import pytest
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex import kernel_api as kapi
 
 N = 1024
diff --git a/numba_dpex/tests/kernel_tests/test_kernel_dispatcher.py b/numba_dpex/tests/kernel_tests/test_kernel_dispatcher.py
index 9c41bf9960..c0d86daf64 100644
--- a/numba_dpex/tests/kernel_tests/test_kernel_dispatcher.py
+++ b/numba_dpex/tests/kernel_tests/test_kernel_dispatcher.py
@@ -5,11 +5,11 @@
 import dpctl
 import dpnp
 
-import numba_dpex.experimental as exp_dpex
+import numba_dpex as dpex
 from numba_dpex import NdRange, Range, dpjit
 
 
-@exp_dpex.kernel(
+@dpex.kernel(
     release_gil=False,
     no_compile=True,
     no_cpython_wrapper=True,
@@ -19,7 +19,7 @@ def add(a, b, c):
     c[0] = b[0] + a[0]
 
 
-@exp_dpex.kernel(
+@dpex.kernel(
     release_gil=False,
     no_compile=True,
     no_cpython_wrapper=True,
@@ -42,11 +42,11 @@ def test_call_kernel_from_cpython():
     r = Range(100)
     ndr = NdRange(global_size=(100,), local_size=(1,))
 
-    exp_dpex.call_kernel(add, r, a, b, c)
+    dpex.call_kernel(add, r, a, b, c)
 
     assert c[0] == b[0] + a[0]
 
-    exp_dpex.call_kernel(add, ndr, a, b, c)
+    dpex.call_kernel(add, ndr, a, b, c)
 
     assert c[0] == b[0] + a[0]
 
@@ -60,7 +60,7 @@ def test_call_kernel_from_dpjit():
     @dpjit
     def range_kernel_caller(q, a, b, c):
         r = Range(100)
-        exp_dpex.call_kernel(add, r, a, b, c)
+        dpex.call_kernel(add, r, a, b, c)
         return c
 
     @dpjit
@@ -68,7 +68,7 @@ def ndrange_kernel_caller(q, a, b, c):
         gr = Range(100)
         lr = Range(1)
         ndr = NdRange(gr, lr)
-        exp_dpex.call_kernel(add, ndr, a, b, c)
+        dpex.call_kernel(add, ndr, a, b, c)
         return c
 
     q = dpctl.SyclQueue()
@@ -96,10 +96,10 @@ def test_call_multiple_kernels():
     c = dpnp.zeros_like(a, sycl_queue=q)
     r = Range(100)
 
-    exp_dpex.call_kernel(add, r, a, b, c)
+    dpex.call_kernel(add, r, a, b, c)
 
     assert c[0] == b[0] + a[0]
 
-    exp_dpex.call_kernel(sq, r, a, c)
+    dpex.call_kernel(sq, r, a, c)
 
     assert a[0] == c[0] * c[0]
diff --git a/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py b/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
index 739c82ec4a..36d5c3987b 100644
--- a/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
+++ b/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py
@@ -6,7 +6,7 @@
 import pytest
 from numba.core.errors import TypingError
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex import int32, usm_ndarray
 from numba_dpex.core.exceptions import KernelHasReturnValueError
 from numba_dpex.core.types.kernel_api.index_space_ids import ItemType
diff --git a/numba_dpex/tests/kernel_tests/test_kernel_specialization.py b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
index b52062eeae..2e68134ca8 100644
--- a/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
+++ b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py
@@ -7,7 +7,7 @@
 import pytest
 from numba.core.errors import TypingError
 
-import numba_dpex.experimental as dpex_exp
+import numba_dpex as dpex
 from numba_dpex import DpnpNdArray, float32, int64
 from numba_dpex.core.exceptions import InvalidKernelSpecializationError
 from numba_dpex.core.types.kernel_api.index_space_ids import ItemType
@@ -17,8 +17,8 @@
 f32arrty = DpnpNdArray(ndim=1, dtype=float32, layout="C")
 item_ty = ItemType(ndim=1)
 
-specialized_kernel1 = dpex_exp.kernel((item_ty, i64arrty, i64arrty, i64arrty))
-specialized_kernel2 = dpex_exp.kernel(
+specialized_kernel1 = dpex.kernel((item_ty, i64arrty, i64arrty, i64arrty))
+specialized_kernel2 = dpex.kernel(
     [
         (item_ty, i64arrty, i64arrty, i64arrty),
         (item_ty, f32arrty, f32arrty, f32arrty),
@@ -50,7 +50,7 @@ def test_invalid_specialization_error():
     """Test if an InvalidKernelSpecializationError is raised when attempting to
     specialize with NumPy arrays.
     """
-    specialized_kernel3 = dpex_exp.kernel(
+    specialized_kernel3 = dpex.kernel(
         (item_ty, int64[::1], int64[::1], int64[::1])
     )
     with pytest.raises(InvalidKernelSpecializationError):
@@ -68,9 +68,7 @@ def test_missing_specialization_error():
 
     with pytest.raises(TypingError):
         data_parallel_sum_specialized = specialized_kernel1(data_parallel_sum)
-        dpex_exp.call_kernel(
-            data_parallel_sum_specialized, Range(SIZE), a, b, c
-        )
+        dpex.call_kernel(data_parallel_sum_specialized, Range(SIZE), a, b, c)
 
 
 def test_execution_of_specialized_kernel():
@@ -83,7 +81,7 @@ def test_execution_of_specialized_kernel():
 
     data_parallel_sum_specialized = specialized_kernel1(data_parallel_sum)
 
-    dpex_exp.call_kernel(data_parallel_sum_specialized, Range(SIZE), a, b, c)
+    dpex.call_kernel(data_parallel_sum_specialized, Range(SIZE), a, b, c)
 
     npc = dpnp.asnumpy(c)
     import numpy as np
@@ -96,10 +94,10 @@ def test_string_specialization():
     """Test if NotImplementedError is raised when signature is a string"""
 
     with pytest.raises(NotImplementedError):
-        dpex_exp.kernel("(item_ty, i64arrty, i64arrty, i64arrty)")
+        dpex.kernel("(item_ty, i64arrty, i64arrty, i64arrty)")
 
     with pytest.raises(NotImplementedError):
-        dpex_exp.kernel(
+        dpex.kernel(
             [
                 "(item_ty, i64arrty, i64arrty, i64arrty)",
                 "(item_ty, f32arrty, f32arrty, f32arrty)",
@@ -107,4 +105,4 @@ def test_string_specialization():
         )
 
     with pytest.raises(ValueError):
-        dpex_exp.kernel((i64arrty))
+        dpex.kernel((i64arrty))
diff --git a/numba_dpex/tests/kernel_tests/test_local_accessors.py b/numba_dpex/tests/kernel_tests/test_local_accessors.py
index d8ae378908..05fe1eedfa 100644
--- a/numba_dpex/tests/kernel_tests/test_local_accessors.py
+++ b/numba_dpex/tests/kernel_tests/test_local_accessors.py
@@ -8,7 +8,6 @@
 from numba.core.errors import TypingError
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpex_exp
 from numba_dpex.kernel_api import LocalAccessor, NdItem
 from numba_dpex.kernel_api import call_kernel as kapi_call_kernel
 from numba_dpex.tests._helper import get_all_dtypes
@@ -65,9 +64,9 @@ def _kernel3(nd_item: NdItem, a, slm):
 
 
 def device_func_kernel(func):
-    _df = dpex_exp.device_func(func)
+    _df = dpex.device_func(func)
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item, a, slm):
         _df(item, a, slm)
 
@@ -86,8 +85,8 @@ def _kernel(item, a, slm):
 @pytest.mark.parametrize(
     "call_kernel, kernel",
     [
-        (dpex_exp.call_kernel, dpex_exp.kernel),
-        (dpex_exp.call_kernel, device_func_kernel),
+        (dpex.call_kernel, dpex.kernel),
+        (dpex.call_kernel, device_func_kernel),
         (kapi_call_kernel, lambda f: f),
     ],
 )
@@ -123,4 +122,4 @@ def test_local_accessor_argument_to_range_kernel():
     # A TypeError is raised if NUMBA_CAPTURED_ERROR=new_style and a
     # numba.TypingError is raised if NUMBA_CAPTURED_ERROR=old_style
     with pytest.raises((TypeError, TypingError)):
-        dpex_exp.call_kernel(_kernel1, dpex.Range(N), a, slm)
+        dpex.call_kernel(_kernel1, dpex.Range(N), a, slm)
diff --git a/numba_dpex/tests/kernel_tests/test_math_functions.py b/numba_dpex/tests/kernel_tests/test_math_functions.py
index e090dc7989..1c37c99581 100644
--- a/numba_dpex/tests/kernel_tests/test_math_functions.py
+++ b/numba_dpex/tests/kernel_tests/test_math_functions.py
@@ -8,7 +8,7 @@
 import numpy
 import pytest
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex.tests._helper import get_all_dtypes
 
 list_of_unary_ops = ["fabs", "exp", "log", "sqrt", "sin", "cos", "tan"]
diff --git a/numba_dpex/tests/kernel_tests/test_memory_enum_compilation.py b/numba_dpex/tests/kernel_tests/test_memory_enum_compilation.py
index 991e256784..a0c1e01214 100644
--- a/numba_dpex/tests/kernel_tests/test_memory_enum_compilation.py
+++ b/numba_dpex/tests/kernel_tests/test_memory_enum_compilation.py
@@ -4,7 +4,7 @@
 
 import dpnp
 
-import numba_dpex.experimental as exp_dpex
+import numba_dpex as dpex
 from numba_dpex import Range
 from numba_dpex.kernel_api import AddressSpace, MemoryOrder, MemoryScope
 
@@ -12,7 +12,7 @@
 def test_compilation_of_memory_order():
     """Tests if a MemoryOrder flags can be used inside a kernel function."""
 
-    @exp_dpex.kernel
+    @dpex.kernel
     def store_memory_order_flag(a):
         a[0] = MemoryOrder.RELAXED
         a[1] = MemoryOrder.CONSUME_UNSUPPORTED
@@ -22,7 +22,7 @@ def store_memory_order_flag(a):
         a[5] = MemoryOrder.SEQ_CST
 
     a = dpnp.ones(10, dtype=dpnp.int64)
-    exp_dpex.call_kernel(store_memory_order_flag, Range(10), a)
+    dpex.call_kernel(store_memory_order_flag, Range(10), a)
 
     assert a[0] == MemoryOrder.RELAXED
     assert a[1] == MemoryOrder.CONSUME_UNSUPPORTED
@@ -35,7 +35,7 @@ def store_memory_order_flag(a):
 def test_compilation_of_memory_scope():
     """Tests if a MemoryScope flags can be used inside a kernel function."""
 
-    @exp_dpex.kernel
+    @dpex.kernel
     def store_memory_scope_flag(a):
         a[0] = MemoryScope.DEVICE
         a[1] = MemoryScope.SUB_GROUP
@@ -44,7 +44,7 @@ def store_memory_scope_flag(a):
         a[4] = MemoryScope.WORK_ITEM
 
     a = dpnp.ones(10, dtype=dpnp.int64)
-    exp_dpex.call_kernel(store_memory_scope_flag, Range(10), a)
+    dpex.call_kernel(store_memory_scope_flag, Range(10), a)
 
     assert a[0] == MemoryScope.DEVICE
     assert a[1] == MemoryScope.SUB_GROUP
@@ -56,7 +56,7 @@ def store_memory_scope_flag(a):
 def test_compilation_of_address_space():
     """Tests if a AddressSpace flags can be used inside a kernel function."""
 
-    @exp_dpex.kernel
+    @dpex.kernel
     def store_address_space_flag(a):
         a[0] = AddressSpace.CONSTANT
         a[1] = AddressSpace.GENERIC
@@ -65,7 +65,7 @@ def store_address_space_flag(a):
         a[4] = AddressSpace.PRIVATE
 
     a = dpnp.ones(10, dtype=dpnp.int64)
-    exp_dpex.call_kernel(store_address_space_flag, Range(10), a)
+    dpex.call_kernel(store_address_space_flag, Range(10), a)
 
     assert a[0] == AddressSpace.CONSTANT
     assert a[1] == AddressSpace.GENERIC
diff --git a/numba_dpex/tests/kernel_tests/test_print.py b/numba_dpex/tests/kernel_tests/test_print.py
index 5e951d091a..88455e29d2 100644
--- a/numba_dpex/tests/kernel_tests/test_print.py
+++ b/numba_dpex/tests/kernel_tests/test_print.py
@@ -7,7 +7,7 @@
 import pytest
 from numba.core.errors import TypingError
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 
 list_of_dtypes = [
     dpnp.int32,
diff --git a/numba_dpex/tests/kernel_tests/test_private_array.py b/numba_dpex/tests/kernel_tests/test_private_array.py
index 44370f30da..487c4da3d0 100644
--- a/numba_dpex/tests/kernel_tests/test_private_array.py
+++ b/numba_dpex/tests/kernel_tests/test_private_array.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pytest
 
-import numba_dpex.experimental as dpex_exp
+import numba_dpex as dpex
 from numba_dpex.kernel_api import Item, PrivateArray, Range
 from numba_dpex.kernel_api import call_kernel as kapi_call_kernel
 
@@ -70,7 +70,7 @@ def private_2d_array_kernel(item: Item, a):
 )
 @pytest.mark.parametrize(
     "call_kernel, decorator",
-    [(dpex_exp.call_kernel, dpex_exp.kernel), (kapi_call_kernel, lambda a: a)],
+    [(dpex.call_kernel, dpex.kernel), (kapi_call_kernel, lambda a: a)],
 )
 def test_private_array(call_kernel, decorator, kernel):
     kernel = decorator(kernel)
@@ -95,14 +95,14 @@ def test_private_array(call_kernel, decorator, kernel):
 )
 def test_private_array_in_device_func(func):
 
-    _df = dpex_exp.device_func(func)
+    _df = dpex.device_func(func)
 
-    @dpex_exp.kernel
+    @dpex.kernel
     def _kernel(item: Item, a):
         _df(item, a)
 
     a = dpnp.empty(10, dtype=dpnp.float32)
-    dpex_exp.call_kernel(_kernel, Range(a.size), a)
+    dpex.call_kernel(_kernel, Range(a.size), a)
 
     # sum of squares from 1 to n: n*(n+1)*(2*n+1)/6
     want = np.full(a.size, (9) * (9 + 1) * (2 * 9 + 1) / 6, dtype=np.float32)
diff --git a/numba_dpex/tests/kernel_tests/test_scalar_arg_types.py b/numba_dpex/tests/kernel_tests/test_scalar_arg_types.py
index ba4965bfd4..d593c7a0ac 100644
--- a/numba_dpex/tests/kernel_tests/test_scalar_arg_types.py
+++ b/numba_dpex/tests/kernel_tests/test_scalar_arg_types.py
@@ -6,7 +6,7 @@
 import numpy
 import pytest
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex.tests._helper import get_all_dtypes
 
 N = 1024
diff --git a/numba_dpex/tests/kernel_tests/test_strided_dpnp_array_in_kernel.py b/numba_dpex/tests/kernel_tests/test_strided_dpnp_array_in_kernel.py
index 90e48356ba..09a9bf5429 100644
--- a/numba_dpex/tests/kernel_tests/test_strided_dpnp_array_in_kernel.py
+++ b/numba_dpex/tests/kernel_tests/test_strided_dpnp_array_in_kernel.py
@@ -8,7 +8,7 @@
 import numpy as np
 import pytest
 
-import numba_dpex.experimental as exp_dpex
+import numba_dpex as dpex
 from numba_dpex.kernel_api import Item, Range
 
 
@@ -34,7 +34,7 @@ def get_order(a):
         raise Exception("Unknown order/layout")
 
 
-@exp_dpex.kernel
+@dpex.kernel
 def change_values_1d(item: Item, x, v):
     """Assign values in a 1d dpnp.ndarray
 
@@ -57,7 +57,7 @@ def change_values_1d_func(a, p):
         a[i] = p
 
 
-@exp_dpex.kernel
+@dpex.kernel
 def change_values_2d(item: Item, x, v):
     """Assign values in a 2d dpnp.ndarray
 
@@ -82,7 +82,7 @@ def change_values_2d_func(a, p):
             a[i, j] = p
 
 
-@exp_dpex.kernel
+@dpex.kernel
 def change_values_3d(item: Item, x, v):
     """Assign values in a 3d dpnp.ndarray
 
@@ -122,7 +122,7 @@ def test_1d_strided_dpnp_array_in_kernel(s):
     u = dpnp.asarray(t)
 
     v = u[::s]
-    exp_dpex.call_kernel(change_values_1d, Range(v.shape[0]), v, k)
+    dpex.call_kernel(change_values_1d, Range(v.shape[0]), v, k)
 
     x = t[::s]
     change_values_1d_func(x, k)
@@ -148,7 +148,7 @@ def test_multievel_1d_strided_dpnp_array_in_kernel(s):
     v, x = u, t
     while v.shape[0] > 1:
         v = v[::s]
-        exp_dpex.call_kernel(change_values_1d, Range(v.shape[0]), v, k)
+        dpex.call_kernel(change_values_1d, Range(v.shape[0]), v, k)
 
         x = x[::s]
         change_values_1d_func(x, k)
@@ -177,7 +177,7 @@ def test_2d_strided_dpnp_array_in_kernel(s1, s2, order):
     assert get_order(u) == order
 
     v = u[::s1, ::s2]
-    exp_dpex.call_kernel(change_values_2d, Range(*v.shape), v, k)
+    dpex.call_kernel(change_values_2d, Range(*v.shape), v, k)
 
     x = t[::s1, ::s2]
     change_values_2d_func(x, k)
@@ -208,7 +208,7 @@ def test_multilevel_2d_strided_dpnp_array_in_kernel(s1, s2, order):
     v, x = u, t
     while v.shape[0] > 1 and v.shape[1] > 1:
         v = v[::s1, ::s2]
-        exp_dpex.call_kernel(change_values_2d, Range(*v.shape), v, k)
+        dpex.call_kernel(change_values_2d, Range(*v.shape), v, k)
 
         x = x[::s1, ::s2]
         change_values_2d_func(x, k)
@@ -238,7 +238,7 @@ def test_3d_strided_dpnp_array_in_kernel(s1, s2, s3, order):
     assert get_order(u) == order
 
     v = u[::s1, ::s2, ::s3]
-    exp_dpex.call_kernel(change_values_3d, Range(*v.shape), v, k)
+    dpex.call_kernel(change_values_3d, Range(*v.shape), v, k)
 
     x = t[::s1, ::s2, ::s3]
     change_values_3d_func(x, k)
@@ -270,7 +270,7 @@ def test_multilevel_3d_strided_dpnp_array_in_kernel(s1, s2, s3, order):
     v, x = u, t
     while v.shape[0] > 1 and v.shape[1] > 1 and v.shape[2] > 1:
         v = v[::s1, ::s2, ::s3]
-        exp_dpex.call_kernel(change_values_3d, Range(*v.shape), v, k)
+        dpex.call_kernel(change_values_3d, Range(*v.shape), v, k)
 
         x = x[::s1, ::s2, ::s3]
         change_values_3d_func(x, k)
diff --git a/numba_dpex/tests/kernel_tests/test_supported_array_types_as_kernel_args.py b/numba_dpex/tests/kernel_tests/test_supported_array_types_as_kernel_args.py
index 3c4c1be6d2..647ee9da67 100644
--- a/numba_dpex/tests/kernel_tests/test_supported_array_types_as_kernel_args.py
+++ b/numba_dpex/tests/kernel_tests/test_supported_array_types_as_kernel_args.py
@@ -8,7 +8,7 @@
 import dpnp
 import pytest
 
-import numba_dpex.experimental as dpex_exp
+import numba_dpex as dpex
 from numba_dpex.kernel_api import Item, Range
 from numba_dpex.tests._helper import get_all_dtypes
 
@@ -27,13 +27,13 @@ def input_array(request):
     return zeros(_SIZE, dtype=dtype)
 
 
-@dpex_exp.kernel
+@dpex.kernel
 def set_ones(item: Item, a):
     i = item.get_id(0)
     a[i] = 1
 
 
 def test_fetch_add(input_array):
-    dpex_exp.call_kernel(set_ones, Range(_SIZE), input_array)
+    dpex.call_kernel(set_ones, Range(_SIZE), input_array)
 
     assert input_array[0] == 1
diff --git a/numba_dpex/tests/kernel_tests/test_target_specific_overload.py b/numba_dpex/tests/kernel_tests/test_target_specific_overload.py
index 3dddeadc4b..be34a122b4 100644
--- a/numba_dpex/tests/kernel_tests/test_target_specific_overload.py
+++ b/numba_dpex/tests/kernel_tests/test_target_specific_overload.py
@@ -5,7 +5,7 @@
 import dpnp
 from numba.core.extending import overload
 
-import numba_dpex.experimental as dpex_exp
+import numba_dpex as dpex
 from numba_dpex.core.descriptor import dpex_kernel_target
 from numba_dpex.experimental.target import (
     DPEX_KERNEL_EXP_TARGET_NAME,
@@ -26,7 +26,7 @@ def ol_scalar_add_impl(a, b):
     return ol_scalar_add_impl
 
 
-@dpex_exp.kernel
+@dpex.kernel
 def kernel_calling_overload(item: Item, a, b, c):
     i = item.get_id(0)
     c[i] = scalar_add(a[i], b[i])
@@ -36,7 +36,7 @@ def kernel_calling_overload(item: Item, a, b, c):
 b = dpnp.ones(10, dtype=dpnp.int64)
 c = dpnp.zeros(10, dtype=dpnp.int64)
 
-dpex_exp.call_kernel(kernel_calling_overload, Range(10), a, b, c)
+dpex.call_kernel(kernel_calling_overload, Range(10), a, b, c)
 
 
 def test_end_to_end_overload_execution():
diff --git a/numba_dpex/tests/kernel_tests/test_usm_ndarray_args.py b/numba_dpex/tests/kernel_tests/test_usm_ndarray_args.py
index fe1edb3f06..dfeed5955b 100644
--- a/numba_dpex/tests/kernel_tests/test_usm_ndarray_args.py
+++ b/numba_dpex/tests/kernel_tests/test_usm_ndarray_args.py
@@ -6,7 +6,7 @@
 import numpy
 import pytest
 
-import numba_dpex.experimental as dpex
+import numba_dpex as dpex
 from numba_dpex.tests._helper import get_all_dtypes
 
 list_of_dtype = get_all_dtypes(

From 479b7c79dc2abe03a76d9dc71cd6ca0c229ac832 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 21 Mar 2024 14:56:14 -0500
Subject: [PATCH 03/14] Remove old tests cases.

---
 .../test_sycl_usm_array_iface_interop.py      | 115 ------------------
 .../kernel_tests/test_usm_ndarray_interop.py  |  61 ----------
 2 files changed, 176 deletions(-)
 delete mode 100644 numba_dpex/tests/kernel_tests/test_sycl_usm_array_iface_interop.py
 delete mode 100644 numba_dpex/tests/kernel_tests/test_usm_ndarray_interop.py

diff --git a/numba_dpex/tests/kernel_tests/test_sycl_usm_array_iface_interop.py b/numba_dpex/tests/kernel_tests/test_sycl_usm_array_iface_interop.py
deleted file mode 100644
index 908350858d..0000000000
--- a/numba_dpex/tests/kernel_tests/test_sycl_usm_array_iface_interop.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2024 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import numpy as np
-import pytest
-from dpctl import tensor as dpt
-
-import numba_dpex as dpex
-from numba_dpex.tests._helper import get_all_dtypes
-
-
-class DuckUSMArray:
-    """A Python class that defines a __sycl_usm_array_interface__ attribute."""
-
-    def __init__(self, shape, dtype="d", host_buffer=None):
-        _tensor = dpt.empty(shape, dtype=dtype, usm_type="shared")
-        shmem = _tensor.usm_data
-        if isinstance(host_buffer, np.ndarray):
-            shmem.copy_from_host(host_buffer.view(dtype="|u1"))
-        self.arr = np.ndarray(shape, dtype=dtype, buffer=shmem)
-
-    def __getitem__(self, indx):
-        return self.arr[indx]
-
-    def __setitem__(self, indx, val):
-        self.arr.__setitem__(indx, val)
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        iface = self.arr.__array_interface__
-        b = self.arr.base
-        iface["syclobj"] = b.__sycl_usm_array_interface__["syclobj"]
-        iface["version"] = 1
-        return iface
-
-
-class PseudoDuckUSMArray:
-    """A Python class that defines an attributed called
-    __sycl_usm_array_interface__, but is not actually backed by USM memory.
-
-    """
-
-    def __init__(self):
-        pass
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        iface = {}
-        iface["syclobj"] = None
-        iface["version"] = 0
-        return iface
-
-
-@dpex.kernel
-def vecadd(a, b, c):
-    i = dpex.get_global_id(0)
-    c[i] = a[i] + b[i]
-
-
-dtypes = get_all_dtypes(
-    no_bool=True, no_float16=True, no_none=True, no_complex=True
-)
-
-
-@pytest.fixture(params=dtypes)
-def dtype(request):
-    return request.param
-
-
-def test_kernel_valid_usm_obj(dtype):
-    """Test if a ``numba_dpex.kernel`` function accepts a DuckUSMArray argument.
-
-    The ``DuckUSMArray`` uses ``dpctl.memory`` to allocate a Python object that
-    defines a ``__sycl_usm_array_interface__`` attribute. We test if
-    ``numba_dpex`` recognizes the ``DuckUSMArray`` as a valid USM-backed Python
-    object and accepts it as a kernel argument.
-
-    """
-    N = 1024
-
-    buffA = np.arange(0, N, dtype=dtype)
-    buffB = np.arange(0, N, dtype=dtype)
-    buffC = np.zeros(N, dtype=dtype)
-
-    A = DuckUSMArray(shape=buffA.shape, dtype=dtype, host_buffer=buffA)
-    B = DuckUSMArray(shape=buffB.shape, dtype=dtype, host_buffer=buffB)
-    C = DuckUSMArray(shape=buffC.shape, dtype=dtype, host_buffer=buffC)
-
-    try:
-        dpex.call_kernel(vecadd, dpex.Range(N), A, B, C)
-    except Exception:
-        pytest.fail(
-            "Could not pass Python object with sycl_usm_array_interface"
-            + " to a kernel."
-        )
-
-
-def test_kernel_invalid_usm_obj(dtype):
-    """Test if a ``numba_dpex.kernel`` function rejects a PseudoDuckUSMArray
-    argument.
-
-    The ``PseudoDuckUSMArray`` defines a fake attribute called
-    __sycl_usm_array__interface__. We test if
-    ``numba_dpex`` correctly recognizes and rejects the ``PseudoDuckUSMArray``.
-
-    """
-    N = 1024
-
-    A = PseudoDuckUSMArray()
-    B = PseudoDuckUSMArray()
-    C = PseudoDuckUSMArray()
-
-    with pytest.raises(Exception):
-        dpex.call_kernel(vecadd, dpex.Range(N), A, B, C)
diff --git a/numba_dpex/tests/kernel_tests/test_usm_ndarray_interop.py b/numba_dpex/tests/kernel_tests/test_usm_ndarray_interop.py
deleted file mode 100644
index e6863e47e5..0000000000
--- a/numba_dpex/tests/kernel_tests/test_usm_ndarray_interop.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2024 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import dpctl.tensor as dpt
-import numpy
-import pytest
-
-import numba_dpex as dpex
-from numba_dpex.tests._helper import get_all_dtypes
-
-list_of_dtype = get_all_dtypes(
-    no_bool=True, no_float16=True, no_none=True, no_complex=True
-)
-
-
-@pytest.fixture(params=list_of_dtype)
-def dtype(request):
-    return request.param
-
-
-list_of_usm_type = [
-    "shared",
-    "device",
-    "host",
-]
-
-
-@pytest.fixture(params=list_of_usm_type)
-def usm_type(request):
-    return request.param
-
-
-def test_consuming_usm_ndarray(dtype, usm_type):
-    @dpex.kernel
-    def data_parallel_sum(a, b, c):
-        """
-        Vector addition using the ``kernel`` decorator.
-        """
-        i = dpex.get_global_id(0)
-        j = dpex.get_global_id(1)
-        c[i, j] = a[i, j] + b[i, j]
-
-    N = 1000
-    global_size = N * N
-
-    a = dpt.arange(global_size, dtype=dtype, usm_type=usm_type)
-    a = dpt.reshape(a, shape=(N, N))
-
-    b = dpt.arange(global_size, dtype=dtype, usm_type=usm_type)
-    b = dpt.reshape(b, shape=(N, N))
-
-    c = dpt.empty_like(a)
-
-    dpex.call_kernel(data_parallel_sum, dpex.Range(N, N), a, b, c)
-
-    na = dpt.asnumpy(a)
-    nb = dpt.asnumpy(b)
-    nc = dpt.asnumpy(c)
-
-    assert numpy.array_equal(nc, na + nb)

From 630cf99548469e75616fe22d69c193952a996591 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 21 Mar 2024 17:15:18 -0500
Subject: [PATCH 04/14] Remove use of experimental module.

---
 .../tests/core/types/IntEnumLiteral/test_compilation.py     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/numba_dpex/tests/core/types/IntEnumLiteral/test_compilation.py b/numba_dpex/tests/core/types/IntEnumLiteral/test_compilation.py
index 9362ba20ff..8fb3509a31 100644
--- a/numba_dpex/tests/core/types/IntEnumLiteral/test_compilation.py
+++ b/numba_dpex/tests/core/types/IntEnumLiteral/test_compilation.py
@@ -4,7 +4,7 @@
 
 import dpnp
 
-import numba_dpex.experimental as exp_dpex
+import numba_dpex as dpex
 from numba_dpex import Range
 from numba_dpex.kernel_api.flag_enum import FlagEnum
 
@@ -14,7 +14,7 @@ class MockFlags(FlagEnum):
     FLAG2 = 200
 
 
-@exp_dpex.kernel(
+@dpex.kernel(
     release_gil=False,
     no_compile=True,
     no_cpython_wrapper=True,
@@ -28,7 +28,7 @@ def update_with_flag(a):
 def test_compilation_of_flag_enum():
     """Tests if a FlagEnum subclass can be used inside a kernel function."""
     a = dpnp.ones(10, dtype=dpnp.int64)
-    exp_dpex.call_kernel(update_with_flag, Range(10), a)
+    dpex.call_kernel(update_with_flag, Range(10), a)
 
     assert a[0] == MockFlags.FLAG1
     assert a[1] == MockFlags.FLAG2

From 1a6754b3e7b3dcb2de758a08cb85c4fb63ff717f Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 21 Mar 2024 18:17:20 -0500
Subject: [PATCH 05/14] Fix codegen tests

---
 numba_dpex/tests/{experimental => codegen}/__init__.py   | 0
 .../codegen/test_inline_threshold_codegen.py             | 9 ++++-----
 .../codegen/test_intenum_literal_codegen.py              | 6 +++---
 .../codegen/test_local_accessor_kernel_arg.py            | 8 +++-----
 numba_dpex/tests/experimental/codegen/__init__.py        | 3 ---
 5 files changed, 10 insertions(+), 16 deletions(-)
 rename numba_dpex/tests/{experimental => codegen}/__init__.py (100%)
 rename numba_dpex/tests/{experimental => }/codegen/test_inline_threshold_codegen.py (91%)
 rename numba_dpex/tests/{experimental => }/codegen/test_intenum_literal_codegen.py (91%)
 rename numba_dpex/tests/{experimental => }/codegen/test_local_accessor_kernel_arg.py (90%)
 delete mode 100644 numba_dpex/tests/experimental/codegen/__init__.py

diff --git a/numba_dpex/tests/experimental/__init__.py b/numba_dpex/tests/codegen/__init__.py
similarity index 100%
rename from numba_dpex/tests/experimental/__init__.py
rename to numba_dpex/tests/codegen/__init__.py
diff --git a/numba_dpex/tests/experimental/codegen/test_inline_threshold_codegen.py b/numba_dpex/tests/codegen/test_inline_threshold_codegen.py
similarity index 91%
rename from numba_dpex/tests/experimental/codegen/test_inline_threshold_codegen.py
rename to numba_dpex/tests/codegen/test_inline_threshold_codegen.py
index e3953adfb3..c5af4e70c6 100644
--- a/numba_dpex/tests/experimental/codegen/test_inline_threshold_codegen.py
+++ b/numba_dpex/tests/codegen/test_inline_threshold_codegen.py
@@ -5,9 +5,8 @@
 import dpctl
 from numba.core import types
 
-from numba_dpex import DpctlSyclQueue, DpnpNdArray
-from numba_dpex import experimental as dpex_exp
-from numba_dpex import int64
+import numba_dpex as dpex
+from numba_dpex import DpctlSyclQueue, DpnpNdArray, int64
 from numba_dpex.core.types.kernel_api.index_space_ids import ItemType
 from numba_dpex.kernel_api import Item
 
@@ -39,7 +38,7 @@ def test_codegen_with_max_inline_threshold():
     i64arr_ty = DpnpNdArray(ndim=1, dtype=int64, layout="C", queue=queue_ty)
     kernel_sig = types.void(ItemType(1), i64arr_ty, i64arr_ty, i64arr_ty)
 
-    disp = dpex_exp.kernel(inline_threshold=1)(kernel_func)
+    disp = dpex.kernel(inline_threshold=1)(kernel_func)
     disp.compile(kernel_sig)
     kcres = disp.overloads[kernel_sig.args]
     llvm_ir_mod = kcres.library._final_module
@@ -60,7 +59,7 @@ def test_codegen_without_max_inline_threshold():
     i64arr_ty = DpnpNdArray(ndim=1, dtype=int64, layout="C", queue=queue_ty)
     kernel_sig = types.void(ItemType(1), i64arr_ty, i64arr_ty, i64arr_ty)
 
-    disp = dpex_exp.kernel(inline_threshold=0)(kernel_func)
+    disp = dpex.kernel(inline_threshold=0)(kernel_func)
     disp.compile(kernel_sig)
     kcres = disp.overloads[kernel_sig.args]
     llvm_ir_mod = kcres.library._final_module
diff --git a/numba_dpex/tests/experimental/codegen/test_intenum_literal_codegen.py b/numba_dpex/tests/codegen/test_intenum_literal_codegen.py
similarity index 91%
rename from numba_dpex/tests/experimental/codegen/test_intenum_literal_codegen.py
rename to numba_dpex/tests/codegen/test_intenum_literal_codegen.py
index 33ca4063e4..5d875f37af 100644
--- a/numba_dpex/tests/experimental/codegen/test_intenum_literal_codegen.py
+++ b/numba_dpex/tests/codegen/test_intenum_literal_codegen.py
@@ -7,7 +7,7 @@
 import dpctl
 from numba.core import types
 
-import numba_dpex.experimental as exp_dpex
+import numba_dpex as dpex
 from numba_dpex import DpctlSyclQueue, DpnpNdArray, int64
 from numba_dpex.kernel_api.flag_enum import FlagEnum
 
@@ -27,7 +27,7 @@ class PseudoFlags(FlagEnum):
         FLAG1 = 1
         FLAG2 = 2
 
-    @exp_dpex.device_func
+    @dpex.device_func
     def bitwise_or_flags(flag1, flag2):
         return flag1 | flag2
 
@@ -40,7 +40,7 @@ def pass_flags_to_func(a):
     i64arr_ty = DpnpNdArray(ndim=1, dtype=int64, layout="C", queue=queue_ty)
     kernel_sig = types.void(i64arr_ty)
 
-    disp = exp_dpex.kernel(inline_threshold=0)(pass_flags_to_func)
+    disp = dpex.kernel(inline_threshold=0)(pass_flags_to_func)
     disp.compile(kernel_sig)
     kcres = disp.overloads[kernel_sig.args]
     llvm_ir_mod = kcres.library._final_module.__str__()
diff --git a/numba_dpex/tests/experimental/codegen/test_local_accessor_kernel_arg.py b/numba_dpex/tests/codegen/test_local_accessor_kernel_arg.py
similarity index 90%
rename from numba_dpex/tests/experimental/codegen/test_local_accessor_kernel_arg.py
rename to numba_dpex/tests/codegen/test_local_accessor_kernel_arg.py
index 26905bf19b..b1d52b6cc3 100644
--- a/numba_dpex/tests/experimental/codegen/test_local_accessor_kernel_arg.py
+++ b/numba_dpex/tests/codegen/test_local_accessor_kernel_arg.py
@@ -4,11 +4,9 @@
 
 import dpctl
 from llvmlite import ir as llvmir
-from numba.core import types
 
-from numba_dpex import DpctlSyclQueue, DpnpNdArray
-from numba_dpex import experimental as dpex_exp
-from numba_dpex import int64
+import numba_dpex as dpex
+from numba_dpex import DpctlSyclQueue, DpnpNdArray, int64
 from numba_dpex.core.types.kernel_api.index_space_ids import NdItemType
 from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType
 from numba_dpex.kernel_api import (
@@ -37,7 +35,7 @@ def test_codegen_local_accessor_kernel_arg():
     queue_ty = DpctlSyclQueue(dpctl.SyclQueue())
     i64arr_ty = DpnpNdArray(ndim=1, dtype=int64, layout="C", queue=queue_ty)
     slm_ty = LocalAccessorType(ndim=1, dtype=int64)
-    disp = dpex_exp.kernel(inline_threshold=3)(kernel_func)
+    disp = dpex.kernel(inline_threshold=3)(kernel_func)
     dmm = disp.targetctx.data_model_manager
 
     i64arr_ty_flattened_arg_count = dmm.lookup(i64arr_ty).flattened_field_count
diff --git a/numba_dpex/tests/experimental/codegen/__init__.py b/numba_dpex/tests/experimental/codegen/__init__.py
deleted file mode 100644
index 1d329c9611..0000000000
--- a/numba_dpex/tests/experimental/codegen/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-FileCopyrightText: 2023 - 2024 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0

From 957c8b59b2dc6bf53bb3ec338e0f42d34206e6c0 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 21 Mar 2024 18:19:52 -0500
Subject: [PATCH 06/14] Fix warnings tests

---
 numba_dpex/tests/misc/test_warnings.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/numba_dpex/tests/misc/test_warnings.py b/numba_dpex/tests/misc/test_warnings.py
index 247964eaca..b09d2fecc5 100644
--- a/numba_dpex/tests/misc/test_warnings.py
+++ b/numba_dpex/tests/misc/test_warnings.py
@@ -11,9 +11,9 @@
 from numba_dpex.core import config
 
 
-@dpex.kernel(enable_cache=False)
-def foo(a):
-    a[dpex.get_global_id(0)] = 0
+@dpex.kernel
+def foo(item, a):
+    a[item.get_id(0)] = 0
 
 
 def test_inline_threshold_negative_val_warning_():
@@ -26,16 +26,6 @@ def test_inline_threshold_negative_val_warning_():
     config.INLINE_THRESHOLD = bkp
 
 
-def test_inline_threshold_gt_3_warning():
-    bkp = config.INLINE_THRESHOLD
-    config.INLINE_THRESHOLD = 4
-
-    with pytest.warns(UserWarning):
-        dpex.call_kernel(foo, dpex.Range(10), dpnp.arange(10))
-
-    config.INLINE_THRESHOLD = bkp
-
-
 def test_no_warning():
     with warnings.catch_warnings():
         warnings.simplefilter("error")

From b71f540f671089b81353e4c02bfa5f752ac0137a Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 21 Mar 2024 20:57:26 -0500
Subject: [PATCH 07/14] Port test_debuginfo to new API.

---
 numba_dpex/tests/test_debuginfo.py | 105 ++++++++++++++++-------------
 1 file changed, 58 insertions(+), 47 deletions(-)

diff --git a/numba_dpex/tests/test_debuginfo.py b/numba_dpex/tests/test_debuginfo.py
index 140b46a37a..e95e9e6d1f 100644
--- a/numba_dpex/tests/test_debuginfo.py
+++ b/numba_dpex/tests/test_debuginfo.py
@@ -7,15 +7,18 @@
 import re
 
 import pytest
+from numba.core import types
 
 import numba_dpex as dpex
 from numba_dpex import float32, int32, usm_ndarray
 from numba_dpex.core.descriptor import dpex_kernel_target
+from numba_dpex.core.types.kernel_api.index_space_ids import ItemType
 from numba_dpex.tests._helper import override_config
 
 debug_options = [True, False]
 
 f32arrty = usm_ndarray(ndim=1, dtype=float32, layout="C")
+itemty = ItemType(ndim=1)
 
 
 @pytest.fixture(params=debug_options)
@@ -23,20 +26,6 @@ def debug_option(request):
     return request.param
 
 
-def get_kernel_ir(fn, sig, debug=False):
-    kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel(
-        fn, fn.__name__
-    )
-    kernel.compile(
-        args=sig,
-        target_ctx=dpex_kernel_target.target_context,
-        typing_ctx=dpex_kernel_target.typing_context,
-        debug=debug,
-        compile_flags=None,
-    )
-    return kernel.llvm_module
-
-
 def make_check(ir, val_to_search):
     """
     Check the compiled assembly for debuginfo.
@@ -52,11 +41,15 @@ def test_debug_flag_generates_ir_with_debuginfo(debug_option):
     Check debug info is emitting to IR if debug parameter is set to True
     """
 
-    def foo(x):
-        x = 1  # noqa
+    def foo(item, x):
+        i = item.get_id(0)
+        x[i] = 1  # noqa
+
+    sig = (itemty, f32arrty)
+    disp = dpex.kernel(sig, debug=debug_option)(foo)
+    kcres = disp.get_compile_result(types.void(itemty, f32arrty))
+    kernel_ir = kcres.library.get_llvm_str()
 
-    sig = (int32,)
-    kernel_ir = get_kernel_ir(foo, sig, debug=debug_option)
     tag = "!dbg"
 
     if debug_option:
@@ -71,8 +64,8 @@ def test_debug_info_locals_vars_on_no_opt():
     if debug parameter is set to True and optimization is O0
     """
 
-    def foo(var_a, var_b, var_c):
-        i = dpex.get_global_id(0)
+    def foo(item, var_a, var_b, var_c):
+        i = item.get_id(0)
         var_c[i] = var_a[i] + var_b[i]
 
     ir_tags = [
@@ -81,10 +74,14 @@ def foo(var_a, var_b, var_c):
         '!DILocalVariable(name: "var_c"',
         '!DILocalVariable(name: "i"',
     ]
-    sig = (f32arrty, f32arrty, f32arrty)
+    sig = (itemty, f32arrty, f32arrty, f32arrty)
 
     with override_config("OPT", 0):
-        kernel_ir = get_kernel_ir(foo, sig, debug=True)
+        disp = dpex.kernel(sig, debug=True)(foo)
+        kcres = disp.get_compile_result(
+            types.void(itemty, f32arrty, f32arrty, f32arrty)
+        )
+        kernel_ir = kcres.library.get_llvm_str()
 
     for tag in ir_tags:
         assert tag in kernel_ir
@@ -96,8 +93,8 @@ def test_debug_kernel_local_vars_in_ir():
     created in kernel
     """
 
-    def foo(arr):
-        index = dpex.get_global_id(0)
+    def foo(item, arr):
+        index = item.get_id(0)
         local_d = 9 * 99 + 5
         arr[index] = local_d + 100
 
@@ -105,8 +102,10 @@ def foo(arr):
         '!DILocalVariable(name: "index"',
         '!DILocalVariable(name: "local_d"',
     ]
-    sig = (f32arrty,)
-    kernel_ir = get_kernel_ir(foo, sig, debug=True)
+    sig = (itemty, f32arrty)
+    disp = dpex.kernel(sig, debug=True)(foo)
+    kcres = disp.get_compile_result(types.void(itemty, f32arrty))
+    kernel_ir = kcres.library.get_llvm_str()
 
     for tag in ir_tags:
         assert tag in kernel_ir
@@ -117,13 +116,13 @@ def test_debug_flag_generates_ir_with_debuginfo_for_func(debug_option):
     Check debug info is emitting to IR if debug parameter is set to True
     """
 
-    @dpex.func(debug=debug_option)
+    @dpex.device_func(debug=debug_option)
     def func_sum(a, b):
         result = a + b
         return result
 
-    def data_parallel_sum(a, b, c):
-        i = dpex.get_global_id(0)
+    def data_parallel_sum(item, a, b, c):
+        i = item.get_id(0)
         c[i] = func_sum(a[i], b[i])
 
     ir_tags = [
@@ -131,9 +130,12 @@ def data_parallel_sum(a, b, c):
         r'\!DISubprogram\(name: ".*data_parallel_sum\$?\d*"',
     ]
 
-    sig = (f32arrty, f32arrty, f32arrty)
-
-    kernel_ir = get_kernel_ir(data_parallel_sum, sig, debug=debug_option)
+    sig = (itemty, f32arrty, f32arrty, f32arrty)
+    disp = dpex.kernel(sig, debug=debug_option)(data_parallel_sum)
+    kcres = disp.get_compile_result(
+        types.void(itemty, f32arrty, f32arrty, f32arrty)
+    )
+    kernel_ir = kcres.library.get_llvm_str()
 
     for tag in ir_tags:
         assert debug_option == make_check(kernel_ir, tag)
@@ -144,13 +146,13 @@ def test_env_var_generates_ir_with_debuginfo_for_func(debug_option):
     Check debug info is emitting to IR if NUMBA_DPEX_DEBUGINFO is set to 1
     """
 
-    @dpex.func
+    @dpex.device_func(debug=debug_option)
     def func_sum(a, b):
         result = a + b
         return result
 
-    def data_parallel_sum(a, b, c):
-        i = dpex.get_global_id(0)
+    def data_parallel_sum(item, a, b, c):
+        i = item.get_id(0)
         c[i] = func_sum(a[i], b[i])
 
     ir_tags = [
@@ -158,43 +160,52 @@ def data_parallel_sum(a, b, c):
         r'\!DISubprogram\(name: ".*data_parallel_sum\$\d*"',
     ]
 
-    sig = (f32arrty, f32arrty, f32arrty)
+    sig = (itemty, f32arrty, f32arrty, f32arrty)
 
     with override_config("DEBUGINFO_DEFAULT", int(debug_option)):
-        kernel_ir = get_kernel_ir(data_parallel_sum, sig)
+        disp = dpex.kernel(sig, debug=debug_option, inline_threshold=0)(
+            data_parallel_sum
+        )
+        kcres = disp.get_compile_result(
+            types.void(itemty, f32arrty, f32arrty, f32arrty)
+        )
+        kernel_ir = kcres.library.get_llvm_str()
 
     for tag in ir_tags:
         assert debug_option == make_check(kernel_ir, tag)
 
 
 def test_debuginfo_DISubprogram_linkageName():
-    def func(a, b):
-        i = dpex.get_global_id(0)
+    def foo(item, a, b):
+        i = item.get_id(0)
         b[i] = a[i]
 
     ir_tags = [
-        r'\!DISubprogram\(.*linkageName: ".*func.*"',
+        r'\!DISubprogram\(.*linkageName: ".*foo.*"',
     ]
 
-    sig = (f32arrty, f32arrty)
-    kernel_ir = get_kernel_ir(func, sig, debug=True)
+    sig = (itemty, f32arrty, f32arrty)
+    disp = dpex.kernel(sig, debug=debug_option)(foo)
+    kcres = disp.get_compile_result(types.void(itemty, f32arrty, f32arrty))
+    kernel_ir = kcres.library.get_llvm_str()
 
     for tag in ir_tags:
         assert make_check(kernel_ir, tag)
 
 
 def test_debuginfo_DICompileUnit_language_and_producer():
-    def func(a, b):
-        i = dpex.get_global_id(0)
+    def foo(item, a, b):
+        i = item.get_id(0)
         b[i] = a[i]
 
     ir_tags = [
         r"\!DICompileUnit\(language: DW_LANG_C_plus_plus,",
     ]
 
-    sig = (f32arrty, f32arrty)
-
-    kernel_ir = get_kernel_ir(func, sig, debug=True)
+    sig = (itemty, f32arrty, f32arrty)
+    disp = dpex.kernel(sig, debug=debug_option)(foo)
+    kcres = disp.get_compile_result(types.void(itemty, f32arrty, f32arrty))
+    kernel_ir = kcres.library.get_llvm_str()
 
     for tag in ir_tags:
         assert make_check(kernel_ir, tag)

From c82e60b1ae2394e2ab92060ebc2c6ab927f445de Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 22 Mar 2024 01:21:37 -0500
Subject: [PATCH 08/14] Update all kernel examples to new API.

---
 numba_dpex/examples/kernel/atomic_op.py       |  45 +++--
 numba_dpex/examples/kernel/black_scholes.py   |  40 +++-
 numba_dpex/examples/kernel/device_func.py     | 172 +++++-------------
 numba_dpex/examples/kernel/interpolation.py   |  36 ++--
 .../examples/kernel/kernel_private_memory.py  |  34 ++--
 .../examples/kernel/kernel_specialization.py  |  82 ++++-----
 numba_dpex/examples/kernel/matmul.py          | 154 +++++++++-------
 .../examples/kernel/pairwise_distance.py      |  16 +-
 numba_dpex/examples/kernel/pipelining.py      |   7 +-
 numba_dpex/examples/kernel/scan.py            |  52 +++---
 ...sive_ocl.py => sum_reduction_recursive.py} |  33 ++--
 ...reduction_ocl.py => sum_reduction_tree.py} |  35 ++--
 numba_dpex/examples/kernel/vector_sum.py      |  13 +-
 numba_dpex/examples/kernel/vector_sum2D.py    |  13 +-
 14 files changed, 376 insertions(+), 356 deletions(-)
 rename numba_dpex/examples/kernel/{sum_reduction_recursive_ocl.py => sum_reduction_recursive.py} (78%)
 rename numba_dpex/examples/kernel/{sum_reduction_ocl.py => sum_reduction_tree.py} (61%)

diff --git a/numba_dpex/examples/kernel/atomic_op.py b/numba_dpex/examples/kernel/atomic_op.py
index f0e22aa6aa..59ffe18a7e 100644
--- a/numba_dpex/examples/kernel/atomic_op.py
+++ b/numba_dpex/examples/kernel/atomic_op.py
@@ -2,36 +2,45 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import dpnp as np
+"""The example demonstrates the use of :class:`numba_dpex.kernel_api.AtomicRef`.
 
-import numba_dpex as ndpx
+The kernel shows the implementation of a reduction operation in numba-dpex
+where every work-item is updating a global accumulator atomically.
+"""
+import dpnp
 
+import numba_dpex as dpex
+from numba_dpex import kernel_api as kapi
 
-@ndpx.kernel
-def atomic_reduction(a, res):
-    """Summarize all the items in a and writes it into res using atomic.add.
 
-    :param a: array of values to get sum
-    :param res: result where to add all the items from a array. It must be preset to 0.
+@dpex.kernel
+def atomic_reduction(item: kapi.Item, a, res):
+    """Array reduction using :func:`AtomicRef.fetch_add`.
+
+    Args:
+        item (kapi.Item): Index space id for each work item.
+        a (dpnp.ndarray): An 1-d array to be reduced.
+        res (dpnp.ndarray): A single element array into which the result is
+            accumulated.
     """
-    idx = ndpx.get_global_id(0)
-    ndpx.atomic.add(res, 0, a[idx])
+    idx = item.get_id(0)
+    acc = kapi.AtomicRef(res, 0)
+    acc.fetch_add(a[idx])
 
 
 def main():
-    N = 10
+    N = 1024
 
-    # We are storing sum to the first element
-    a = np.arange(0, N)
-    res = np.zeros(1, dtype=a.dtype)
+    a = dpnp.arange(0, N)
+    res = dpnp.zeros(1, dtype=a.dtype)
 
-    print("Using device ...")
-    print(a.device)
+    print("Executing on device:")
+    a.device.print_device_info()
 
-    atomic_reduction[ndpx.Range(N)](a, res)
-    print("Reduction sum =", res[0])
+    dpex.call_kernel(atomic_reduction, dpex.Range(N), a, res)
+    print(f"Summation of {N} integers = {res[0]}")
 
-    print("Done...")
+    assert res[0] == N * (N - 1) / 2
 
 
 if __name__ == "__main__":
diff --git a/numba_dpex/examples/kernel/black_scholes.py b/numba_dpex/examples/kernel/black_scholes.py
index b139371d87..37bcda9d06 100644
--- a/numba_dpex/examples/kernel/black_scholes.py
+++ b/numba_dpex/examples/kernel/black_scholes.py
@@ -1,13 +1,22 @@
 # SPDX-FileCopyrightText: 2022 - 2024 Intel Corporation
 #
 # SPDX-License-Identifier: Apache 2.0
-# SPDX-License-Identifier: Apache-2.0
+
+"""The example shows the implementation of the Black-Scholes formula as a range kernel.
+
+The Black-Scholes model is a mathematical model for derivatives trading with
+various underlying assumptions. The example shown here is a simplified
+representation of the actual model.
+
+Refer: https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model
+"""
 
 from math import erf, exp, log, sqrt
 
 import dpnp as np
 
-import numba_dpex as ndpx
+import numba_dpex as dpex
+from numba_dpex import kernel_api as kapi
 
 # Stock price range
 S0L = 10.0
@@ -47,14 +56,17 @@ def initialize():
     return price, strike, t, rate, volatility, call, put
 
 
-@ndpx.kernel
-def kernel_black_scholes(price, strike, t, rate, volatility, call, put):
+@dpex.kernel
+def kernel_black_scholes(
+    item: kapi.Item, price, strike, t, rate, volatility, call, put
+):
+    """A range kernel implementing a simplified Black-Scholes model."""
     # Scalars
     mr = -rate
     sig_sig_two = volatility * volatility * 2.0
 
     # Current index
-    i = ndpx.get_global_id(0)
+    i = item.get_id(0)
 
     # Get inputs into private memory
     p = price[i]
@@ -86,11 +98,19 @@ def kernel_black_scholes(price, strike, t, rate, volatility, call, put):
 def main():
     price, strike, t, rate, volatility, call, put = initialize()
 
-    print("Using device ...")
-    print(price.device)
-
-    kernel_black_scholes[ndpx.Range(NOPT)](
-        price, strike, t, rate, volatility, call, put
+    print("Executing on device:")
+    price.device.print_device_info()
+
+    dpex.call_kernel(
+        kernel_black_scholes,
+        dpex.Range(NOPT),
+        price,
+        strike,
+        t,
+        rate,
+        volatility,
+        call,
+        put,
     )
 
     print("Call:", call)
diff --git a/numba_dpex/examples/kernel/device_func.py b/numba_dpex/examples/kernel/device_func.py
index c178999157..576a1ce9a2 100644
--- a/numba_dpex/examples/kernel/device_func.py
+++ b/numba_dpex/examples/kernel/device_func.py
@@ -2,148 +2,64 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import dpnp as np
+"""Demonstrates the usage of the :func:`numba_dpex.device_func` decorator.
 
-import numba_dpex as ndpx
-from numba_dpex import float32, int32, int64
+Refer the API documentation and the Kenrel programming guide for further
+details.
+"""
 
-# Array size
-N = 10
+import dpnp
 
-
-# A device callable function that can be invoked from
-# ``kernel`` and other device functions
-@ndpx.func
-def a_device_function(a):
-    return a + 1
+import numba_dpex as dpex
+from numba_dpex import kernel_api as kapi
 
 
-# A device callable function with signature that can be invoked
-# from ``kernel`` and other device functions
-@ndpx.func(int32(int32))
-def a_device_function_int32(a):
+@dpex.device_func
+def increment_by_1(a):
+    """A device callable function that can be invoked from a kernel or
+    another device function.
+    """
     return a + 1
 
 
-# A device callable function with list signature that can be invoked
-# from ``kernel`` and other device functions
-@ndpx.func([int32(int32), float32(float32)])
-def a_device_function_int32_float32(a):
-    return a + 1
-
-
-# A device callable function can call another device function
-@ndpx.func
-def another_device_function(a):
-    return a_device_function(a * 2)
-
-
-# A kernel function that calls the device function
-@ndpx.kernel
-def a_kernel_function(a, b):
-    i = ndpx.get_global_id(0)
-    b[i] = another_device_function(a[i])
-
-
-# A kernel function that calls the device function
-@ndpx.kernel
-def a_kernel_function_int32(a, b):
-    i = ndpx.get_global_id(0)
-    b[i] = a_device_function_int32(a[i])
-
-
-# A kernel function that calls the device function
-@ndpx.kernel
-def a_kernel_function_int32_float32(a, b):
-    i = ndpx.get_global_id(0)
-    b[i] = a_device_function_int32_float32(a[i])
-
-
-# test function 1: tests basic
-def test1():
-    a = np.ones(N)
-    b = np.ones(N)
-
-    print("Using device ...")
-    print(a.device)
-
-    print("A=", a)
-    try:
-        a_kernel_function[ndpx.Range(N)](a, b)
-    except Exception as err:
-        print(err)
-    print("B=", b)
-
-    print("Done...")
-
-
-# test function 2: test device func with signature
-def test2():
-    a = np.ones(N, dtype=np.int32)
-    b = np.ones(N, dtype=np.int32)
-
-    print("Using device ...")
-    print(a.device)
-
-    print("A=", a)
-    try:
-        a_kernel_function_int32[ndpx.Range(N)](a, b)
-    except Exception as err:
-        print(err)
-    print("B=", b)
-
-    print("Done...")
-
-
-# test function 3: test device function with list signature
-def test3():
-    a = np.ones(N, dtype=np.int32)
-    b = np.ones(N, dtype=np.int32)
-
-    print("Using device ...")
-    print(a.device)
-
-    print("A=", a)
-    try:
-        a_kernel_function_int32_float32[ndpx.Range(N)](a, b)
-    except Exception as err:
-        print(err)
-    print("B=", b)
-
-    # with a different dtype
-    a = np.ones(N, dtype=np.float32)
-    b = np.ones(N, dtype=np.float32)
+@dpex.device_func
+def increment_and_sum_up(nd_item: kapi.NdItem, a):
+    """Demonstrates the usage of group_barrier and NdItem usage in a
+    device_func.
+    """
+    i = nd_item.get_global_id(0)
 
-    print("Using device ...")
-    print(a.device)
+    a[i] += 1
+    kapi.group_barrier(nd_item.get_group(), kapi.MemoryScope.DEVICE)
 
-    print("A=", a)
-    try:
-        a_kernel_function_int32_float32[ndpx.Range(N)](a, b)
-    except Exception as err:
-        print(err)
-    print("B=", b)
+    if i == 0:
+        for idx in range(1, a.size):
+            a[0] += a[idx]
 
-    # this will fail, since int64 is not in
-    # the signature list: [int32(int32), float32(float32)]
-    a = np.ones(N, dtype=np.int64)
-    b = np.ones(N, dtype=np.int64)
 
-    print("Using device ...")
-    print(a.device)
+@dpex.kernel
+def kernel1(item: kapi.Item, a, b):
+    """Demonstrates calling a device function from a kernel."""
+    i = item.get_id(0)
+    b[i] = increment_by_1(a[i])
 
-    print("A=", a)
-    try:
-        a_kernel_function_int32_float32[ndpx.Range(N)](a, b)
-    except Exception as err:
-        print(err)
-    print("B=", b)
 
-    print("Done...")
+@dpex.kernel
+def kernel2(nd_item: kapi.NdItem, a):
+    """The kernel delegates everything to a device_func and calls it."""
+    increment_and_sum_up(nd_item, a)
 
 
-# main function
 if __name__ == "__main__":
-    test1()
-    test2()
-    test3()
+    # Array size
+    N = 100
+    a = dpnp.ones(N, dtype=dpnp.int32)
+    b = dpnp.zeros(N, dtype=dpnp.int32)
+
+    dpex.call_kernel(kernel1, dpex.Range(N), a, b)
+    # b should be [2, 2, ...., 2]
+    print(b)
+
+    dpex.call_kernel(kernel2, dpex.NdRange((N,), (N,)), b)
+    # b[0] should be 300
+    print(b[0])
diff --git a/numba_dpex/examples/kernel/interpolation.py b/numba_dpex/examples/kernel/interpolation.py
index cd023c69b6..ab3f5f261e 100644
--- a/numba_dpex/examples/kernel/interpolation.py
+++ b/numba_dpex/examples/kernel/interpolation.py
@@ -2,11 +2,17 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+"""Demonstrates natural cubic spline implemented as an nd-range kernel.
+
+Refer: https://en.wikipedia.org/wiki/Spline_interpolation
+"""
+
 import dpnp as np
 from numba import float32
 from numpy.testing import assert_almost_equal
 
-import numba_dpex as ndpx
+import numba_dpex as dpex
+from numba_dpex import kernel_api as kapi
 
 # Interpolation domain
 XLO = 10.0
@@ -81,15 +87,15 @@
 )
 
 
-@ndpx.kernel()
-def kernel_polynomial(x, y, coefficients):
-    c = ndpx.private.array(
+@dpex.kernel()
+def kernel_polynomial(nditem: kapi.NdItem, x, y, coefficients):
+    c = kapi.PrivateArray(
         4, dtype=float32
     )  # Coefficients of a polynomial of a given segment
-    z = ndpx.private.array(1, dtype=float32)  # Keep x[i] in private memory
+    z = kapi.PrivateArray(1, dtype=float32)  # Keep x[i] in private memory
 
-    gid = ndpx.get_global_id(0)
-    gr_id = ndpx.get_group_id(0)
+    gid = nditem.get_global_id(0)
+    gr_id = nditem.get_group().get_group_id(0)
 
     # Polynomial coefficients are fixed within a workgroup
     c[0] = coefficients[gr_id][0]
@@ -112,16 +118,20 @@ def main():
     xp = np.arange(XLO, XHI, (XHI - XLO) / N_POINTS)
     yp = np.empty(xp.shape)
 
-    print("Using device ...")
-    print(xp.device)
-    global_range = ndpx.Range(
+    print("Executing on device:")
+    xp.device.print_device_info()
+    global_range = kapi.Range(
         N_POINTS // N_POINTS_PER_WORK_ITEM,
     )
-    local_range = ndpx.Range(
+    local_range = kapi.Range(
         LOCAL_SIZE,
     )
-    kernel_polynomial[ndpx.NdRange(global_range, local_range)](
-        xp, yp, COEFFICIENTS
+    dpex.call_kernel(
+        kernel_polynomial,
+        dpex.NdRange(global_range, local_range),
+        xp,
+        yp,
+        COEFFICIENTS,
     )
 
     # Copy results back to the host
diff --git a/numba_dpex/examples/kernel/kernel_private_memory.py b/numba_dpex/examples/kernel/kernel_private_memory.py
index 249274c002..c257cb3c7e 100644
--- a/numba_dpex/examples/kernel/kernel_private_memory.py
+++ b/numba_dpex/examples/kernel/kernel_private_memory.py
@@ -1,47 +1,55 @@
 # SPDX-FileCopyrightText: 2020 - 2024 Intel Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
+""" This example demonstrates the usage of :class:`numba_dpex.kernel_api.PrivateArray`.
+
+A ``PrivateArray`` is an array allocated on a device's private address space. As
+the amount of available private memory is usually limited, programmers should be
+careful with the size of a ``PrivateArray``. Allocating an array that is too big
+will cause the data to be spilled to global memory, causing adverse performance.
+"""
 
 import dpctl
 import dpctl.tensor as dpt
 import numpy as np
-from numba import float32
 
 import numba_dpex as ndpx
+from numba_dpex import float32
+from numba_dpex import kernel_api as kapi
 
 
 def private_memory():
-    """
-    This example demonstrates the usage of numba_dpex's `private.array`
-    intrinsic function. The function is used to create a static array
-    allocated on the devices private address space.
-    """
+    """Demonstrates usage of :class:`numba_dpex.kernel_api.PrivateArray`."""
 
     @ndpx.kernel
-    def private_memory_kernel(A):
-        memory = ndpx.private.array(shape=1, dtype=np.float32)
-        i = ndpx.get_global_id(0)
+    def private_memory_kernel(nditem: kapi.NdItem, A):
+        memory = kapi.PrivateArray(shape=1, dtype=np.float32)
+        i = nditem.get_global_id(0)
 
         # preload
         memory[0] = i
-        ndpx.barrier(ndpx.LOCAL_MEM_FENCE)  # local mem fence
+        gr = nditem.get_group()
+        # local mem fence
+        kapi.group_barrier(gr)
 
         # memory will not hold correct deterministic result if it is not
         # private to each thread.
         A[i] = memory[0] * 2
 
-    N = 4
+    N = 100
     device = dpctl.select_default_device()
 
     arr = dpt.zeros(N, dtype=dpt.float32, device=device)
     orig = np.arange(N).astype(np.float32)
 
-    print("Using device ...")
+    print("Executing on device:")
     device.print_device_info()
 
     global_range = ndpx.Range(N)
     local_range = ndpx.Range(N)
-    private_memory_kernel[ndpx.NdRange(global_range, local_range)](arr)
+    ndpx.call_kernel(
+        private_memory_kernel, ndpx.NdRange(global_range, local_range), arr
+    )
 
     arr_out = dpt.asnumpy(arr)
     np.testing.assert_allclose(orig * 2, arr_out)
diff --git a/numba_dpex/examples/kernel/kernel_specialization.py b/numba_dpex/examples/kernel/kernel_specialization.py
index 7367ec380f..1fd9ce70fa 100644
--- a/numba_dpex/examples/kernel/kernel_specialization.py
+++ b/numba_dpex/examples/kernel/kernel_specialization.py
@@ -2,6 +2,14 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+"""Demonstrates signature specialization feature to pre-compile a kernel.
+
+As opposed to JIT compilation at first call, a ``kernel`` or ``device_func``
+decorated function with signature specialization gets compiled on module
+load and is cached in memory. The following examples demonstrate the feature for
+the numba_dpex.kernel decorator and presents usage scenarios and current
+limitations.
+"""
 import dpctl.tensor as dpt
 import numpy as np
 
@@ -11,34 +19,39 @@
     InvalidKernelSpecializationError,
     MissingSpecializationError,
 )
-
-# Similar to Numba, numba-ndpx supports eager compilation of functions. The
-# following examples demonstrate the feature for numba_ndpx.kernel and presents
-# usage scenarios and current limitations.
+from numba_dpex.core.types.kernel_api.index_space_ids import ItemType
 
 # ------------                 Example 1.                   ------------ #
 
-# Define type specializations using the numba_ndpx usm_ndarray data type.
+# Define type specializations using the numba_dpex usm_ndarray data type.
 i64arrty = usm_ndarray(1, "C", int64)
 f32arrty = usm_ndarray(1, "C", float32)
+# Type specialization for the index space id type
+itemty = ItemType(ndim=1)
 
 
 # specialize a kernel for the i64arrty
-@ndpx.kernel((i64arrty, i64arrty, i64arrty))
-def data_parallel_sum(a, b, c):
+specialized_kernel = ndpx.kernel((itemty, i64arrty, i64arrty, i64arrty))
+
+
+def data_parallel_sum(item, a, b, c):
     """
     Vector addition using the ``kernel`` decorator.
     """
-    i = ndpx.get_global_id(0)
+    i = item.get_id(0)
     c[i] = a[i] + b[i]
 
 
+# pre-compiled kernel
+pre_compiled_kernel = specialized_kernel(data_parallel_sum)
+
 # run the specialized kernel
 a = dpt.ones(1024, dtype=dpt.int64)
 b = dpt.ones(1024, dtype=dpt.int64)
 c = dpt.zeros(1024, dtype=dpt.int64)
 
-data_parallel_sum[ndpx.Range(1024)](a, b, c)
+# Call the pre-compiled kernel
+ndpx.call_kernel(pre_compiled_kernel, ndpx.Range(1024), a, b, c)
 
 npc = dpt.asnumpy(c)
 npc_expected = np.full(1024, 2, dtype=np.int64)
@@ -52,21 +65,32 @@ def data_parallel_sum(a, b, c):
 
 
 # specialize a kernel for the i64arrty
-@ndpx.kernel([(i64arrty, i64arrty, i64arrty), (f32arrty, f32arrty, f32arrty)])
-def data_parallel_sum2(a, b, c):
+specialized_kernels_list = ndpx.kernel(
+    [
+        (itemty, i64arrty, i64arrty, i64arrty),
+        (itemty, f32arrty, f32arrty, f32arrty),
+    ]
+)
+
+
+def data_parallel_sum2(item, a, b, c):
     """
     Vector addition using the ``kernel`` decorator.
     """
-    i = ndpx.get_global_id(0)
+    i = item.get_id(0)
     c[i] = a[i] + b[i]
 
 
+# Pre-compile both variants of the kernel
+pre_compiled_kernels = specialized_kernels_list(data_parallel_sum2)
+
 # run the i64 specialized kernel
 a = dpt.ones(1024, dtype=dpt.int64)
 b = dpt.ones(1024, dtype=dpt.int64)
 c = dpt.zeros(1024, dtype=dpt.int64)
 
-data_parallel_sum2[ndpx.Range(1024)](a, b, c)
+# Compiler will type match the right variant and call it.
+ndpx.call_kernel(pre_compiled_kernels, ndpx.Range(1024), a, b, c)
 
 npc = dpt.asnumpy(c)
 npc_expected = np.full(1024, 2, dtype=np.int64)
@@ -77,43 +101,13 @@ def data_parallel_sum2(a, b, c):
 b = dpt.ones(1024, dtype=dpt.float32)
 c = dpt.zeros(1024, dtype=dpt.float32)
 
-data_parallel_sum2[ndpx.Range(1024)](a, b, c)
+ndpx.call_kernel(pre_compiled_kernels, ndpx.Range(1024), a, b, c)
 
 npc = dpt.asnumpy(c)
 npc_expected = np.full(1024, 2, dtype=np.float32)
 assert np.array_equal(npc, npc_expected)
 
 
-# ------------                 Example 3.                   ------------ #
-
-# A specialized kernel cannot be jit compiled. Calling a specialized kernel
-# with arguments having type different from the specialization will result in
-# an MissingSpecializationError.
-
-a = dpt.ones(1024, dtype=dpt.int32)
-b = dpt.ones(1024, dtype=dpt.int32)
-c = dpt.zeros(1024, dtype=dpt.int32)
-
-try:
-    data_parallel_sum[ndpx.Range(1024)](a, b, c)
-except MissingSpecializationError as mse:
-    print(mse)
-
-
-# ------------                 Example 4.                   ------------ #
-
-# Numba_ndpx does not support NumPy arrays as kernel arguments and all
-# array arguments should be inferable as a numba_ndpx.types.usm_ndarray. Trying
-# to eager compile with a NumPy array-based signature will lead to an
-# InvalidKernelSpecializationError
-
-try:
-    ndpx.kernel((int64[::1], int64[::1], int64[::1]))
-except InvalidKernelSpecializationError as e:
-    print("Dpex kernels cannot be specialized using NumPy arrays.")
-    print(e)
-
-
 # ------------                 Limitations                       ------------ #
 
 
diff --git a/numba_dpex/examples/kernel/matmul.py b/numba_dpex/examples/kernel/matmul.py
index 804dd3b16e..335d319bda 100644
--- a/numba_dpex/examples/kernel/matmul.py
+++ b/numba_dpex/examples/kernel/matmul.py
@@ -4,69 +4,97 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import dpctl
+"""The example demonstrates a sliding window matrix-matrix multiplication kernel.
+"""
+
 import dpctl.tensor as dpt
 import numpy as np
 
-import numba_dpex as ndpx
-
-
-@ndpx.kernel
-def gemm(a, b, c):
-    """
-    A basic DGEMM implemented as a ``kernel`` function.
-    """
-    i = ndpx.get_global_id(0)
-    j = ndpx.get_global_id(1)
-    if i >= c.shape[0] or j >= c.shape[1]:
-        return
-    c[i, j] = 0
-    for k in range(c.shape[0]):
-        c[i, j] += a[i, k] * b[k, j]
-
-
-# Array dimensions
-X = 1024
-Y = 16
-global_size = X, X
-
-griddim = ndpx.Range(X, X)
-blockdim = ndpx.Range(Y, Y)
-
-
-def driver(a, b, c):
-    # Invoke the kernel
-    gemm[ndpx.NdRange(griddim, blockdim)](a, b, c)
-
-
-def main():
-    a = np.arange(X * X, dtype=np.float32).reshape(X, X)
-    b = np.array(np.random.random(X * X), dtype=np.float32).reshape(X, X)
-
-    device = dpctl.select_default_device()
-    a_dpt = dpt.arange(X * X, dtype=dpt.float32, device=device)
-    a_dpt = dpt.reshape(a_dpt, (X, X))
-    b_dpt = dpt.asarray(b, dtype=dpt.float32, device=device)
-    b_dpt = dpt.reshape(b_dpt, (X, X))
-    c_dpt = dpt.ones_like(a_dpt)
-    c_dpt = dpt.reshape(c_dpt, (X, X))
-
-    print("Using device ...")
-    device.print_device_info()
-
-    driver(a_dpt, b_dpt, c_dpt)
-    c_out = dpt.asnumpy(c_dpt)
-
-    # Host compute using standard NumPy
-    Amat = np.matrix(a)
-    Bmat = np.matrix(b)
-    Cans = Amat * Bmat
-
-    # Check result
-    assert np.allclose(c_out, Cans)
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()
+import numba_dpex as dpex
+from numba_dpex import kernel_api as kapi
+
+square_block_side = 2
+work_group_size = (square_block_side, square_block_side)
+dtype = np.float32
+
+
+@dpex.kernel
+def matmul(
+    nditem: kapi.NdItem,
+    X,  # IN READ-ONLY    (X_n_rows, n_cols)
+    y,  # IN READ-ONLY    (n_cols, y_n_rows),
+    X_slm,  # SLM to store a sliding window over X
+    Y_slm,  # SLM to store a sliding window over Y
+    result,  # OUT        (X_n_rows, y_n_rows)
+):
+    X_n_rows = X.shape[0]
+    Y_n_cols = y.shape[1]
+    n_cols = X.shape[1]
+
+    result_row_idx = nditem.get_global_id(0)
+    result_col_idx = nditem.get_global_id(1)
+
+    local_row_idx = nditem.get_local_id(0)
+    local_col_idx = nditem.get_local_id(1)
+
+    n_blocks_for_cols = n_cols // square_block_side
+    if (n_cols % square_block_side) > 0:
+        n_blocks_for_cols += 1
+
+    output = dtype(0)
+
+    gr = nditem.get_group()
+
+    for block_idx in range(n_blocks_for_cols):
+        X_slm[local_row_idx, local_col_idx] = dtype(0)
+        Y_slm[local_row_idx, local_col_idx] = dtype(0)
+        if (result_row_idx < X_n_rows) and (
+            (local_col_idx + (square_block_side * block_idx)) < n_cols
+        ):
+            X_slm[local_row_idx, local_col_idx] = X[
+                result_row_idx, local_col_idx + (square_block_side * block_idx)
+            ]
+
+        if (result_col_idx < Y_n_cols) and (
+            (local_row_idx + (square_block_side * block_idx)) < n_cols
+        ):
+            Y_slm[local_row_idx, local_col_idx] = y[
+                local_row_idx + (square_block_side * block_idx), result_col_idx
+            ]
+
+        kapi.group_barrier(gr)
+
+        for idx in range(square_block_side):
+            output += X_slm[local_row_idx, idx] * Y_slm[idx, local_col_idx]
+
+        kapi.group_barrier(gr)
+
+    if (result_row_idx < X_n_rows) and (result_col_idx < Y_n_cols):
+        result[result_row_idx, result_col_idx] = output
+
+
+def _arange_reshaped(shape, dtype):
+    n_items = shape[0] * shape[1]
+    return np.arange(n_items, dtype=dtype).reshape(shape)
+
+
+X = _arange_reshaped((5, 5), dtype)
+Y = _arange_reshaped((5, 5), dtype)
+X = dpt.asarray(X)
+Y = dpt.asarray(Y)
+device = X.device.sycl_device
+result = dpt.zeros((5, 5), dtype, device=device)
+X_slm = kapi.LocalAccessor(shape=work_group_size, dtype=dtype)
+Y_slm = kapi.LocalAccessor(shape=work_group_size, dtype=dtype)
+
+dpex.call_kernel(
+    matmul, kapi.NdRange((6, 6), (2, 2)), X, Y, X_slm, Y_slm, result
+)
+
+# Expected:
+# [[ 150.  160.  170.  180.  190.]
+#  [ 400.  435.  470.  505.  540.]
+#  [ 650.  710.  770.  830.  890.]
+#  [ 900.  985. 1070. 1155. 1240.]
+#  [1150. 1260. 1370. 1480. 1590.]]
+print(result)
diff --git a/numba_dpex/examples/kernel/pairwise_distance.py b/numba_dpex/examples/kernel/pairwise_distance.py
index 0bfb6205d7..0737ee52c7 100644
--- a/numba_dpex/examples/kernel/pairwise_distance.py
+++ b/numba_dpex/examples/kernel/pairwise_distance.py
@@ -2,13 +2,14 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+"""The example demonstrates a N^2 pairwise distance matrix computation for an
+array of N elements using a numba_dpex range kernel.
+"""
 import argparse
 from math import sqrt
 from string import Template
 from time import time
 
-import dpctl
-import dpctl.memory as dpctl_mem
 import dpnp as np
 
 import numba_dpex as ndpx
@@ -26,7 +27,6 @@
 
 # Global work size is equal to the number of points
 global_size = ndpx.Range(args.n)
-# Local Work size is optional
 local_size = ndpx.Range(args.l)
 
 X = np.random.random((args.n, args.d)).astype(np.single)
@@ -34,12 +34,12 @@
 
 
 @ndpx.kernel
-def pairwise_distance(X, D, xshape0, xshape1):
+def pairwise_distance(nditem, X, D):
     """
     An Euclidean pairwise distance computation implemented as
     a ``kernel`` function.
     """
-    idx = ndpx.get_global_id(0)
+    idx = nditem.get_global_id(0)
 
     d0 = X[idx, 0] - X[idx, 0]
     # for i in range(xshape0):
@@ -54,10 +54,10 @@ def pairwise_distance(X, D, xshape0, xshape1):
 def driver():
     # measure running time
     times = list()
-    for repeat in range(args.r):
+    for _ in range(args.r):
         start = time()
-        pairwise_distance[ndpx.NdRange(global_size, local_size)](
-            X, D, X.shape[0], X.shape[1]
+        ndpx.call_kernel(
+            pairwise_distance, ndpx.NdRange(global_size, local_size), X, D
         )
         end = time()
 
diff --git a/numba_dpex/examples/kernel/pipelining.py b/numba_dpex/examples/kernel/pipelining.py
index 4272ed5b26..4c9bf29fb5 100644
--- a/numba_dpex/examples/kernel/pipelining.py
+++ b/numba_dpex/examples/kernel/pipelining.py
@@ -20,10 +20,9 @@
 import numpy as np
 
 import numba_dpex as dpex
-import numba_dpex.experimental as dpex_exp
 
 
-@dpex_exp.kernel
+@dpex.kernel
 def async_kernel(x):
     idx = dpex.get_global_id(0)
 
@@ -48,7 +47,7 @@ def run_serial(host_arr, n_itr):
 
         q.memcpy(_a_data, usm_host_data, usm_host_data.nbytes)
 
-        dpex_exp.call_kernel(
+        dpex.call_kernel(
             async_kernel,
             dpex.Range(len(_a)),
             _a,
@@ -85,7 +84,7 @@ def run_pipeline(host_arr, n_itr):
 
         e_a.wait()
 
-        _, e_a = dpex_exp.call_kernel_async(
+        _, e_a = dpex.call_kernel_async(
             async_kernel,
             dpex.Range(len(_a)),
             (e_a,),
diff --git a/numba_dpex/examples/kernel/scan.py b/numba_dpex/examples/kernel/scan.py
index bf2d194414..36f3b41cd2 100644
--- a/numba_dpex/examples/kernel/scan.py
+++ b/numba_dpex/examples/kernel/scan.py
@@ -2,52 +2,55 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-# scan.py is not working due to issue: https://github.com/IntelPython/numba-dpex/issues/829
+"""An implementation of the Hillis-Steele algorithm to compute prefix sums.
+
+The algorithm is implemented to work with a single work group of N work items,
+where N is the number of elements.
+"""
 
 import dpnp as np
 
 import numba_dpex as ndpx
+from numba_dpex import kernel_api as kapi
 
 # 1D array size
 N = 64
 
 
-# Implements Hillis-Steele prefix sum algorithm
 @ndpx.kernel
-def kernel_hillis_steele_scan(a):
+def kernel_hillis_steele_scan(nditem: kapi.NdItem, a, slm_b, slm_c):
     # Get local and global id and workgroup size
-    gid = ndpx.get_global_id(0)
-    lid = ndpx.get_local_id(0)
-    ls = ndpx.get_local_size(0)
-
-    # Create temporals in local memory
-    b = ndpx.local.array(ls, dtype=a.dtype)
-    c = ndpx.local.array(ls, dtype=a.dtype)
+    gid = nditem.get_global_id(0)
+    lid = nditem.get_local_id(0)
+    ls = nditem.get_local_range(0)
+    gr = nditem.get_group()
 
     # Initialize locals
-    c[lid] = b[lid] = a[gid]
-    ndpx.barrier(ndpx.LOCAL_MEM_FENCE)
+    slm_c[lid] = slm_b[lid] = a[gid]
+
+    kapi.group_barrier(gr)
 
     # Calculate prefix sum
     d = 1
     while d < ls:
         if lid > d:
-            c[lid] = b[lid] + b[lid - d]
+            slm_c[lid] = slm_b[lid] + slm_b[lid - d]
         else:
-            c[lid] = b[lid]
+            slm_c[lid] = slm_b[lid]
 
-        ndpx.barrier(ndpx.LOCAL_MEM_FENCE)
+        kapi.group_barrier(gr)
 
         # Swap c and b
-        e = c[lid]
-        c[lid] = b[lid]
-        b[lid] = e
+        e = slm_c[lid]
+        slm_c[lid] = slm_b[lid]
+        slm_b[lid] = e
 
         # Double the stride
         d *= 2
 
-    ndpx.barrier()  # The same as ndpx.barrier(ndpx.GLOBAL_MEM_FENCE)
-    a[gid] = b[lid]
+    kapi.group_barrier(gr, kapi.MemoryScope.DEVICE)
+
+    a[gid] = slm_b[lid]
 
 
 def main():
@@ -56,7 +59,14 @@ def main():
 
     print("Using device ...")
     print(arr.device)
-    kernel_hillis_steele_scan[ndpx.Range(N)](arr)
+
+    # Create temporals in local memory
+    slm_b = kapi.LocalAccessor(N, dtype=arr.dtype)
+    slm_c = kapi.LocalAccessor(N, dtype=arr.dtype)
+
+    ndpx.call_kernel(
+        kernel_hillis_steele_scan, ndpx.NdRange((N,), (N,)), arr, slm_b, slm_c
+    )
 
     # the output should be [0, 1, 3, 6, ...]
     arr_np = np.asnumpy(arr)
diff --git a/numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py b/numba_dpex/examples/kernel/sum_reduction_recursive.py
similarity index 78%
rename from numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py
rename to numba_dpex/examples/kernel/sum_reduction_recursive.py
index 8cd4463464..a6f55a65d9 100644
--- a/numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py
+++ b/numba_dpex/examples/kernel/sum_reduction_recursive.py
@@ -13,21 +13,20 @@
 from numba import int32
 
 import numba_dpex as ndpx
+from numba_dpex import kernel_api as kapi
 
 
 @ndpx.kernel
-def sum_reduction_kernel(A, input_size, partial_sums):
+def sum_reduction_kernel(nditem: kapi.NdItem, A, input_size, partial_sums, slm):
     local_id = ndpx.get_local_id(0)
     global_id = ndpx.get_global_id(0)
     group_size = ndpx.get_local_size(0)
     group_id = ndpx.get_group_id(0)
 
-    local_sums = ndpx.local.array(64, int32)
-
-    local_sums[local_id] = 0
+    slm[local_id] = 0
 
     if global_id < input_size:
-        local_sums[local_id] = A[global_id]
+        slm[local_id] = A[global_id]
 
     # Loop for computing local_sums : divide workgroup into 2 parts
     stride = group_size // 2
@@ -37,12 +36,12 @@ def sum_reduction_kernel(A, input_size, partial_sums):
 
         # Add elements 2 by 2 between local_id and local_id + stride
         if local_id < stride:
-            local_sums[local_id] += local_sums[local_id + stride]
+            slm[local_id] += slm[local_id + stride]
 
         stride >>= 1
 
     if local_id == 0:
-        partial_sums[group_id] = local_sums[0]
+        partial_sums[group_id] = slm[0]
 
 
 def sum_recursive_reduction(size, group_size, Dinp, Dpartial_sums):
@@ -60,12 +59,24 @@ def sum_recursive_reduction(size, group_size, Dinp, Dpartial_sums):
 
     gr = ndpx.Range(passed_size)
     lr = ndpx.Range(group_size)
-
-    sum_reduction_kernel[ndpx.NdRange(gr, lr)](Dinp, size, Dpartial_sums)
+    slm = kapi.LocalAccessor(64, Dinp.dtype)
+    ndpx.call_kernel(
+        sum_reduction_kernel,
+        ndpx.NdRange(gr, lr),
+        Dinp,
+        size,
+        Dpartial_sums,
+        slm,
+    )
 
     if nb_work_groups <= group_size:
-        sum_reduction_kernel[ndpx.NdRange(lr, lr)](
-            Dpartial_sums, nb_work_groups, Dinp
+        ndpx.call_kernel(
+            sum_reduction_kernel,
+            ndpx.NdRange(lr, lr),
+            Dpartial_sums,
+            nb_work_groups,
+            Dinp,
+            slm,
         )
         result = int(Dinp[0])
     else:
diff --git a/numba_dpex/examples/kernel/sum_reduction_ocl.py b/numba_dpex/examples/kernel/sum_reduction_tree.py
similarity index 61%
rename from numba_dpex/examples/kernel/sum_reduction_ocl.py
rename to numba_dpex/examples/kernel/sum_reduction_tree.py
index 889fca0082..d2375ca5a3 100644
--- a/numba_dpex/examples/kernel/sum_reduction_ocl.py
+++ b/numba_dpex/examples/kernel/sum_reduction_tree.py
@@ -2,43 +2,49 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+""" Demonstration of a simple tree reduction algorithm to reduce an array of scalars.
+
+The algorithm works in two steps: First an nd-range kernel is launched to
+calculate a partially reduced array. The size of the partially reduced array is
+equal to the number of work groups over which the initial reduction is done.
+The partial results are then summed up on the host device.
+"""
 import dpctl
 import dpctl.tensor as dpt
-from numba import int32
 
 import numba_dpex as ndpx
+from numba_dpex import kernel_api as kapi
 
 
 @ndpx.kernel
-def sum_reduction_kernel(A, partial_sums):
+def sum_reduction_kernel(nditem: kapi.NdItem, A, partial_sums, slm):
     """
     The example demonstrates a reduction kernel implemented as a ``kernel``
     function.
     """
-    local_id = ndpx.get_local_id(0)
-    global_id = ndpx.get_global_id(0)
-    group_size = ndpx.get_local_size(0)
-    group_id = ndpx.get_group_id(0)
-
-    local_sums = ndpx.local.array(64, int32)
+    local_id = nditem.get_local_id(0)
+    global_id = nditem.get_global_id(0)
+    group_size = nditem.get_local_range(0)
+    gr = nditem.get_group()
+    group_id = gr.get_group_id(0)
 
     # Copy from global to local memory
-    local_sums[local_id] = A[global_id]
+    slm[local_id] = A[global_id]
 
     # Loop for computing local_sums : divide workgroup into 2 parts
     stride = group_size // 2
     while stride > 0:
         # Waiting for each 2x2 addition into given workgroup
-        ndpx.barrier(ndpx.LOCAL_MEM_FENCE)
+        kapi.group_barrier(gr)
 
         # Add elements 2 by 2 between local_id and local_id + stride
         if local_id < stride:
-            local_sums[local_id] += local_sums[local_id + stride]
+            slm[local_id] += slm[local_id + stride]
 
         stride >>= 1
 
     if local_id == 0:
-        partial_sums[group_id] = local_sums[0]
+        partial_sums[group_id] = slm[0]
 
 
 def sum_reduce(A):
@@ -51,7 +57,10 @@ def sum_reduce(A):
 
     gs = ndpx.Range(global_size)
     ls = ndpx.Range(work_group_size)
-    sum_reduction_kernel[ndpx.NdRange(gs, ls)](A, partial_sums)
+    slm = kapi.LocalAccessor(64, A.dtype)
+    ndpx.call_kernel(
+        sum_reduction_kernel, ndpx.NdRange(gs, ls), A, partial_sums, slm
+    )
 
     final_sum = 0
     # calculate the final sum in HOST
diff --git a/numba_dpex/examples/kernel/vector_sum.py b/numba_dpex/examples/kernel/vector_sum.py
index 6b89cbe112..dc0bf7643d 100644
--- a/numba_dpex/examples/kernel/vector_sum.py
+++ b/numba_dpex/examples/kernel/vector_sum.py
@@ -2,6 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+"""The example demonstrates a 1D vector addition kernel.
+"""
+
 import dpnp
 import numpy.testing as testing
 
@@ -10,14 +13,14 @@
 
 # Data parallel kernel implementing vector sum
 @ndpx.kernel
-def kernel_vector_sum(a, b, c):
-    i = ndpx.get_global_id(0)
+def kernel_vector_sum(item, a, b, c):
+    i = item.get_id(0)
     c[i] = a[i] + b[i]
 
 
 # Utility function for printing and testing
 def driver(a, b, c, global_size):
-    kernel_vector_sum[ndpx.Range(global_size)](a, b, c)
+    ndpx.call_kernel(kernel_vector_sum, ndpx.Range(global_size), a, b, c)
     a_np = dpnp.asnumpy(a)  # Copy dpnp array a to NumPy array a_np
     b_np = dpnp.asnumpy(b)  # Copy dpnp array b to NumPy array b_np
     c_np = dpnp.asnumpy(c)  # Copy dpnp array c to NumPy array c_np
@@ -35,8 +38,8 @@ def main():
     b = dpnp.random.random(N)
     c = dpnp.ones_like(a)
 
-    print("Using device ...")
-    print(a.device)
+    print("Executing on device:")
+    a.device.print_device_info()
     driver(a, b, c, global_size)
     print("Done...")
 
diff --git a/numba_dpex/examples/kernel/vector_sum2D.py b/numba_dpex/examples/kernel/vector_sum2D.py
index 0dd60d1165..79fb857711 100644
--- a/numba_dpex/examples/kernel/vector_sum2D.py
+++ b/numba_dpex/examples/kernel/vector_sum2D.py
@@ -4,6 +4,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+"""The example demonstrates a 2-D vector addition kernel.
+"""
+
 import dpctl
 import dpctl.tensor as dpt
 import numpy as np
@@ -12,17 +15,17 @@
 
 
 @ndpx.kernel
-def data_parallel_sum(a, b, c):
+def data_parallel_sum(item, a, b, c):
     """
     A two-dimensional vector addition example using the ``kernel`` decorator.
     """
-    i = ndpx.get_global_id(0)
-    j = ndpx.get_global_id(1)
+    i = item.get_id(0)
+    j = item.get_id(1)
     c[i, j] = a[i, j] + b[i, j]
 
 
 def driver(a, b, c, global_size):
-    data_parallel_sum[global_size](a, b, c)
+    ndpx.call_kernel(data_parallel_sum, global_size, a, b, c)
 
 
 def main():
@@ -45,7 +48,7 @@ def main():
     c_dpt = dpt.empty_like(a_dpt)
     c_dpt = dpt.reshape(c_dpt, (X, Y))
 
-    print("Using device ...")
+    print("Executing on device:")
     device.print_device_info()
 
     print("Running kernel ...")

From 888c0c38fe5d797c343606482d5626e8402722c7 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 22 Mar 2024 01:33:16 -0500
Subject: [PATCH 09/14] Update API doc generation scripts.

---
 docs/_templates/autoapi/index.rst | 4 ++--
 docs/source/conf.py               | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/_templates/autoapi/index.rst b/docs/_templates/autoapi/index.rst
index 6f37ea6912..f111f90ed9 100644
--- a/docs/_templates/autoapi/index.rst
+++ b/docs/_templates/autoapi/index.rst
@@ -7,8 +7,8 @@ This page contains auto-generated API reference documentation [#f1]_.
    :maxdepth: 1
 
    numba_dpex/kernel_api/index
-   numba_dpex/experimental/decorators/index
-   numba_dpex/experimental/launcher/index
+   numba_dpex/core/decorators/index
+   numba_dpex/core/kernel_launcher/index
 
    {% for page in pages %}
    {% if page.top_level_object and page.display %}
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6b053075c3..4276307360 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -115,7 +115,7 @@
 
 autoapi_dirs = [
     "../../numba_dpex/kernel_api",
-    "../../numba_dpex/experimental",
+    "../../numba_dpex/core",
 ]
 autoapi_type = "python"
 

From 1e48797abd7e6409131bdfc1b62ba1779fb50bcf Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 22 Mar 2024 01:51:28 -0500
Subject: [PATCH 10/14] Remove redundant example.

---
 .../examples/{ => dpjit}/blacksholes_njit.py  |  0
 numba_dpex/examples/sum_reduction.py          | 53 -------------------
 2 files changed, 53 deletions(-)
 rename numba_dpex/examples/{ => dpjit}/blacksholes_njit.py (100%)
 delete mode 100644 numba_dpex/examples/sum_reduction.py

diff --git a/numba_dpex/examples/blacksholes_njit.py b/numba_dpex/examples/dpjit/blacksholes_njit.py
similarity index 100%
rename from numba_dpex/examples/blacksholes_njit.py
rename to numba_dpex/examples/dpjit/blacksholes_njit.py
diff --git a/numba_dpex/examples/sum_reduction.py b/numba_dpex/examples/sum_reduction.py
deleted file mode 100644
index 2c7e66423b..0000000000
--- a/numba_dpex/examples/sum_reduction.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2024 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import math
-
-import dpnp as np
-
-import numba_dpex as ndpx
-
-
-@ndpx.kernel
-def sum_reduction_kernel(A, R, stride):
-    i = ndpx.get_global_id(0)
-    # sum two element
-    R[i] = A[i] + A[i + stride]
-    # store the sum to be used in nex iteration
-    A[i] = R[i]
-
-
-def sum_reduce(A):
-    """Size of A should be power of two."""
-    total = len(A)
-    # max size will require half the size of A to store sum
-    R = np.array(np.random.random(math.floor(total / 2)), dtype=A.dtype)
-
-    while total > 1:
-        global_size = math.floor(total // 2)
-        total = total - global_size
-        sum_reduction_kernel[ndpx.Range(global_size)](A, R, total)
-
-    return R[0]
-
-
-def test_sum_reduce():
-    N = 2048
-
-    A = np.arange(N, dtype=np.float32)
-    A_copy = np.arange(N, dtype=np.float32)
-
-    actual = sum_reduce(A)
-    expected = A_copy.sum()
-
-    print("Actual:  ", actual)
-    print("Expected:", expected)
-
-    assert expected - actual < 1e-2
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    test_sum_reduce()

From 0fc57fe645d5cacd128f146903e05722d2c19a4a Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 22 Mar 2024 11:17:55 -0500
Subject: [PATCH 11/14] Fix include path in kernel_builder

---
 numba_dpex/core/parfors/kernel_builder.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/numba_dpex/core/parfors/kernel_builder.py b/numba_dpex/core/parfors/kernel_builder.py
index 588a617684..b43b5fa31c 100644
--- a/numba_dpex/core/parfors/kernel_builder.py
+++ b/numba_dpex/core/parfors/kernel_builder.py
@@ -26,9 +26,10 @@
 
 import numba_dpex as dpex
 from numba_dpex.core import config
+from numba_dpex.core.kernel_interface.spirv_kernel import SpirvKernel
 
 from ..descriptor import dpex_kernel_target
-from ..types import DpnpNdArray, USMNdArray
+from ..types import DpnpNdArray
 from ..utils.kernel_templates import RangeKernelTemplate
 
 
@@ -66,9 +67,7 @@ def _compile_kernel_parfor(
     sycl_queue, kernel_name, func_ir, argtypes, debug=False
 ):
     # Create a SPIRVKernel object
-    kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel(
-        func_ir, kernel_name
-    )
+    kernel = SpirvKernel(func_ir, kernel_name)
 
     # compile the kernel
     kernel.compile(

From 716a8c88d4d1c51f50f483febe851ece0a428c1c Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 22 Mar 2024 13:27:38 -0500
Subject: [PATCH 12/14] Port all debug examples to new API

---
 numba_dpex/examples/debug/dpex_func.py             | 9 ++++-----
 numba_dpex/examples/debug/side-by-side-2.py        | 9 ++++-----
 numba_dpex/examples/debug/side-by-side.py          | 9 ++++-----
 numba_dpex/examples/debug/simple_dpex_func.py      | 8 ++++----
 numba_dpex/examples/debug/simple_sum.py            | 6 +++---
 numba_dpex/examples/debug/sum.py                   | 6 +++---
 numba_dpex/examples/debug/sum_local_vars.py        | 5 ++---
 numba_dpex/examples/debug/sum_local_vars_revive.py | 8 ++++----
 8 files changed, 28 insertions(+), 32 deletions(-)

diff --git a/numba_dpex/examples/debug/dpex_func.py b/numba_dpex/examples/debug/dpex_func.py
index 321eaeb004..f7573f3c51 100644
--- a/numba_dpex/examples/debug/dpex_func.py
+++ b/numba_dpex/examples/debug/dpex_func.py
@@ -2,21 +2,20 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import dpctl
 import dpnp as np
 
 import numba_dpex as ndpx
 
 
-@ndpx.func(debug=True)
+@ndpx.device_func(debug=True)
 def func_sum(a_in_func, b_in_func):
     result = a_in_func + b_in_func
     return result
 
 
 @ndpx.kernel(debug=True)
-def kernel_sum(a_in_kernel, b_in_kernel, c_in_kernel):
-    i = ndpx.get_global_id(0)
+def kernel_sum(item, a_in_kernel, b_in_kernel, c_in_kernel):
+    i = item.get_id(0)
     c_in_kernel[i] = func_sum(a_in_kernel[i], b_in_kernel[i])
 
 
@@ -24,7 +23,7 @@ def driver(a, b, c, global_size):
     print("a = ", a)
     print("b = ", b)
     print("c = ", c)
-    kernel_sum[ndpx.Range(global_size)](a, b, c)
+    ndpx.call_kernel(kernel_sum, ndpx.Range(global_size), a, b, c)
     print("a + b = ", c)
 
 
diff --git a/numba_dpex/examples/debug/side-by-side-2.py b/numba_dpex/examples/debug/side-by-side-2.py
index 15301b7a3e..3295ca1d3a 100644
--- a/numba_dpex/examples/debug/side-by-side-2.py
+++ b/numba_dpex/examples/debug/side-by-side-2.py
@@ -4,7 +4,6 @@
 
 import argparse
 
-import dpctl
 import dpnp
 import numba
 import numpy as np
@@ -57,17 +56,17 @@ def numba_func_driver(a, b, c):
 
 
 def ndpx_func_driver(a, b, c):
-    kernel[ndpx.Range(len(c))](a, b, c)
+    ndpx.call_kernel(_kernel, ndpx.Range(len(c)), a, b, c)
 
 
 @ndpx.kernel(debug=True)
-def kernel(a_in_kernel, b_in_kernel, c_in_kernel):
-    i = ndpx.get_global_id(0)
+def _kernel(item, a_in_kernel, b_in_kernel, c_in_kernel):
+    i = item.get_id(0)
     c_in_kernel[i] = ndpx_loop_body(i, a_in_kernel, b_in_kernel)
 
 
 numba_loop_body = numba.njit(debug=True)(common_loop_body)
-ndpx_loop_body = ndpx.func(debug=True)(common_loop_body)
+ndpx_loop_body = ndpx.device_func(debug=True)(common_loop_body)
 
 
 def main():
diff --git a/numba_dpex/examples/debug/side-by-side.py b/numba_dpex/examples/debug/side-by-side.py
index 0c26d5be71..5df0b551eb 100644
--- a/numba_dpex/examples/debug/side-by-side.py
+++ b/numba_dpex/examples/debug/side-by-side.py
@@ -4,7 +4,6 @@
 
 import argparse
 
-import dpctl
 import dpnp
 import numba
 import numpy as np
@@ -55,17 +54,17 @@ def numba_func_driver(a, b, c):
 
 
 def ndpx_func_driver(a, b, c):
-    kernel[ndpx.Range(len(c))](a, b, c)
+    ndpx.call_kernel(_kernel, ndpx.Range(len(c)), a, b, c)
 
 
 @ndpx.kernel(debug=True)
-def kernel(a_in_kernel, b_in_kernel, c_in_kernel):
-    i = ndpx.get_global_id(0)
+def _kernel(item, a_in_kernel, b_in_kernel, c_in_kernel):
+    i = item.get_global_id(0)
     c_in_kernel[i] = ndpx_loop_body(a_in_kernel[i], b_in_kernel[i])
 
 
 numba_loop_body = numba.njit(debug=True)(common_loop_body)
-ndpx_loop_body = ndpx.func(debug=True)(common_loop_body)
+ndpx_loop_body = ndpx.device_func(debug=True)(common_loop_body)
 
 
 def main():
diff --git a/numba_dpex/examples/debug/simple_dpex_func.py b/numba_dpex/examples/debug/simple_dpex_func.py
index a7e2c241cb..e16cc1224b 100644
--- a/numba_dpex/examples/debug/simple_dpex_func.py
+++ b/numba_dpex/examples/debug/simple_dpex_func.py
@@ -7,15 +7,15 @@
 import numba_dpex as ndpx
 
 
-@ndpx.func(debug=True)
+@ndpx.device_func(debug=True)
 def func_sum(a_in_func, b_in_func):
     result = a_in_func + b_in_func  # breakpoint location
     return result
 
 
 @ndpx.kernel(debug=True)
-def kernel_sum(a_in_kernel, b_in_kernel, c_in_kernel):
-    i = ndpx.get_global_id(0)
+def kernel_sum(item, a_in_kernel, b_in_kernel, c_in_kernel):
+    i = item.get_id(0)
     c_in_kernel[i] = func_sum(a_in_kernel[i], b_in_kernel[i])
 
 
@@ -24,6 +24,6 @@ def kernel_sum(a_in_kernel, b_in_kernel, c_in_kernel):
 b = np.arange(global_size, dtype=np.float32)
 c = np.empty_like(a)
 
-kernel_sum[ndpx.Range(global_size)](a, b, c)
+ndpx.call_kernel(kernel_sum, ndpx.Range(global_size), a, b, c)
 
 print("Done...")
diff --git a/numba_dpex/examples/debug/simple_sum.py b/numba_dpex/examples/debug/simple_sum.py
index 0bfa88396f..b676b927fe 100644
--- a/numba_dpex/examples/debug/simple_sum.py
+++ b/numba_dpex/examples/debug/simple_sum.py
@@ -8,8 +8,8 @@
 
 
 @ndpx.kernel(debug=True)
-def data_parallel_sum(a, b, c):
-    i = ndpx.get_global_id(0)
+def data_parallel_sum(item, a, b, c):
+    i = item.get_id(0)
     c[i] = a[i] + b[i]  # Condition breakpoint location
 
 
@@ -20,6 +20,6 @@ def data_parallel_sum(a, b, c):
 b = np.array(np.random.random(N), dtype=np.float32)
 c = np.ones_like(a)
 
-data_parallel_sum[ndpx.Range(global_size)](a, b, c)
+ndpx.call_kernel(data_parallel_sum, ndpx.Range(global_size), a, b, c)
 
 print("Done...")
diff --git a/numba_dpex/examples/debug/sum.py b/numba_dpex/examples/debug/sum.py
index 9974a65a96..9bed988eab 100644
--- a/numba_dpex/examples/debug/sum.py
+++ b/numba_dpex/examples/debug/sum.py
@@ -8,8 +8,8 @@
 
 
 @ndpx.kernel(debug=True)
-def data_parallel_sum(a_in_kernel, b_in_kernel, c_in_kernel):
-    i = ndpx.get_global_id(0)  # numba-kernel-breakpoint
+def data_parallel_sum(item, a_in_kernel, b_in_kernel, c_in_kernel):
+    i = item.get_id(0)  # numba-kernel-breakpoint
     l1 = a_in_kernel[i]  # second-line
     l2 = b_in_kernel[i]  # third-line
     c_in_kernel[i] = l1 + l2  # fourth-line
@@ -19,7 +19,7 @@ def driver(a, b, c, global_size):
     print("before : ", a)
     print("before : ", b)
     print("before : ", c)
-    data_parallel_sum[ndpx.Range(global_size)](a, b, c)
+    ndpx.call_kernel(data_parallel_sum, ndpx.Range(global_size), a, b, c)
     print("after : ", c)
 
 
diff --git a/numba_dpex/examples/debug/sum_local_vars.py b/numba_dpex/examples/debug/sum_local_vars.py
index 537aab40ce..0440e6857e 100644
--- a/numba_dpex/examples/debug/sum_local_vars.py
+++ b/numba_dpex/examples/debug/sum_local_vars.py
@@ -2,14 +2,13 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import dpctl
 import dpnp as np
 
 import numba_dpex as ndpx
 
 
 @ndpx.kernel(debug=True)
-def data_parallel_sum(a, b, c):
+def data_parallel_sum(item, a, b, c):
     i = ndpx.get_global_id(0)
     l1 = a[i] + 2.5
     l2 = b[i] * 0.3
@@ -23,6 +22,6 @@ def data_parallel_sum(a, b, c):
 b = np.array(np.random.random(N), dtype=np.float32)
 c = np.ones_like(a)
 
-data_parallel_sum[ndpx.Range(global_size)](a, b, c)
+ndpx.call_kernel(data_parallel_sum, ndpx.Range(global_size), a, b, c)
 
 print("Done...")
diff --git a/numba_dpex/examples/debug/sum_local_vars_revive.py b/numba_dpex/examples/debug/sum_local_vars_revive.py
index 11b53d99f3..47e4ad3a8e 100644
--- a/numba_dpex/examples/debug/sum_local_vars_revive.py
+++ b/numba_dpex/examples/debug/sum_local_vars_revive.py
@@ -7,14 +7,14 @@
 import numba_dpex as ndpx
 
 
-@ndpx.func
+@ndpx.device_func
 def revive(x):
     return x
 
 
 @ndpx.kernel(debug=True)
-def data_parallel_sum(a, b, c):
-    i = ndpx.get_global_id(0)
+def data_parallel_sum(item, a, b, c):
+    i = item.get_id(0)
     l1 = a[i] + 2.5
     l2 = b[i] * 0.3
     c[i] = l1 + l2
@@ -28,6 +28,6 @@ def data_parallel_sum(a, b, c):
 b = np.array(np.random.random(N), dtype=np.float32)
 c = np.ones_like(a)
 
-data_parallel_sum[ndpx.Range(global_size)](a, b, c)
+ndpx.call_kernel(data_parallel_sum, ndpx.Range(global_size), a, b, c)
 
 print("Done...")

From d0ad17e8b58ab371e7c3ab67c7cc2116bc3dac1a Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Fri, 22 Mar 2024 13:28:01 -0500
Subject: [PATCH 13/14] Remove leftover print

---
 numba_dpex/core/utils/kernel_launcher.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/numba_dpex/core/utils/kernel_launcher.py b/numba_dpex/core/utils/kernel_launcher.py
index 7c48589d49..8bc181d1ab 100644
--- a/numba_dpex/core/utils/kernel_launcher.py
+++ b/numba_dpex/core/utils/kernel_launcher.py
@@ -328,8 +328,6 @@ def set_kernel_from_spirv(
                     "unexpected behavior"
                 )
 
-        print(build_kernel_options)
-
         if build_kernel_options != "":
             spv_compiler_options = self.context.insert_const_string(
                 self.builder.module, build_kernel_options

From fb5642c41cbdaa295f70ce1debe9707d7be4c0e8 Mon Sep 17 00:00:00 2001
From: Yevhenii Havrylko <yevhenii.havrylko@intel.com>
Date: Fri, 22 Mar 2024 19:41:16 -0400
Subject: [PATCH 14/14] Fix gdb tests

---
 numba_dpex/examples/debug/side-by-side.py      |  2 +-
 numba_dpex/tests/debugging/test_breakpoints.py |  6 +++---
 numba_dpex/tests/debugging/test_info.py        | 18 +++++++++---------
 numba_dpex/tests/debugging/test_stepping.py    | 15 +++++++++------
 4 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/numba_dpex/examples/debug/side-by-side.py b/numba_dpex/examples/debug/side-by-side.py
index 5df0b551eb..972b05844e 100644
--- a/numba_dpex/examples/debug/side-by-side.py
+++ b/numba_dpex/examples/debug/side-by-side.py
@@ -59,7 +59,7 @@ def ndpx_func_driver(a, b, c):
 
 @ndpx.kernel(debug=True)
 def _kernel(item, a_in_kernel, b_in_kernel, c_in_kernel):
-    i = item.get_global_id(0)
+    i = item.get_id(0)
     c_in_kernel[i] = ndpx_loop_body(a_in_kernel[i], b_in_kernel[i])
 
 
diff --git a/numba_dpex/tests/debugging/test_breakpoints.py b/numba_dpex/tests/debugging/test_breakpoints.py
index bcb87fe179..ef178e5415 100644
--- a/numba_dpex/tests/debugging/test_breakpoints.py
+++ b/numba_dpex/tests/debugging/test_breakpoints.py
@@ -21,7 +21,7 @@
 @pytest.mark.parametrize(
     "breakpoint",
     [
-        "side-by-side.py:16",
+        "side-by-side.py:15",
         "common_loop_body",
         "side-by-side.py:common_loop_body",
     ],
@@ -56,14 +56,14 @@ def test_device_func_breakpoint(
     SAT-4449
     """
 
-    if api == "numba-ndpx-kernel" and breakpoint != "side-by-side.py:16":
+    if api == "numba-ndpx-kernel" and breakpoint != "side-by-side.py:15":
         pytest.skip(
             "Breakpoint by function name not working for numba-dpex."
         )  # TODO: https://github.com/IntelPython/numba-dpex/issues/1242
 
     app.breakpoint(breakpoint, condition=condition)
     app.run(f"side-by-side.py --api={api}")
-    app.expect_hit_breakpoint("side-by-side.py:16")
+    app.expect_hit_breakpoint("side-by-side.py:15")
     if exp_var is not None:
         app.print(exp_var, expected=exp_val)
 
diff --git a/numba_dpex/tests/debugging/test_info.py b/numba_dpex/tests/debugging/test_info.py
index e09d37858a..421aba8e16 100644
--- a/numba_dpex/tests/debugging/test_info.py
+++ b/numba_dpex/tests/debugging/test_info.py
@@ -24,7 +24,7 @@
         (
             "simple_dpex_func.py:18",
             "simple_dpex_func.py",
-            r"18\s+i = ndpx\.get_global_id\(0\)",
+            r"18\s+i = item\.get_id\(0\)",
             [
                 (
                     "a_in_kernel",
@@ -47,18 +47,18 @@
             ],
         ),
         (
-            "side-by-side.py:16",
+            "side-by-side.py:15",
             "side-by-side.py --api=numba",
-            r"16\s+param_c = param_a \+ numba\.float32\(10\)",
+            r"15\s+param_c = param_a \+ numba\.float32\(10\)",
             [
                 ("param_a", r"[0-9]+", r"type = float32", r"type = float32"),
                 ("param_b", r"[0-9]+", r"type = float32", r"type = float32"),
             ],
         ),
         (
-            "side-by-side.py:16",
+            "side-by-side.py:15",
             "side-by-side.py --api=numba-ndpx-kernel",
-            r"16\s+param_c = param_a \+ numba\.float32\(10\)",
+            r"15\s+param_c = param_a \+ numba\.float32\(10\)",
             [
                 ("param_a", r"[0-9]+", r"type = float32", r"type = float32"),
                 ("param_b", r"[0-9]+", r"type = float32", r"type = float32"),
@@ -100,7 +100,7 @@ def test_info_functions(app):
     app.breakpoint("simple_sum.py:12")
     app.run("simple_sum.py")
     app.expect_hit_breakpoint("simple_sum.py:12")
-    app.expect(r"12\s+i = ndpx.get_global_id\(0\)", with_eol=True)
+    app.expect(r"12\s+i = item.get_id\(0\)", with_eol=True)
 
     app.info_functions("data_parallel_sum")
 
@@ -117,7 +117,7 @@ def test_info_functions(app):
 def test_print_array_element(app, api):
     """Test access to array elements"""
 
-    app.breakpoint("side-by-side-2.py:17 if param_a == 5")
+    app.breakpoint("side-by-side-2.py:17", condition="param_a == 5")
     app.run(f"side-by-side-2.py --api={api}")
     app.expect_hit_breakpoint("side-by-side-2.py:17")
 
@@ -140,9 +140,9 @@ def test_print_array_element(app, api):
     ],
 )
 def test_assignment_to_variable(app, api, assign):
-    app.breakpoint("side-by-side-2.py:18", condition="param_a == 5")
+    app.breakpoint("side-by-side-2.py:17", condition="param_a == 5")
     app.run(f"side-by-side-2.py --api={api}")
-    app.expect_hit_breakpoint("side-by-side-2.py:18")
+    app.expect_hit_breakpoint("side-by-side-2.py:17")
 
     app.print("param_a", expected=5)
     if assign == "print":
diff --git a/numba_dpex/tests/debugging/test_stepping.py b/numba_dpex/tests/debugging/test_stepping.py
index 44f7e89246..58464350b4 100644
--- a/numba_dpex/tests/debugging/test_stepping.py
+++ b/numba_dpex/tests/debugging/test_stepping.py
@@ -22,7 +22,7 @@ def test_next(app: gdb):
     app.breakpoint("simple_dpex_func.py:18")
     app.run("simple_dpex_func.py")
     app.expect_hit_breakpoint("simple_dpex_func.py:18")
-    app.expect(r"18\s+i = ndpx.get_global_id\(0\)", with_eol=True)
+    app.expect(r"18\s+i = item.get_id\(0\)", with_eol=True)
     app.set_scheduler_lock()
     app.next()
     app.expect(
@@ -35,19 +35,22 @@ def test_next(app: gdb):
 
 
 def test_step(app: gdb):
-    app.breakpoint("simple_dpex_func.py:18")
+    app.breakpoint("simple_dpex_func.py:19")
     app.run("simple_dpex_func.py")
-    app.expect_hit_breakpoint("simple_dpex_func.py:18")
-    app.expect(r"18\s+i = ndpx.get_global_id\(0\)", with_eol=True)
-    app.set_scheduler_lock()
-    app.step()
+    app.expect_hit_breakpoint("simple_dpex_func.py:19")
     app.expect(
         r"19\s+c_in_kernel\[i\] = func_sum\(a_in_kernel\[i\], b_in_kernel\[i\]\)",
         with_eol=True,
     )
+    app.set_scheduler_lock()
     app.step()
     app.expect(r"__main__::func_sum.* at simple_dpex_func.py:12", with_eol=True)
     app.expect(r"12\s+result = a_in_func \+ b_in_func", with_eol=True)
+    app.step()
+    app.expect(
+        r"13\s+return result",
+        with_eol=True,
+    )
 
 
 @pytest.mark.parametrize("func", ["stepi", "nexti"])