From fedd4d5066144703e8aa97c24a89c80f362f0b39 Mon Sep 17 00:00:00 2001
From: Yevhenii Havrylko <yevhenii.havrylko@intel.com>
Date: Wed, 13 Mar 2024 19:12:29 -0400
Subject: [PATCH 1/5] Add llvm.assume to spirv indexing functions

---
 .../_index_space_id_overloads.py                    | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_index_space_id_overloads.py b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_index_space_id_overloads.py
index 86506fdb42..93f87ca3ca 100644
--- a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_index_space_id_overloads.py
+++ b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_index_space_id_overloads.py
@@ -54,7 +54,7 @@ def _intrinsic_spirv_global_index_const(
     sig = types.int64(types.int32)
 
     def _intrinsic_spirv_global_index_const_gen(
-        context: SPIRVTargetContext,
+        context: SPIRVTargetContext,  # pylint: disable=unused-argument
         builder: llvmir.IRBuilder,
         sig,  # pylint: disable=unused-argument
         args,
@@ -79,7 +79,16 @@ def _intrinsic_spirv_global_index_const_gen(
             dim,
         )
 
-        return context.cast(builder, res, types.uintp, types.intp)
+        # Generating same check as sycl does. Did they add it to avoid pointer
+        # bitcast on special constant?
+        max_int32 = llvmir.Constant(res.type, 2147483648)
+        cmp = builder.icmp_unsigned("<", res, max_int32)
+
+        inst = builder.assume(cmp)
+        # TODO: tail does not always work
+        inst.tail = "tail"
+
+        return res
 
     return sig, _intrinsic_spirv_global_index_const_gen
 

From 3f0d0cf2e8693ad3459d90bc77fe875efc6cfea2 Mon Sep 17 00:00:00 2001
From: Yevhenii Havrylko <yevhenii.havrylko@intel.com>
Date: Wed, 6 Mar 2024 13:14:22 -0500
Subject: [PATCH 2/5] Add fill_zeros to private array

---
 .../_private_array_overloads.py               | 20 ++++++++++--
 numba_dpex/kernel_api/private_array.py        |  9 ++++--
 .../tests/experimental/test_private_array.py  | 32 ++++++++++++++++++-
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_private_array_overloads.py b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_private_array_overloads.py
index 66f7b7e835..e0796a2f9c 100644
--- a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_private_array_overloads.py
+++ b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_private_array_overloads.py
@@ -9,6 +9,7 @@
 
 import llvmlite.ir as llvmir
 from llvmlite.ir.builder import IRBuilder
+from numba.core import cgutils
 from numba.core.typing.npydecl import parse_dtype as _ty_parse_dtype
 from numba.core.typing.npydecl import parse_shape as _ty_parse_shape
 from numba.core.typing.templates import Signature
@@ -28,9 +29,13 @@
 
 @intrinsic(target=DPEX_KERNEL_EXP_TARGET_NAME)
 def _intrinsic_private_array_ctor(
-    ty_context, ty_shape, ty_dtype  # pylint: disable=unused-argument
+    ty_context,  # pylint: disable=unused-argument
+    ty_shape,
+    ty_dtype,
+    ty_fill_zeros,
 ):
     require_literal(ty_shape)
+    require_literal(ty_fill_zeros)
 
     ty_array = USMNdArray(
         dtype=_ty_parse_dtype(ty_dtype),
@@ -39,7 +44,7 @@ def _intrinsic_private_array_ctor(
         addrspace=AddressSpace.PRIVATE,
     )
 
-    sig = ty_array(ty_shape, ty_dtype)
+    sig = ty_array(ty_shape, ty_dtype, ty_fill_zeros)
 
     def codegen(
         context: DpexExpKernelTypingContext,
@@ -49,11 +54,18 @@ def codegen(
     ):
         shape = args[0]
         ty_shape = sig.args[0]
+        ty_fill_zeros = sig.args[-1]
         ty_array = sig.return_type
 
         ary = make_spirv_generic_array_on_stack(
             context, builder, ty_array, ty_shape, shape
         )
+
+        if ty_fill_zeros.literal_value:
+            cgutils.memset(
+                builder, ary.data, builder.mul(ary.itemsize, ary.nitems), 0
+            )
+
         return ary._getvalue()  # pylint: disable=protected-access
 
     return (
@@ -70,6 +82,7 @@ def codegen(
 def ol_private_array_ctor(
     shape,
     dtype,
+    fill_zeros=False,
 ):
     """Overload of the constructor for the class
     class:`numba_dpex.kernel_api.PrivateArray`.
@@ -84,8 +97,9 @@ def ol_private_array_ctor(
     def ol_private_array_ctor_impl(
         shape,
         dtype,
+        fill_zeros=False,
     ):
         # pylint: disable=no-value-for-parameter
-        return _intrinsic_private_array_ctor(shape, dtype)
+        return _intrinsic_private_array_ctor(shape, dtype, fill_zeros)
 
     return ol_private_array_ctor_impl
diff --git a/numba_dpex/kernel_api/private_array.py b/numba_dpex/kernel_api/private_array.py
index 7393cc71b8..95b9a7ae2a 100644
--- a/numba_dpex/kernel_api/private_array.py
+++ b/numba_dpex/kernel_api/private_array.py
@@ -7,7 +7,7 @@
 kernel function.
 """
 
-from numpy import ndarray
+import numpy as np
 
 
 class PrivateArray:
@@ -16,10 +16,13 @@ class PrivateArray:
     inside kernel work item.
     """
 
-    def __init__(self, shape, dtype) -> None:
+    def __init__(self, shape, dtype, fill_zeros=False) -> None:
         """Creates a new PrivateArray instance of the given shape and dtype."""
 
-        self._data = ndarray(shape=shape, dtype=dtype)
+        if fill_zeros:
+            self._data = np.zeros(shape=shape, dtype=dtype)
+        else:
+            self._data = np.empty(shape=shape, dtype=dtype)
 
     def __getitem__(self, idx_obj):
         """Returns the value stored at the position represented by idx_obj in
diff --git a/numba_dpex/tests/experimental/test_private_array.py b/numba_dpex/tests/experimental/test_private_array.py
index fcbf69b825..fa6af6f58b 100644
--- a/numba_dpex/tests/experimental/test_private_array.py
+++ b/numba_dpex/tests/experimental/test_private_array.py
@@ -23,6 +23,30 @@ def private_array_kernel(item: Item, a):
         a[i] += p[j]
 
 
+def private_array_kernel_fill_true(item: Item, a):
+    i = item.get_linear_id()
+    p = PrivateArray(10, a.dtype, fill_zeros=True)
+
+    for j in range(10):
+        p[j] = j * j
+
+    a[i] = 0
+    for j in range(10):
+        a[i] += p[j]
+
+
+def private_array_kernel_fill_false(item: Item, a):
+    i = item.get_linear_id()
+    p = PrivateArray(10, a.dtype, fill_zeros=False)
+
+    for j in range(10):
+        p[j] = j * j
+
+    a[i] = 0
+    for j in range(10):
+        a[i] += p[j]
+
+
 def private_2d_array_kernel(item: Item, a):
     i = item.get_linear_id()
     p = PrivateArray(shape=(5, 2), dtype=a.dtype)
@@ -36,7 +60,13 @@ def private_2d_array_kernel(item: Item, a):
 
 
 @pytest.mark.parametrize(
-    "kernel", [private_array_kernel, private_2d_array_kernel]
+    "kernel",
+    [
+        private_array_kernel,
+        private_array_kernel_fill_true,
+        private_array_kernel_fill_false,
+        private_2d_array_kernel,
+    ],
 )
 @pytest.mark.parametrize(
     "call_kernel, decorator",

From 057aaf819b3a78f8e8b6c0fbc88dc5faf46c44e2 Mon Sep 17 00:00:00 2001
From: Yevhenii Havrylko <yevhenii.havrylko@intel.com>
Date: Fri, 8 Mar 2024 13:49:52 -0500
Subject: [PATCH 3/5] Use lower instead of overload for private array

---
 .../_private_array_overloads.py               | 119 ++++++++----------
 .../_registry.py                              |  12 ++
 numba_dpex/kernel_api_impl/spirv/arrayobj.py  |   4 +-
 numba_dpex/kernel_api_impl/spirv/target.py    |   4 +
 4 files changed, 69 insertions(+), 70 deletions(-)
 create mode 100644 numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_registry.py

diff --git a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_private_array_overloads.py b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_private_array_overloads.py
index e0796a2f9c..958387c8df 100644
--- a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_private_array_overloads.py
+++ b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_private_array_overloads.py
@@ -9,11 +9,11 @@
 
 import llvmlite.ir as llvmir
 from llvmlite.ir.builder import IRBuilder
-from numba.core import cgutils
+from numba.core import cgutils, types
 from numba.core.typing.npydecl import parse_dtype as _ty_parse_dtype
 from numba.core.typing.npydecl import parse_shape as _ty_parse_shape
 from numba.core.typing.templates import Signature
-from numba.extending import intrinsic, overload
+from numba.extending import type_callable
 
 from numba_dpex.core.types import USMNdArray
 from numba_dpex.experimental.target import DpexExpKernelTypingContext
@@ -24,67 +24,12 @@
 )
 from numba_dpex.utils import address_space as AddressSpace
 
-from ..target import DPEX_KERNEL_EXP_TARGET_NAME
+from ._registry import lower
 
 
-@intrinsic(target=DPEX_KERNEL_EXP_TARGET_NAME)
-def _intrinsic_private_array_ctor(
-    ty_context,  # pylint: disable=unused-argument
-    ty_shape,
-    ty_dtype,
-    ty_fill_zeros,
-):
-    require_literal(ty_shape)
-    require_literal(ty_fill_zeros)
-
-    ty_array = USMNdArray(
-        dtype=_ty_parse_dtype(ty_dtype),
-        ndim=_ty_parse_shape(ty_shape),
-        layout="C",
-        addrspace=AddressSpace.PRIVATE,
-    )
-
-    sig = ty_array(ty_shape, ty_dtype, ty_fill_zeros)
-
-    def codegen(
-        context: DpexExpKernelTypingContext,
-        builder: IRBuilder,
-        sig: Signature,
-        args: list[llvmir.Value],
-    ):
-        shape = args[0]
-        ty_shape = sig.args[0]
-        ty_fill_zeros = sig.args[-1]
-        ty_array = sig.return_type
-
-        ary = make_spirv_generic_array_on_stack(
-            context, builder, ty_array, ty_shape, shape
-        )
-
-        if ty_fill_zeros.literal_value:
-            cgutils.memset(
-                builder, ary.data, builder.mul(ary.itemsize, ary.nitems), 0
-            )
-
-        return ary._getvalue()  # pylint: disable=protected-access
-
-    return (
-        sig,
-        codegen,
-    )
-
-
-@overload(
-    PrivateArray,
-    prefer_literal=True,
-    target=DPEX_KERNEL_EXP_TARGET_NAME,
-)
-def ol_private_array_ctor(
-    shape,
-    dtype,
-    fill_zeros=False,
-):
-    """Overload of the constructor for the class
+@type_callable(PrivateArray)
+def type_interval(context):  # pylint: disable=unused-argument
+    """Sets type of the constructor for the class
     class:`numba_dpex.kernel_api.PrivateArray`.
 
     Raises:
@@ -94,12 +39,48 @@ def ol_private_array_ctor(
             type.
     """
 
-    def ol_private_array_ctor_impl(
-        shape,
-        dtype,
-        fill_zeros=False,
-    ):
-        # pylint: disable=no-value-for-parameter
-        return _intrinsic_private_array_ctor(shape, dtype, fill_zeros)
+    def typer(shape, dtype, fill_zeros=types.BooleanLiteral(False)):
+        require_literal(shape)
+        require_literal(fill_zeros)
+
+        return USMNdArray(
+            dtype=_ty_parse_dtype(dtype),
+            ndim=_ty_parse_shape(shape),
+            layout="C",
+            addrspace=AddressSpace.PRIVATE,
+        )
+
+    return typer
+
+
+@lower(PrivateArray, types.IntegerLiteral, types.Any, types.BooleanLiteral)
+@lower(PrivateArray, types.Tuple, types.Any, types.BooleanLiteral)
+@lower(PrivateArray, types.UniTuple, types.Any, types.BooleanLiteral)
+@lower(PrivateArray, types.IntegerLiteral, types.Any)
+@lower(PrivateArray, types.Tuple, types.Any)
+@lower(PrivateArray, types.UniTuple, types.Any)
+def dpex_private_array_lower(
+    context: DpexExpKernelTypingContext,
+    builder: IRBuilder,
+    sig: Signature,
+    args: list[llvmir.Value],
+):
+    """Implements lower for the class:`numba_dpex.kernel_api.PrivateArray`"""
+    shape = args[0]
+    ty_shape = sig.args[0]
+    if len(sig.args) == 3:
+        fill_zeros = sig.args[-1].literal_value
+    else:
+        fill_zeros = False
+    ty_array = sig.return_type
+
+    ary = make_spirv_generic_array_on_stack(
+        context, builder, ty_array, ty_shape, shape
+    )
+
+    if fill_zeros:
+        cgutils.memset(
+            builder, ary.data, builder.mul(ary.itemsize, ary.nitems), 0
+        )
 
-    return ol_private_array_ctor_impl
+    return ary._getvalue()  # pylint: disable=protected-access
diff --git a/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_registry.py b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_registry.py
new file mode 100644
index 0000000000..1fae06a258
--- /dev/null
+++ b/numba_dpex/experimental/_kernel_dpcpp_spirv_overloads/_registry.py
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: 2024 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Implements the SPIR-V overloads for the kernel_api.PrivateArray class.
+"""
+
+from numba.core.imputils import Registry
+
+registry = Registry()
+lower = registry.lower
diff --git a/numba_dpex/kernel_api_impl/spirv/arrayobj.py b/numba_dpex/kernel_api_impl/spirv/arrayobj.py
index e1e5742b28..325d0e4e18 100644
--- a/numba_dpex/kernel_api_impl/spirv/arrayobj.py
+++ b/numba_dpex/kernel_api_impl/spirv/arrayobj.py
@@ -41,7 +41,9 @@ def require_literal(literal_type: types.Type):
 
     for i, _ in enumerate(literal_type):
         if not isinstance(literal_type[i], types.Literal):
-            raise errors.TypingError("requires literal type")
+            raise errors.TypingError(
+                "requires each element of tuple literal type"
+            )
 
 
 def make_spirv_array(  # pylint: disable=too-many-arguments
diff --git a/numba_dpex/kernel_api_impl/spirv/target.py b/numba_dpex/kernel_api_impl/spirv/target.py
index 4e51b9b8fd..4a1b4a42e2 100644
--- a/numba_dpex/kernel_api_impl/spirv/target.py
+++ b/numba_dpex/kernel_api_impl/spirv/target.py
@@ -383,12 +383,16 @@ def load_additional_registries(self):
         # pylint: disable=import-outside-toplevel
         from numba_dpex import printimpl
         from numba_dpex.dpnp_iface import dpnpimpl
+        from numba_dpex.experimental._kernel_dpcpp_spirv_overloads._registry import (
+            registry as spirv_registry,
+        )
         from numba_dpex.ocl import mathimpl, oclimpl
 
         self.insert_func_defn(oclimpl.registry.functions)
         self.insert_func_defn(mathimpl.registry.functions)
         self.insert_func_defn(dpnpimpl.registry.functions)
         self.install_registry(printimpl.registry)
+        self.install_registry(spirv_registry)
         # Replace dpnp math functions with their OpenCL versions.
         self.replace_dpnp_ufunc_with_ocl_intrinsics()
 

From ceb729013b721508b0a54be5843d329983529a44 Mon Sep 17 00:00:00 2001
From: Yevhenii Havrylko <yevhenii.havrylko@intel.com>
Date: Wed, 6 Mar 2024 13:15:05 -0500
Subject: [PATCH 4/5] Shrink file name

---
 .../kernel_api_impl/spirv/dispatcher.py       | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/numba_dpex/kernel_api_impl/spirv/dispatcher.py b/numba_dpex/kernel_api_impl/spirv/dispatcher.py
index 056c9ffc11..9aac39edb4 100644
--- a/numba_dpex/kernel_api_impl/spirv/dispatcher.py
+++ b/numba_dpex/kernel_api_impl/spirv/dispatcher.py
@@ -5,6 +5,7 @@
 """Implements a new numba dispatcher class and a compiler class to compile and
 call numba_dpex.kernel decorated function.
 """
+import hashlib
 from collections import namedtuple
 from contextlib import ExitStack
 from typing import Tuple
@@ -181,6 +182,9 @@ def _compile_to_spirv(
         # all linking libraries getting linked together and final optimization
         # including inlining of functions if an inlining level is specified.
         kernel_library.finalize()
+
+        if config.DUMP_KERNEL_LLVM:
+            self._dump_kernel(kernel_fndesc, kernel_library)
         # Compiled the LLVM IR to SPIR-V
         kernel_spirv_module = spirv_generator.llvm_to_spirv(
             kernel_targetctx,
@@ -268,20 +272,26 @@ def _compile_cached(
 
             kcres_attrs.append(kernel_device_ir_module)
 
-            if config.DUMP_KERNEL_LLVM:
-                with open(
-                    cres.fndesc.llvm_func_name + ".ll",
-                    "w",
-                    encoding="UTF-8",
-                ) as fptr:
-                    fptr.write(str(cres.library.final_module))
-
         except errors.TypingError as err:
             self._failed_cache[key] = err
             return False, err
 
         return True, _SPIRVKernelCompileResult(*kcres_attrs)
 
+    def _dump_kernel(self, fndesc, library):
+        """Dump kernel into file."""
+        name = fndesc.llvm_func_name
+        if len(name) > 200:
+            sha256 = hashlib.sha256(name.encode("utf-8")).hexdigest()
+            name = name[:150] + "_" + sha256
+
+        with open(
+            name + ".ll",
+            "w",
+            encoding="UTF-8",
+        ) as fptr:
+            fptr.write(str(library.final_module))
+
 
 class SPIRVKernelDispatcher(Dispatcher):
     """Dispatcher class designed to compile kernel decorated functions. The

From 01318388a66e52b9d240179bd978f368564e6dd4 Mon Sep 17 00:00:00 2001
From: Diptorup Deb <diptorup.deb@intel.com>
Date: Thu, 14 Mar 2024 01:36:08 -0500
Subject: [PATCH 5/5] Use SPV_INTEL_arbitrary_precision_integers when compiling
 SPIRV from LLVM IR

---
 numba_dpex/kernel_api_impl/spirv/spirv_generator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/numba_dpex/kernel_api_impl/spirv/spirv_generator.py b/numba_dpex/kernel_api_impl/spirv/spirv_generator.py
index 1171faca4a..c731a9c75d 100644
--- a/numba_dpex/kernel_api_impl/spirv/spirv_generator.py
+++ b/numba_dpex/kernel_api_impl/spirv/spirv_generator.py
@@ -123,6 +123,7 @@ def finalize(self):
         llvm_spirv_args = [
             "--spirv-ext=+SPV_EXT_shader_atomic_float_add",
             "--spirv-ext=+SPV_EXT_shader_atomic_float_min_max",
+            "--spirv-ext=+SPV_INTEL_arbitrary_precision_integers",
         ]
         for key in list(self.context.extra_compile_options.keys()):
             if key == LLVM_SPIRV_ARGS: