diff --git a/README.md b/README.md index 73c195f133..dcd5f51638 100644 --- a/README.md +++ b/README.md @@ -12,27 +12,36 @@ -Data-parallel Extension for Numba* (numba-dpex) is a standalone extension for -the [Numba](http://numba.pydata.org) Python JIT compiler. Numba-dpex provides -a generic kernel programming API and an offload feature that extends Numba's -auto-parallelizer to generate data-parallel kernels for `parfor` nodes. - -Numba-dpex's kernel API has a design and API similar to Numba's `cuda.jit` -module, but is based on the [SYCL](https://sycl.tech/) language. The -code-generation for the kernel API currently supports -[SPIR-V](https://www.khronos.org/spir/)-based -[OpenCL](https://www.khronos.org/opencl/) and -[oneAPI Level Zero](https://spec.oneapi.io/level-zero/latest/index.html) -devices that are supported by Intel® DPC++ SYCL compiler runtime. Supported -devices include Intel® CPUs, integrated GPUs and discrete GPUs. - -The offload functionality in numba-dpex is based on Numba's `parfor` -loop-parallelizer. Our compiler extends Numba's `parfor` feature to generate -kernels and offload them to devices supported by DPC++ SYCL compiler runtime. -The offload functionality is supported via a new NumPy drop-in replacement -library: [dpnp](https://github.com/IntelPython/dpnp). Note that `dpnp` and NumPy-based -expressions can be used together in the same function, with `dpnp` expressions getting -offloaded by `numba-dpex` and NumPy expressions getting parallelized by Numba. +Data-parallel Extension for Numba* (numba-dpex) is an open-source standalone +extension for the [Numba](http://numba.pydata.org) Python JIT compiler. +Numba-dpex provides a [SYCL*](https://sycl.tech/)-like API for kernel +programming in Python. SYCL* is an open standard developed by the [Unified +Acceleration Foundation](https://uxlfoundation.org/) as a vendor-agnostic way of +programming different types of data-parallel hardware such as multi-core CPUs, +GPUs, and FPGAs. 
Numba-dpex's kernel-programming API brings the same programming +model and a similar API to Python. The API allows expressing portable +data-parallel kernels in Python and then JIT compiling them for different +hardware targets. JIT compilation is supported for hardware that uses the +[SPIR-V](https://www.khronos.org/spir/) intermediate representation format that +includes [OpenCL](https://www.khronos.org/opencl/) CPU (Intel, AMD) devices, +OpenCL GPU (Intel integrated and discrete GPUs) devices, and [oneAPI Level +Zero](https://spec.oneapi.io/level-zero/latest/index.html) GPU (Intel integrated +and discrete GPUs) devices. + +The kernel programming API does not yet support every SYCL* feature. Refer to +the [SYCL* and numba-dpex feature comparison](https://intelpython.github.io/numba-dpex/latest/supported_sycl_features.html) +page to get a summary of supported features. +Numba-dpex only implements SYCL*'s kernel programming API; all SYCL runtime +Python bindings are provided by the [dpctl](https://github.com/IntelPython/dpctl) +package. + +Along with the kernel programming API, numba-dpex extends Numba's +auto-parallelizer to bring device offload capabilities to `prange` loops and +NumPy-like vector expressions. The offload functionality is supported via the +NumPy drop-in replacement library: [dpnp](https://github.com/IntelPython/dpnp). +Note that `dpnp` and NumPy-based expressions can be used together in the same +function, with `dpnp` expressions getting offloaded by `numba-dpex` and NumPy +expressions getting parallelized by Numba. Refer the [documentation](https://intelpython.github.io/numba-dpex) and examples to learn more. diff --git a/docs/source/conf.py b/docs/source/conf.py index f0ef3a41e8..89cb5f3b8b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -5,10 +5,12 @@ # coding: utf-8 # Configuration file for the Sphinx documentation builder. 
-# -- Project information ----------------------------------------------------- +# -- Project information ------------------------------------------------------- import sys +import numba_dpex + sys.path.append(".") from sycl_spec_links import sycl_ext_links # noqa E402 @@ -18,16 +20,15 @@ author = "Intel Corporation" # The full version, including alpha/beta/rc tags -# release = "main" +release = numba_dpex.__version__ -# -- General configuration ---------------------------------------------------- +# -- General configuration ----------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.todo", - "sphinx.ext.intersphinx", "sphinx.ext.extlinks", "sphinx.ext.githubpages", "sphinx.ext.napoleon", @@ -35,6 +36,7 @@ "sphinxcontrib.googleanalytics", "myst_parser", "autoapi.extension", + "sphinx.ext.intersphinx", "sphinxcontrib.bibtex", ] @@ -52,12 +54,11 @@ extlinks = {} extlinks.update(sycl_ext_links) -# -- Options for HTML output ------------------------------------------------- +# -- Options for HTML output --------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
# -# html_theme = "pydata_sphinx_theme" html_theme = "furo" html_theme_options = { @@ -82,7 +83,7 @@ html_show_sourcelink = False -# -- Todo extension configuration ---------------------------------------------- +# -- Todo extension configuration --------------------------------------------- todo_include_todos = True todo_link_only = True @@ -90,9 +91,15 @@ intersphinx_mapping = {} -# -- Prepend module name to an object name or not ----------------------------------- +# -- Prepend module name to an object name or not ------------------------------ add_module_names = False + +# -- autodoc configurations ---------------------------------------------------- + +autodoc_typehints_format = "short" +python_use_unqualified_type_names = True + # -- Auto API configurations --------------------------------------------------- autoapi_dirs = [ diff --git a/docs/source/contribution_guide.rst b/docs/source/contribution_guide.rst deleted file mode 100644 index 971a7fd411..0000000000 --- a/docs/source/contribution_guide.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. _contributions: -.. include:: ./ext_links.txt - -Contribution Guide -================== - -Classification of Contributions -------------------------------- - -Development Cycle ------------------ - -Issues and Pull Requests ------------------------- - -Coding Guidelines ------------------ - -Unit Testing -------------- - -Documentation -------------- - -Tips for Developers -------------------- diff --git a/docs/source/examples.rst b/docs/source/examples.rst deleted file mode 100644 index f05801f7ab..0000000000 --- a/docs/source/examples.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _examples: -.. include:: ./ext_links.txt - -List of examples -================ - -.. todo:: - Provide list of examples for numba-dpex - -Benchmarks -********** - -.. todo:: - Provide instructions for dpbench - -Jupyter* Notebooks -****************** - -.. 
todo:: - Provide instructions for Jupyter Notebook samples diff --git a/docs/source/index.rst b/docs/source/index.rst index 0bfda03bbc..8582ce97ff 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,43 +1,15 @@ .. _index: .. include:: ./ext_links.txt -.. Feedbacks: -.. Use proper names (as provided by marketing) -.. Do not use "we/I" -.. Use correct acronym, expand acronym at the first instance - - -.. Follow cupy docs. - -.. Overview: -.. Main Features -.. Project Goal - -.. Getting Started: -.. Yevheni -.. Docker page - -.. User Guide: -.. https://intelpython.github.io/oneAPI-for-SciPy/details/programming_model/ -.. Kernel Programming API --> Kernel Programming Basics -.. Compiling and Offloading DPNP -.. - Introduce @dpjit -.. - Array constructor in dpjit -.. - ufunc (refer to numba) -.. - prange -.. - -.. Debugging using GDB -.. # Performance Tips -.. # Troubleshooting -.. Useful Links -.. # To-Do - -.. API Reference: - - Data Parallel Extension for Numba* ================================== +Numba-dpex is an open-source kernel-programming API and JIT compiler for +portable accelerator programming directly in Python. The API and the compiler are +modeled after the C++ SYCL* language and brings a similar programming model and +language design to Python. This page lists the relevant documentation to learn to +program data-parallel kernels using numba-dpex. + .. module:: numba_dpex .. toctree:: @@ -55,14 +27,7 @@ Data Parallel Extension for Numba* .. toctree:: :maxdepth: 1 - :caption: Development - - contribution_guide - -.. 
toctree:: - :maxdepth: 1 - :caption: Misc Notes + :caption: Miscellaneous Notes - examples license release-notes diff --git a/docs/source/user_guide/kernel_programming/math-functions.rst b/docs/source/user_guide/kernel_programming/math-functions.rst index 284f89e06e..f1ef2f512c 100644 --- a/docs/source/user_guide/kernel_programming/math-functions.rst +++ b/docs/source/user_guide/kernel_programming/math-functions.rst @@ -1,3 +1,5 @@ +.. include:: ./../../ext_links.txt + Scalar mathematical functions from the Python `math`_ module and the `dpnp`_ library can be used inside a kernel function. During compilation the diff --git a/docs/source/user_guide/kernel_programming/reduction.rst b/docs/source/user_guide/kernel_programming/reduction.rst deleted file mode 100644 index 0f57b4b40e..0000000000 --- a/docs/source/user_guide/kernel_programming/reduction.rst +++ /dev/null @@ -1,51 +0,0 @@ -Reduction on SYCL-supported Devices -=================================== - -Numba-dpex does not yet provide any specific decorator to implement -reduction kernels. However, a kernel reduction can be written explicitly. This -section provides two approaches for writing a reduction kernel as a -``numba_dpex.kernel`` function. - - -Example 1 ---------- - -This example demonstrates a summation reduction on a one-dimensional array. - -Full example can be found at ``numba_dpex/examples/sum_reduction.py``. - -In this example, to reduce the array we invoke the kernel multiple times. - -.. literalinclude:: ./../../../../numba_dpex/examples/sum_reduction.py - :pyobject: sum_reduction_kernel - -.. literalinclude:: ./../../../../numba_dpex/examples/sum_reduction.py - :pyobject: sum_reduce - -Example 2 ---------- - -Full example can be found at -``numba_dpex/examples/sum_reduction_recursive_ocl.py``. - -.. literalinclude:: ./../../../../numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py - :pyobject: sum_reduction_kernel - -.. 
literalinclude:: ./../../../../numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py - :pyobject: sum_recursive_reduction - -.. literalinclude:: ./../../../../numba_dpex/examples/kernel/sum_reduction_recursive_ocl.py - :pyobject: sum_reduce - -.. note:: - - Numba-dpex does not yet provide any analogue to the ``numba.cuda.reduce`` - decorator for writing reductions kernel. Such a decorator will be added in - future releases. - -Full examples -------------- - -- :file:`numba_dpex/examples/sum_reduction_recursive_ocl.py` -- :file:`numba_dpex/examples/sum_reduction_ocl.py` -- :file:`numba_dpex/examples/sum_reduction.py` diff --git a/docs/source/user_guide/kernel_programming/synchronization.rst b/docs/source/user_guide/kernel_programming/synchronization.rst deleted file mode 100644 index d590b0801b..0000000000 --- a/docs/source/user_guide/kernel_programming/synchronization.rst +++ /dev/null @@ -1,33 +0,0 @@ -Synchronization Functions -========================= - -``numba-dpex`` only supports some of the SYCL synchronization operations. For -synchronization of all threads in the same thread block, numba-dpex provides -a helper function called ``numba_dpex.barrier()``. This function implements the -same pattern as barriers in traditional multi-threaded programming: invoking the -function forces a thread to wait until all threads in the block reach the -barrier, at which point it returns control to all its callers. - -``numba_dpex.barrier()`` supports two memory fence options: - -- ``numba_dpex.GLOBAL_MEM_FENCE``: The barrier function will queue a memory - fence to ensure correct ordering of memory operations to global memory. Using - the option can be useful when work-items, for example, write to buffer or - image objects and then want to read the updated data. Passing no arguments to - ``numba_dpex.barrier()`` is equivalent to setting the global memory fence - option. - - .. .. literalinclude:: ./../../../../numba_dpex/examples/barrier.py - .. 
:pyobject: no_arg_barrier_support - -- ``numba_dpex.LOCAL_MEM_FENCE``: The barrier function will either flush - any variables stored in local memory or queue a memory fence to ensure - correct ordering of memory operations to local memory. - -.. .. literalinclude:: ./../../../../numba_dpex/examples/barrier.py -.. :pyobject: local_memory - -.. note:: - - The ``numba_dpex.barrier()`` function is semantically equivalent to - ``numba.cuda.syncthreads``. diff --git a/numba_dpex/core/runtime/context.py b/numba_dpex/core/runtime/context.py index 1efa58cde7..80f0253101 100644 --- a/numba_dpex/core/runtime/context.py +++ b/numba_dpex/core/runtime/context.py @@ -439,16 +439,18 @@ def acquire_meminfo_and_schedule_release( ): """Inserts LLVM IR to call nrt_acquire_meminfo_and_schedule_release. - DPCTLSyclEventRef - DPEXRT_nrt_acquire_meminfo_and_schedule_release( - NRT_api_functions *nrt, - DPCTLSyclQueueRef QRef, - NRT_MemInfo **meminfo_array, - size_t meminfo_array_size, - DPCTLSyclEventRef *depERefs, - size_t nDepERefs, - int *status, - ); + .. code-block:: c + + DPCTLSyclEventRef + DPEXRT_nrt_acquire_meminfo_and_schedule_release( + NRT_api_functions *nrt, + DPCTLSyclQueueRef QRef, + NRT_MemInfo **meminfo_array, + size_t meminfo_array_size, + DPCTLSyclEventRef *depERefs, + size_t nDepERefs, + int *status, + ); """ mod = builder.module @@ -475,16 +477,18 @@ def acquire_meminfo_and_schedule_release( def build_or_get_kernel(self, builder: llvmir.IRBuilder, args): """Inserts LLVM IR to call build_or_get_kernel. - DPCTLSyclKernelRef - DPEXRT_build_or_get_kernel( - const DPCTLSyclContextRef ctx, - const DPCTLSyclDeviceRef dev, - size_t il_hash, - const char *il, - size_t il_length, - const char *compile_opts, - const char *kernel_name, - ); + .. 
code-block:: c + + DPCTLSyclKernelRef + DPEXRT_build_or_get_kernel( + const DPCTLSyclContextRef ctx, + const DPCTLSyclDeviceRef dev, + size_t il_hash, + const char *il, + size_t il_length, + const char *compile_opts, + const char *kernel_name, + ); """ mod = builder.module @@ -511,7 +515,9 @@ def build_or_get_kernel(self, builder: llvmir.IRBuilder, args): def kernel_cache_size(self, builder: llvmir.IRBuilder): """Inserts LLVM IR to call kernel_cache_size. - size_t DPEXRT_kernel_cache_size(); + .. code-block:: c + + size_t DPEXRT_kernel_cache_size(); """ fn = cgutils.get_or_insert_function(